Diff [3c64c6684cc8f107bdea7411e2d29dfffaa2a544:58fe85a783c7d674af3da0f1fd03f42ef7f11be5] for / – Cforall

.gitignore

-              r3c64c668
+              r58fe85a
 # generated by configure
+aclocal.m4
+automake
 autom4te.cache
 config.h
 …
 config.log
 config.py
+configure
+libtool
 stamp-h1
-libtool
 /Makefile
+/Makefile.in
 **/Makefile
+**/Makefile.in
+**/Makefile.dist.in
 /version
 …
 libcfa/x64-debug/
 libcfa/x64-nodebug/
-libcfa/x64-nolib/
 libcfa/x86-debug/
 libcfa/x86-nodebug/
+libcfa/x86-nolib/
+libcfa/arm-debug/
+libcfa/arm-nodebug/
+libcfa/arm-nolib/
+libcfa/arm64-debug/
+libcfa/arm64-nodebug/
 # generated by bison and lex from parser.yy and lex.ll
 …
 doc/user/pointer2.tex
 doc/user/EHMHierarchy.tex
+# generated by npm
+package-lock.json

Jenkins/FullBuild

-              r3c64c668
+              r58fe85a
         def err = null
+        final scmVars = checkout scm
+        final commitId = scmVars.GIT_COMMIT
         try {
                 //Wrap build to add timestamp to command line
 …
                         stage('Build') {
+                                results = [null, null]
+                                parallel (
+                                        gcc_8_x86_old: { trigger_build( 'gcc-8',   'x86', false ) },
+                                        gcc_7_x86_old: { trigger_build( 'gcc-7',   'x86', false ) },
+                                        gcc_6_x86_old: { trigger_build( 'gcc-6',   'x86', false ) },
+                                        gcc_9_x64_old: { trigger_build( 'gcc-9',   'x64', false ) },
+                                        gcc_8_x64_old: { trigger_build( 'gcc-8',   'x64', false ) },
+                                        gcc_7_x64_old: { trigger_build( 'gcc-7',   'x64', false ) },
+                                        gcc_6_x64_old: { trigger_build( 'gcc-6',   'x64', false ) },
+                                        gcc_5_x64_old: { trigger_build( 'gcc-5',   'x64', false ) },
+                                        clang_x64_old: { trigger_build( 'clang',   'x64', false ) },
+                                        clang_x64_new: { trigger_build( 'clang',   'x64', true  ) },
+                                )
+                        }
+                                parallel (
+                                        clang_x86: { trigger_build( 'gcc-8',   'x86' ) },
+                                        gcc_5_x86: { trigger_build( 'gcc-7',   'x86' ) },
+                                        gcc_6_x86: { trigger_build( 'gcc-6',   'x86' ) },
+                                        gcc_9_x64: { trigger_build( 'gcc-9',   'x64' ) },
+                                        gcc_8_x64: { trigger_build( 'gcc-8',   'x64' ) },
+                                        gcc_7_x64: { trigger_build( 'gcc-7',   'x64' ) },
+                                        gcc_6_x64: { trigger_build( 'gcc-6',   'x64' ) },
+                                        gcc_5_x64: { trigger_build( 'gcc-5',   'x64' ) },
+                                        clang_x64: { trigger_build( 'clang',   'x64' ) },
+                                )
+                        stage('Package') {
+                                trigger_dist( commitId, currentBuild.number.toString() )
+                        }
+                }
 …
 //===========================================================================================================
 def trigger_build(String cc, String arch) {
+def trigger_build(String cc, String arch, boolean new_ast) {
         def result = build job: 'Cforall/master',               \
                 parameters: [                                           \
 …
                           name: 'Architecture',                         \
                           value: arch],                                 \
+                        [$class: 'BooleanParameterValue',               \
+                          name: 'NewAST',                               \
+                          value: new_ast],                              \
                         [$class: 'BooleanParameterValue',               \
                           name: 'RunAllTests',                          \
 …
                         [$class: 'BooleanParameterValue',               \
                           name: 'Publish',                              \
                           value: true],                                 \
+                          value: true],                                         \
                         [$class: 'BooleanParameterValue',               \
                           name: 'Silent',                               \
 …
+}
+//Helper routine to collect information about the git history
+def collect_git_info() {
+def trigger_dist(String commitId, String buildNum) {
+        def result = build job: 'Cforall_Distribute_Ref',       \
+                parameters: [                                           \
+                        string(name: 'GitRef', value: commitId),        \
+                        string(name: 'Build' , value: buildNum) \
+                ],                                                              \
+                propagate: false
+        //create the temporary output directory in case it doesn't already exist
+        def out_dir = pwd tmp: true
+        sh "mkdir -p ${out_dir}"
+        echo(result.result)
         //parse git logs to find what changed
         dir("../Cforall_Full_Build@script") {
                 sh "git reflog > ${out_dir}/GIT_COMMIT"
+        if(result.result != 'SUCCESS') {
+                sh("wget -q -O - https://cforall.uwaterloo.ca/jenkins/job/Cforall_Distribute_Ref/${result.number}/consoleText")
+                error(result.result)
+        }
-        git_reflog = readFile("${out_dir}/GIT_COMMIT")
-        gitRefOldValue = (git_reflog =~ /moving from (.+) to (.+)/)[0][1]
-        gitRefNewValue = (git_reflog =~ /moving from (.+) to (.+)/)[0][2]
+}

Jenkinsfile

-              r3c64c668
+              r58fe85a
 import groovy.transform.Field
-// For skipping stages
-import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 //===========================================================================================================
 …
         SrcDir    = pwd tmp: false
         Settings  = null
         StageName = ''
+        Tools     = null
         // Local variables
 …
                                 SrcDir    = pwd tmp: false
                                 clean()
                                 checkout()
+                                Tools.Clean()
+                                Tools.Checkout()
                                 build()
 …
         //attach the build log to the email
         catch (Exception caughtError) {
+                //rethrow error later
+                // Store the result of the build log
+                currentBuild.result = "FAILURE"
+                // An error has occurred, the build log is relevant
+                log_needed = true
+                // rethrow error later
                 err = caughtError
+                // print the error so it shows in the log
                 echo err.toString()
-                //An error has occured, the build log is relevent
-                log_needed = true
-                //Store the result of the build log
-                currentBuild.result = "${StageName} FAILURE".trim()
+        }
 …
 // Main compilation routines
 //===========================================================================================================
-def clean() {
-        build_stage('Cleanup', true) {
-                // clean the build by wipping the build directory
-                dir(BuildDir) {
-                        deleteDir()
+                }
+        }
+}
-//Compilation script is done here but environnement set-up and error handling is done in main loop
-def checkout() {
-        build_stage('Checkout', true) {
-                //checkout the source code and clean the repo
-                final scmVars = checkout scm
-                Settings.GitNewRef = scmVars.GIT_COMMIT
-                Settings.GitOldRef = scmVars.GIT_PREVIOUS_COMMIT
-                echo GitLogMessage()
-                // This is a complete hack but it solves problems with automake thinking it needs to regenerate makefiles
-                // We fudged automake/missing to handle that but automake stills bakes prints inside the makefiles
-                // and these cause more problems.
-                sh 'find . -name Makefile.in -exec touch {} +'
+        }
+}
 def build() {
         debug = true
         release = Settings.RunAllTests || Settings.RunBenchmark
+        build_stage('Build : configure', true) {
+        Tools.BuildStage('Build : configure', true) {
+                // Configure must be run inside the tree
+                dir (SrcDir) {
+                        // Generate the necessary build files
+                        sh './autogen.sh'
+                }
                 // Build outside of the src tree to ease cleaning
                 dir (BuildDir) {
                         //Configure the conpilation (Output is not relevant)
+                        //Configure the compilation (Output is not relevant)
                         //Use the current directory as the installation target so nothing escapes the sandbox
                         //Also specify the compiler by hand
 …
+                        }
+                        sh "${SrcDir}/configure CXX=${Settings.Compiler.CXX} CC=${Settings.Compiler.CC} ${Settings.Architecture.flags} ${targets} --quiet"
+                        ast = Settings.NewAST ? "--enable-new-ast" : "--disable-new-ast"
+                        sh "${SrcDir}/configure CXX=${Settings.Compiler.CXX} CC=${Settings.Compiler.CC} ${Settings.Architecture.flags} AR=gcc-ar RANLIB=gcc-ranlib ${targets} ${ast} --quiet --prefix=${BuildDir}"
                         // Configure libcfa
 …
+        }
         build_stage('Build : cfa-cpp', true) {
+        Tools.BuildStage('Build : cfa-cpp', true) {
                 // Build outside of the src tree to ease cleaning
                 dir (BuildDir) {
 …
+        }
         build_stage('Build : libcfa(debug)', debug) {
+        Tools.BuildStage('Build : libcfa(debug)', debug) {
                 // Build outside of the src tree to ease cleaning
                 dir (BuildDir) {
 …
+        }
         build_stage('Build : libcfa(nodebug)', release) {
+        Tools.BuildStage('Build : libcfa(nodebug)', release) {
                 // Build outside of the src tree to ease cleaning
                 dir (BuildDir) {
                         sh "make -j 8 --no-print-directory -C libcfa/${Settings.Architecture.name}-nodebug"
+                }
+        }
+        Tools.BuildStage('Build : install', true) {
+                // Build outside of the src tree to ease cleaning
+                dir (BuildDir) {
+                        sh "make -j 8 --no-print-directory install"
+                }
+        }
 …
 def test() {
         try {
                 build_stage('Test: short', !Settings.RunAllTests) {
+                Tools.BuildStage('Test: short', !Settings.RunAllTests) {
                         dir (BuildDir) {
                                 //Run the tests from the tests directory
 …
+                }
                 build_stage('Test: full', Settings.RunAllTests) {
+                Tools.BuildStage('Test: full', Settings.RunAllTests) {
                         dir (BuildDir) {
                                         //Run the tests from the tests directory
 …
                 echo "Archiving core dumps"
                 dir (BuildDir) {
                         archiveArtifacts artifacts: "tests/crashes/**/*", fingerprint: true
+                        archiveArtifacts artifacts: "tests/crashes/**/*,lib/**/lib*.so*", fingerprint: true
+                }
                 throw err
 …
 def benchmark() {
         build_stage('Benchmark', Settings.RunBenchmark) {
+        Tools.BuildStage('Benchmark', Settings.RunBenchmark) {
                 dir (BuildDir) {
                         //Append bench results
 …
 def build_doc() {
         build_stage('Documentation', Settings.BuildDocumentation) {
+        Tools.BuildStage('Documentation', Settings.BuildDocumentation) {
                 dir ('doc/user') {
                         make_doc()
 …
 def publish() {
         build_stage('Publish', true) {
+        Tools.BuildStage('Publish', true) {
                 if( Settings.Publish && !Settings.RunBenchmark ) { echo 'No results to publish!!!' }
 …
 //Routine responsible for sending the email notification once the build is completed
 //===========================================================================================================
-@NonCPS
-def SplitLines(String text) {
-        def list = []
-        text.eachLine {
-                list += it
+        }
-        return list
+}
-def GitLogMessage() {
-        if (!Settings || !Settings.GitOldRef || !Settings.GitNewRef) return "\nERROR retrieveing git information!\n"
-        def oldRef = Settings.GitOldRef
-        def newRef = Settings.GitNewRef
-        def revText = sh(returnStdout: true, script: "git rev-list ${oldRef}..${newRef}").trim()
-        def revList = SplitLines( revText )
-        def gitUpdate = ""
-        revList.each { rev ->
-                def type = sh(returnStdout: true, script: "git cat-file -t ${rev}").trim()
-                gitUpdate = gitUpdate + "       via  ${rev} (${type})"
+        }
-        def rev = oldRef
-        def type = sh(returnStdout: true, script: "git cat-file -t ${rev}").trim()
-        gitUpdate = gitUpdate + "      from  ${rev} (${type})"
-        def gitLog    = sh(returnStdout: true, script: "git rev-list --format=short ${oldRef}...${newRef}").trim()
-        def gitDiff   = sh(returnStdout: true, script: "git diff --stat --color ${newRef} ${oldRef}").trim()
-        gitDiff = gitDiff.replace('[32m', '<span style="color: #00AA00;">')
-        gitDiff = gitDiff.replace('[31m', '<span style="color: #AA0000;">')
-        gitDiff = gitDiff.replace('[m', '</span>')
-        return """
-<pre>
-The branch ${env.BRANCH_NAME} has been updated.
-${gitUpdate}
-</pre>
-<p>Check console output at ${env.BUILD_URL} to view the results.</p>
-<p>- Status --------------------------------------------------------------</p>
-<p>BUILD# ${env.BUILD_NUMBER} - ${currentBuild.result}</p>
-<p>- Log -----------------------------------------------------------------</p>
-<pre>
-${gitLog}
-</pre>
-<p>-----------------------------------------------------------------------</p>
-<pre>
-Summary of changes:
-${gitDiff}
-</pre>
-"""
+}
 //Standard build email notification
 def email(boolean log) {
 …
 generated because of a git hooks/post-receive script following
 a ref change which was pushed to the C\u2200 repository.</p>
 """ + GitLogMessage()
+""" + Tools.GitLogMessage()
         def email_to = !Settings.IsSandbox ? "cforall@lists.uwaterloo.ca" : "tdelisle@uwaterloo.ca"
 …
         public String CXX
         public String CC
+        CC_Desc(String name, String CXX, String CC) {
+        public String lto
+        CC_Desc(String name, String CXX, String CC, String lto) {
                 this.name = name
                 this.CXX = CXX
+                this.CC = CC
+                this.CC  = CC
+                this.lto = lto
+        }
+}
 …
         public final CC_Desc Compiler
         public final Arch_Desc Architecture
+        public final Boolean NewAST
         public final Boolean RunAllTests
         public final Boolean RunBenchmark
 …
                 switch( param.Compiler ) {
                         case 'gcc-9':
                                 this.Compiler = new CC_Desc('gcc-9', 'g++-9', 'gcc-9')
+                                this.Compiler = new CC_Desc('gcc-9', 'g++-9', 'gcc-9', '-flto=auto')
                         break
                         case 'gcc-8':
                                 this.Compiler = new CC_Desc('gcc-8', 'g++-8', 'gcc-8')
+                                this.Compiler = new CC_Desc('gcc-8', 'g++-8', 'gcc-8', '-flto=auto')
                         break
                         case 'gcc-7':
                                 this.Compiler = new CC_Desc('gcc-7', 'g++-7', 'gcc-7')
+                                this.Compiler = new CC_Desc('gcc-7', 'g++-7', 'gcc-7', '-flto=auto')
                         break
                         case 'gcc-6':
                                 this.Compiler = new CC_Desc('gcc-6', 'g++-6', 'gcc-6')
+                                this.Compiler = new CC_Desc('gcc-6', 'g++-6', 'gcc-6', '-flto=auto')
                         break
                         case 'gcc-5':
                                 this.Compiler = new CC_Desc('gcc-5', 'g++-5', 'gcc-5')
+                                this.Compiler = new CC_Desc('gcc-5', 'g++-5', 'gcc-5', '-flto=auto')
                         break
                         case 'gcc-4.9':
                                 this.Compiler = new CC_Desc('gcc-4.9', 'g++-4.9', 'gcc-4.9')
+                                this.Compiler = new CC_Desc('gcc-4.9', 'g++-4.9', 'gcc-4.9', '-flto=auto')
                         break
                         case 'clang':
                                 this.Compiler = new CC_Desc('clang', 'clang++-6.0', 'gcc-6')
+                                this.Compiler = new CC_Desc('clang', 'clang++-10', 'gcc-9', '-flto=thin -flto-jobs=0')
                         break
                         default :
 …
                 this.IsSandbox          = (branch == "jenkins-sandbox")
+                this.NewAST             = param.NewAST
                 this.RunAllTests        = param.RunAllTests
                 this.RunBenchmark       = param.RunBenchmark
 …
                 this.DescShort = "${ this.Compiler.name }:${ this.Architecture.name }${full}"
+                final ast = this.NewAST ? "New AST" : "Old AST"
                 this.DescLong = """Compiler              : ${ this.Compiler.name } (${ this.Compiler.CXX }/${ this.Compiler.CC })
+AST Version             : ${ ast.toString() }
 Architecture            : ${ this.Architecture.name }
 Arc Flags               : ${ this.Architecture.flags }
 …
         // prepare the properties
         properties ([                                                                                                   \
+                buildDiscarder(logRotator(                                                                              \
+                        artifactDaysToKeepStr: '',                                                                      \
+                        artifactNumToKeepStr: '',                                                                       \
+                        daysToKeepStr: '730',                                                                           \
+                        numToKeepStr: '1000'                                                                            \
+                )),                                                                                                             \
                 [$class: 'ParametersDefinitionProperty',                                                                \
                         parameterDefinitions: [                                                                         \
 …
                                         description: 'Which compiler to use',                                   \
                                         name: 'Compiler',                                                                       \
                                         choices: 'gcc-9\ngcc-8\ngcc-7\ngcc-6\ngcc-5\ngcc-4.9\nclang',                                   \
+                                        choices: 'gcc-9\ngcc-8\ngcc-7\ngcc-6\ngcc-5\ngcc-4.9\nclang',   \
                                         defaultValue: 'gcc-8',                                                          \
                                 ],                                                                                              \
 …
                                 ],                                                                                              \
                                 [$class: 'BooleanParameterDefinition',                                                  \
+                                        description: 'If true, build compiler using new AST',           \
+                                        name: 'NewAST',                                                                         \
+                                        defaultValue: true,                                                             \
+                                ],                                                                                              \
+                                [$class: 'BooleanParameterDefinition',                                                  \
                                         description: 'If false, only the quick test suite is ran',              \
                                         name: 'RunAllTests',                                                            \
 …
                 ]])
+        // It's unfortunate but it looks like we need to checkout the entire repo just to get the pretty git printer
+        // It's unfortunate but it looks like we need to checkout the entire repo just to get
+        // - the pretty git printer
+        // - Jenkins.tools
         checkout scm
+        Tools = load "Jenkins/tools.groovy"
         final settings = new BuildSettings(params, env.BRANCH_NAME)
 …
         return settings
+}
-def build_stage(String name, boolean run, Closure block ) {
-        StageName = name
-        echo " -------- ${StageName} -------- "
-        if(run) {
-                stage(name, block)
-        } else {
-                stage(name) { Utils.markStageSkippedForConditional(STAGE_NAME) }
+        }
+}

Makefile.am

-              r3c64c668
+              r58fe85a
 MAINTAINERCLEANFILES = lib/* bin/* tests/.deps/* tests/.out/* # order important
+DISTCLEANFILES = version
 SUBDIRS = driver src . @LIBCFA_TARGET_DIRS@
+DIST_SUBDIRS = driver src . libcfa tests
 @LIBCFA_TARGET_MAKEFILES@ : Makefile $(srcdir)/libcfa/configure
 …
         @ls $(config_file) || (echo "Missing config.data, re-run configure script again" && false)
         @$(eval config_data = $(shell cat $(config_file)))
         @echo "Configuring libcfa with '$(config_data)''"
+        @echo "Configuring libcfa ($(abs_top_srcdir)/libcfa/configure) with '$(config_data)' from $(shell pwd) / $(dir $@)"
         @cd $(dir $@) && $(abs_top_srcdir)/libcfa/configure $(config_data)
 …
 man1_MANS = doc/man/cfa.1
+EXTRA_DIST = LICENSE doc/man/cfa.1 libcfa/configure libcfa/Makefile.dist.am libcfa/Makefile.dist.in tools/build/distcc_hash tools/build/push2dist.sh
 debug=yes
 …
         @./config.status --config | sed "s/ /\n\t/g; s/\t'/\t/g; s/'\n/\n/g; s/^'//g; s/'$$//g"
         @find libcfa -name config.status -printf "\n%h\n\t" -exec {} --config \; | sed "s/ /\n\t/g; s/\t'/\t/g; s/'\n/\n/g; s/^'//g; s/'$$//g"
+mostlyclean-local: @LIBCFA_TARGET_MAKEFILES@
+        for dir in @LIBCFA_TARGET_DIRS@; do \
+                $(MAKE) -C $${dir} mostlyclean; \
+        done
+clean-local: @LIBCFA_TARGET_MAKEFILES@
+        for dir in @LIBCFA_TARGET_DIRS@; do \
+                $(MAKE) -C $${dir} clean; \
+        done
+distclean-local: @LIBCFA_TARGET_MAKEFILES@
+        for dir in @LIBCFA_TARGET_DIRS@; do \
+                $(MAKE) -C $${dir} distclean; \
+                rm $${dir}/config.data; \
+        done

benchmark/Makefile.am

-              r3c64c668
+              r58fe85a
 ## Created On       : Sun May 31 09:08:15 2015
 ## Last Modified By : Peter A. Buhr
 ## Last Modified On : Sat Jan 25 09:20:44 2020
 ## Update Count     : 255
+## Last Modified On : Tue Mar 10 11:41:18 2020
+## Update Count     : 258
 ###############################################################################
 …
 # applies to both programs
 include $(top_srcdir)/src/cfa.make
+include $(top_srcdir)/tools/build/cfa.make
 AM_CFLAGS = -O2 -Wall -Wextra -I$(srcdir) -lrt -pthread # -Werror
 …
 # Dummy hack tricks
 EXTRA_PROGRAMS = dummy # build but do not install
 dummy_SOURCES = dummyC.c dummyCXX.cpp
+nodist_dummy_SOURCES = dummyC.c dummyCXX.cpp
 dummyC.c:
 …
 ## =========================================================================================================
+all : basic$(EXEEXT) ctxswitch$(EXEEXT) mutex$(EXEEXT) schedint$(EXEEXT) schedext$(EXEEXT) creation$(EXEEXT)
+# all is used by make dist so ignore it
+all:
+all-bench : basic$(EXEEXT) ctxswitch$(EXEEXT) mutex$(EXEEXT) schedint$(EXEEXT) schedext$(EXEEXT) creation$(EXEEXT)
 basic_loop_DURATION = 15000000000
 …
 creation_cfa_coroutine_DURATION = 100000000
 creation_cfa_coroutine_eager_DURATION = 10000000
+creation_cfa_generator_DURATION = 1000000000
 creation_upp_coroutine_DURATION = ${creation_cfa_coroutine_eager_DURATION}
-creation_cfa_thread_DURATION = 10000000
-creation_upp_thread_DURATION = ${creation_cfa_thread_DURATION}
 creation_DURATION = 10000000
 …
 cleancsv:
         rm -f compile.csv basic.csv ctxswitch.csv mutex.csv scheduling.csv
+        rm -f compile.csv basic.csv ctxswitch.csv mutex.csv schedint.csv
 jenkins$(EXEEXT): cleancsv
 …
         +make mutex.csv
         -+make mutex.diff.csv
         +make scheduling.csv
         -+make scheduling.diff.csv
+        +make schedint.csv
+        -+make schedint.diff.csv
 @DOifskipcompile@
         cat compile.csv
 …
         cat mutex.csv
         -cat mutex.diff.csv
         cat scheduling.csv
         -cat scheduling.diff.csv
+        cat schedint.csv
+        -cat schedint.diff.csv
 compile.csv:
 …
         $(srcdir)/fixcsv.sh $@
 scheduling.csv:
+schedint.csv:
         echo "building $@"
         echo "schedint-1,schedint-2,schedext-1,schedext-2" > $@
 …
 ctxswitch-python_coroutine$(EXEEXT):
         $(BENCH_V_PY)echo "#!/bin/sh" > a.out
         echo "python3.7 $(srcdir)/ctxswitch/python_cor.py" >> a.out
+        echo "python3 $(srcdir)/ctxswitch/python_cor.py \"$$""@\"" >> a.out
         chmod a+x a.out
 ctxswitch-nodejs_coroutine$(EXEEXT):
         $(BENCH_V_NODEJS)echo "#!/bin/sh" > a.out
         echo "nodejs $(srcdir)/ctxswitch/node_cor.js" >> a.out
+        echo "nodejs $(srcdir)/ctxswitch/node_cor.js \"$$""@\"" >> a.out
         chmod a+x a.out
 ctxswitch-nodejs_await$(EXEEXT):
         $(BENCH_V_NODEJS)echo "#!/bin/sh" > a.out
         echo "nodejs $(srcdir)/ctxswitch/node_await.js" >> a.out
+        echo "nodejs $(srcdir)/ctxswitch/node_await.js \"$$""@\"" >> a.out
         chmod a+x a.out
 …
         $(BENCH_V_JAVAC)javac -d $(builddir) $(srcdir)/ctxswitch/JavaThread.java
         echo "#!/bin/sh" > a.out
         echo "java JavaThread" >> a.out
+        echo "java JavaThread \"$$""@\"" >> a.out
         chmod a+x a.out
 …
         $(BENCH_V_JAVAC)javac -d $(builddir) $(srcdir)/mutex/JavaThread.java
         echo "#!/bin/sh" > a.out
         echo "java JavaThread" >> a.out
+        echo "java JavaThread \"$$""@\"" >> a.out
         chmod a+x a.out
 …
         $(BENCH_V_JAVAC)javac -d $(builddir) $(srcdir)/schedint/JavaThread.java
         echo "#!/bin/sh" > a.out
         echo "java JavaThread" >> a.out
+        echo "java JavaThread \"$$""@\"" >> a.out
         chmod a+x a.out
 …
 creation-python_coroutine$(EXEEXT):
         $(BENCH_V_PY)echo "#!/bin/sh" > a.out
         echo "python3.7 $(srcdir)/creation/python_cor.py" >> a.out
+        echo "python3 $(srcdir)/creation/python_cor.py \"$$""@\"" >> a.out
         chmod a+x a.out
 creation-nodejs_coroutine$(EXEEXT):
         $(BENCH_V_NODEJS)echo "#!/bin/sh" > a.out
         echo "nodejs $(srcdir)/creation/node_cor.js" >> a.out
+        echo "nodejs $(srcdir)/creation/node_cor.js \"$$""@\"" >> a.out
         chmod a+x a.out
 …
         $(BENCH_V_JAVAC)javac -d $(builddir) $(srcdir)/creation/JavaThread.java
         echo "#!/bin/sh" > a.out
         echo "java JavaThread" >> a.out
+        echo "java JavaThread \"$$""@\"" >> a.out
         chmod a+x a.out
 …
 ## =========================================================================================================
 compile$(EXEEXT) :              \
+bcompile$(EXEEXT) :             \
         compile-array.make      \
         compile-attributes.make \
 …
 compile-array$(EXEEXT):
         $(CFACOMPILE) -fsyntax-only -w $(testdir)/array.cfa
+        $(CFACOMPILE) -DNO_COMPILED_PRAGMA -fsyntax-only -w $(testdir)/array.cfa
 compile-attributes$(EXEEXT):
         $(CFACOMPILE) -fsyntax-only -w $(testdir)/attributes.cfa
+        $(CFACOMPILE) -DNO_COMPILED_PRAGMA -fsyntax-only -w $(testdir)/attributes.cfa
 compile-empty$(EXEEXT):
         $(CFACOMPILE) -fsyntax-only -w $(srcdir)/compile/empty.cfa
+        $(CFACOMPILE) -DNO_COMPILED_PRAGMA -fsyntax-only -w $(srcdir)/compile/empty.cfa
 compile-expression$(EXEEXT):
         $(CFACOMPILE) -fsyntax-only -w $(testdir)/expression.cfa
+        $(CFACOMPILE) -DNO_COMPILED_PRAGMA -fsyntax-only -w $(testdir)/expression.cfa
 compile-io$(EXEEXT):
         $(CFACOMPILE) -fsyntax-only -w $(testdir)/io1.cfa
+        $(CFACOMPILE) -DNO_COMPILED_PRAGMA -fsyntax-only -w $(testdir)/io1.cfa
 compile-monitor$(EXEEXT):
         $(CFACOMPILE) -fsyntax-only -w $(testdir)/concurrent/monitor.cfa
+        $(CFACOMPILE) -DNO_COMPILED_PRAGMA -fsyntax-only -w $(testdir)/concurrent/monitor.cfa
 compile-operators$(EXEEXT):
         $(CFACOMPILE) -fsyntax-only -w $(testdir)/operators.cfa
+        $(CFACOMPILE) -DNO_COMPILED_PRAGMA -fsyntax-only -w $(testdir)/operators.cfa
 compile-thread$(EXEEXT):
         $(CFACOMPILE) -fsyntax-only -w $(testdir)/concurrent/thread.cfa
+        $(CFACOMPILE) -DNO_COMPILED_PRAGMA -fsyntax-only -w $(testdir)/concurrent/thread.cfa
 compile-typeof$(EXEEXT):
         $(CFACOMPILE) -fsyntax-only -w $(testdir)/typeof.cfa
+        $(CFACOMPILE) -DNO_COMPILED_PRAGMA -fsyntax-only -w $(testdir)/typeof.cfa
 ## =========================================================================================================
 …
 size-cfa$(EXEEXT):
         $(BENCH_V_CFA)$(CFACOMPILE) $(srcdir)/size/size.cfa
+## =========================================================================================================
+%-tokio$(EXEEXT): $(srcdir)/readyQ/%.rs $(srcdir)/bench.rs
+        cd $(builddir) && cargo build --release
+        cp $(builddir)/target/release/$(basename $@) $@

benchmark/creation/JavaThread.java

-              r3c64c668
+              r58fe85a
 public class JavaThread {
         // Simplistic low-quality Marsaglia Shift-XOR pseudo-random number generator.
         // Bijective
+        // Bijective
         // Cycle length for non-zero values is 4G-1.
         // 0 is absorbing and should be avoided -- fixed point.
         // The returned value is typically masked to produce a positive value.
         static volatile int Ticket = 0 ;
+        static volatile int Ticket = 0 ;
         private static int nextRandom (int x) {
                 if (x == 0) {
+                if (x == 0) {
                         // reseed the PRNG
                         // Ticket is accessed infrequently and does not constitute a coherence hot-spot.
                         // Note that we use a non-atomic racy increment -- the race is rare and benign.
                         // If the race is a concern switch to an AtomicInteger.
                         // In addition accesses to the RW volatile global "Ticket"  variable are not
                         // (readily) predictable at compile-time so the JIT will not be able to elide
                         // nextRandom() invocations.
                         x = ++Ticket ;
                         if (x == 0) x = 1 ;
+                        // Ticket is accessed infrequently and does not constitute a coherence hot-spot.
+                        // Note that we use a non-atomic racy increment -- the race is rare and benign.
+                        // If the race is a concern switch to an AtomicInteger.
+                        // In addition accesses to the RW volatile global "Ticket"  variable are not
+                        // (readily) predictable at compile-time so the JIT will not be able to elide
+                        // nextRandom() invocations.
+                        x = ++Ticket ;
+                        if (x == 0) x = 1 ;
+                }
                 x ^= x << 6;
                 x ^= x >>> 21;
                 x ^= x << 7;
                 return x ;
+                return x ;
+        }
         static int x = 2;
         static private int times = Integer.parseInt("10000") ;
+        static private long times = Long.parseLong("10000") ;
         public static class MyThread extends Thread {
 …
+        }
         public static void helper() throws InterruptedException {
                 for(int i = 1; i <= times; i += 1) {
+                for(long i = 1; i <= times; i += 1) {
                         MyThread m = new MyThread();
                         x = nextRandom( x );
 …
+        }
         public static void main(String[] args) throws InterruptedException {
                 if ( args.length > 2 ) System.exit( 1 );
                 if ( args.length == 2 ) { times = Integer.parseInt(args[1]); }
+                if ( args.length > 1 ) System.exit( 1 );
+                if ( args.length == 1 ) { times = Long.parseLong(args[0]); }
                 for (int i = Integer.parseInt("5"); --i >= 0 ; ) {
+                for (int i = Integer.parseInt("5"); --i >= 0 ; ) {
                         InnerMain();
                         Thread.sleep(2000);             // 2 seconds

benchmark/creation/cfa_gen.cfa

-              r3c64c668
+              r58fe85a
 #include "bench.h"
+#include "../bench.h"
 struct C {
+generator G {
         volatile int restart; // ensure compiler does not optimize away all the code
 };
 void ?{}( C & c ) { c.restart = 0; }
 void main( C & ) {}
+void ?{}( G & g ) { g.restart = 0; }
+void main( G & ) {}
 int main( int argc, char * argv[] ) {
 …
         BENCH(
                 for ( times ) {
                          C c;
+                         G g;
                 },
                 result

benchmark/ctxswitch/JavaThread.java

-              r3c64c668
+              r58fe85a
 public class JavaThread {
         // Simplistic low-quality Marsaglia Shift-XOR pseudo-random number generator.
         // Bijective
+        // Bijective
         // Cycle length for non-zero values is 4G-1.
         // 0 is absorbing and should be avoided -- fixed point.
         // The returned value is typically masked to produce a positive value.
         static volatile int Ticket = 0 ;
+        static volatile int Ticket = 0 ;
         private static int nextRandom (int x) {
                 if (x == 0) {
+                if (x == 0) {
                         // reseed the PRNG
                         // Ticket is accessed infrequently and does not constitute a coherence hot-spot.
                         // Note that we use a non-atomic racy increment -- the race is rare and benign.
                         // If the race is a concern switch to an AtomicInteger.
                         // In addition accesses to the RW volatile global "Ticket"  variable are not
                         // (readily) predictable at compile-time so the JIT will not be able to elide
                         // nextRandom() invocations.
                         x = ++Ticket ;
                         if (x == 0) x = 1 ;
+                        // Ticket is accessed infrequently and does not constitute a coherence hot-spot.
+                        // Note that we use a non-atomic racy increment -- the race is rare and benign.
+                        // If the race is a concern switch to an AtomicInteger.
+                        // In addition accesses to the RW volatile global "Ticket"  variable are not
+                        // (readily) predictable at compile-time so the JIT will not be able to elide
+                        // nextRandom() invocations.
+                        x = ++Ticket ;
+                        if (x == 0) x = 1 ;
+                }
                 x ^= x << 6;
                 x ^= x >>> 21;
                 x ^= x << 7;
                 return x ;
+                return x ;
+        }
         static int x = 2;
         static private int times = Integer.parseInt("100000");
+        static private long times = Long.parseLong("100000");
         public static void helper() {
                 for(int i = 1; i <= times; i += 1) {
+                for(long i = 1; i <= times; i += 1) {
                         Thread.yield();
+                }
 …
+        }
         public static void main(String[] args) throws InterruptedException {
                 if ( args.length > 2 ) System.exit( 1 );
                 if ( args.length == 2 ) { times = Integer.parseInt(args[1]); }
+                if ( args.length > 1 ) System.exit( 1 );
+                if ( args.length == 1 ) { times = Long.parseLong(args[0]); }
                 for (int i = Integer.parseInt("5"); --i >= 0 ; ) {

benchmark/ctxswitch/cfa_cor.cfa

-              r3c64c668
+              r58fe85a
 #include <thread.hfa>
 #include "bench.h"
+#include "../bench.h"
 coroutine C {} c;
+coroutine C {};
 void main( __attribute__((unused)) C & ) {
         while () {
                 suspend();
+        for () {
+                suspend;
+        }
+}
 int main( int argc, char * argv[] ) {
+        C c;
         BENCH_START()
         BENCH(

benchmark/ctxswitch/cfa_gen.cfa

-              r3c64c668
+              r58fe85a
 #include "../bench.h"
+typedef struct {
+        void * next;
+} C;
+void comain( C * c ) {
+        if ( __builtin_expect(c->next != 0, 1) ) goto *(c->next);
+        c->next = &&s1;
+generator G {};
+void main( G & ) {
         for () {
+                return;
+          s1: ;
+                suspend;
+        }
+}
 int main( int argc, char * argv[] ) {
+        G g;
         BENCH_START()
-        C c = { 0 };
         BENCH(
                 for ( times ) {
                         comain( &c );
+                        resume( g );
                 },
                 result

benchmark/exclude

r3c64c668	r58fe85a
10	10	interrupt_linux.c
11	11	exclude
	12	io
12	13	Monitor.c

benchmark/mutex/JavaThread.java

-              r3c64c668
+              r58fe85a
 public class JavaThread {
         // Simplistic low-quality Marsaglia Shift-XOR pseudo-random number generator.
         // Bijective
+        // Bijective
         // Cycle length for non-zero values is 4G-1.
         // 0 is absorbing and should be avoided -- fixed point.
         // The returned value is typically masked to produce a positive value.
         static volatile int Ticket = 0 ;
+        static volatile int Ticket = 0 ;
         private static int nextRandom (int x) {
                 if (x == 0) {
+                if (x == 0) {
                         // reseed the PRNG
                         // Ticket is accessed infrequently and does not constitute a coherence hot-spot.
                         // Note that we use a non-atomic racy increment -- the race is rare and benign.
                         // If the race is a concern switch to an AtomicInteger.
                         // In addition accesses to the RW volatile global "Ticket"  variable are not
                         // (readily) predictable at compile-time so the JIT will not be able to elide
                         // nextRandom() invocations.
                         x = ++Ticket ;
                         if (x == 0) x = 1 ;
+                        // Ticket is accessed infrequently and does not constitute a coherence hot-spot.
+                        // Note that we use a non-atomic racy increment -- the race is rare and benign.
+                        // If the race is a concern switch to an AtomicInteger.
+                        // In addition accesses to the RW volatile global "Ticket"  variable are not
+                        // (readily) predictable at compile-time so the JIT will not be able to elide
+                        // nextRandom() invocations.
+                        x = ++Ticket ;
+                        if (x == 0) x = 1 ;
+                }
                 x ^= x << 6;
                 x ^= x >>> 21;
                 x ^= x << 7;
                 return x ;
+                return x ;
+        }
         static int x = 2;
         static private int times = Integer.parseInt("100000000");
+        static private long times = Long.parseLong("100000000");
         public synchronized void noop() {
 …
                 JavaThread j = new JavaThread();
                 // Inhibit biased locking ...
                 x = (j.hashCode() ^ System.identityHashCode(j)) | 1 ;
                 for(int i = 1; i <= times; i += 1) {
+                x = (j.hashCode() ^ System.identityHashCode(j)) | 1 ;
+                for(long i = 1; i <= times; i += 1) {
                         x = nextRandom(x);
                         j.noop();
 …
+        }
         public static void main(String[] args) throws InterruptedException {
                 if ( args.length > 2 ) System.exit( 1 );
                 if ( args.length == 2 ) { times = Integer.parseInt(args[1]); }
+                if ( args.length > 1 ) System.exit( 1 );
+                if ( args.length == 1 ) { times = Long.parseLong(args[0]); }
                 for (int n = Integer.parseInt("5"); --n >= 0 ; ) {
+                for (int n = Integer.parseInt("5"); --n >= 0 ; ) {
                         InnerMain();
                         Thread.sleep(2000);     // 2 seconds

benchmark/mutexC/JavaThread.java

-              r3c64c668
+              r58fe85a
 class Noop {
         // Simplistic low-quality Marsaglia Shift-XOR pseudo-random number generator.
         // Bijective
+        // Bijective
         // Cycle length for non-zero values is 4G-1.
         // 0 is absorbing and should be avoided -- fixed point.
         // The returned value is typically masked to produce a positive value.
         static volatile int Ticket = 0 ;
+        static volatile int Ticket = 0 ;
         public static int nextRandom( int x ) {
                 if (x == 0) {
+                if (x == 0) {
                         // reseed the PRNG
                         // Ticket is accessed infrequently and does not constitute a coherence hot-spot.
                         // Note that we use a non-atomic racy increment -- the race is rare and benign.
                         // If the race is a concern switch to an AtomicInteger.
                         // In addition accesses to the RW volatile global "Ticket"  variable are not
                         // (readily) predictable at compile-time so the JIT will not be able to elide
                         // nextRandom() invocations.
                         x = ++Ticket ;
                         if (x == 0) x = 1 ;
+                        // Ticket is accessed infrequently and does not constitute a coherence hot-spot.
+                        // Note that we use a non-atomic racy increment -- the race is rare and benign.
+                        // If the race is a concern switch to an AtomicInteger.
+                        // In addition accesses to the RW volatile global "Ticket"  variable are not
+                        // (readily) predictable at compile-time so the JIT will not be able to elide
+                        // nextRandom() invocations.
+                        x = ++Ticket ;
+                        if (x == 0) x = 1 ;
+                }
                 x ^= x << 6;
                 x ^= x >>> 21;
                 x ^= x << 7;
                 return x ;
+                return x ;
+        }
+}
 …
         static int x = 2;
         static private int times = Integer.parseInt("10000000");
+        static private long times = Long.parseLong("10000000");
         public static void call( Monitor m ) throws InterruptedException {
 …
                 m.go = true;
                 //while ( ! m.go2 );
                 for ( int i = 0; i < times; i += 1 ) {
+                for ( long i = 0; i < times; i += 1 ) {
                         m.call();
                         x = Noop.nextRandom( x );
 …
         public static void main( String[] args ) throws InterruptedException {
                 if ( args.length > 2 ) System.exit( 1 );
                 if ( args.length == 2 ) { times = Integer.parseInt(args[1]); }
+                if ( args.length == 2 ) { times = Long.parseLong(args[1]); }
+                if ( args.length > 2 ) System.exit( 1 );
+                if ( args.length == 2 ) { times = Integer.parseInt(args[1]); }
+                for ( int i = Integer.parseInt("5"); --i >= 0 ; ) {
+                for ( int i = Integer.parseInt("5"); --i >= 0 ; ) {
                         InnerMain();
                         // Thread.sleep(2000);  // 2 seconds

benchmark/schedint/JavaThread.java

-              r3c64c668
+              r58fe85a
 public class JavaThread {
         // Simplistic low-quality Marsaglia Shift-XOR pseudo-random number generator.
         // Bijective
+        // Bijective
         // Cycle length for non-zero values is 4G-1.
         // 0 is absorbing and should be avoided -- fixed point.
         // The returned value is typically masked to produce a positive value.
         static volatile int Ticket = 0 ;
+        static volatile int Ticket = 0 ;
         private static int nextRandom (int x) {
                 if (x == 0) {
+                if (x == 0) {
                         // reseed the PRNG
                         // Ticket is accessed infrequently and does not constitute a coherence hot-spot.
                         // Note that we use a non-atomic racy increment -- the race is rare and benign.
                         // If the race is a concern switch to an AtomicInteger.
                         // In addition accesses to the RW volatile global "Ticket"  variable are not
                         // (readily) predictable at compile-time so the JIT will not be able to elide
                         // nextRandom() invocations.
                         x = ++Ticket ;
                         if (x == 0) x = 1 ;
+                        // Ticket is accessed infrequently and does not constitute a coherence hot-spot.
+                        // Note that we use a non-atomic racy increment -- the race is rare and benign.
+                        // If the race is a concern switch to an AtomicInteger.
+                        // In addition accesses to the RW volatile global "Ticket"  variable are not
+                        // (readily) predictable at compile-time so the JIT will not be able to elide
+                        // nextRandom() invocations.
+                        x = ++Ticket ;
+                        if (x == 0) x = 1 ;
+                }
                 x ^= x << 6;
                 x ^= x >>> 21;
                 x ^= x << 7;
                 return x ;
+                return x ;
+        }
         static int x = 2;
         static private int times = Integer.parseInt("1000000");
+        static private long times = Long.parseLong("1000000");
         public static void helper( Monitor m ) throws InterruptedException {
                 for(int i = 1; i <= times; i += 1) {
+                for(long i = 1; i <= times; i += 1) {
                         m.wait();               // release monitor lock
                         m.next = true;
 …
+        }
         public static void main(String[] args) throws InterruptedException {
                 if ( args.length > 2 ) System.exit( 1 );
                 if ( args.length == 2 ) { times = Integer.parseInt(args[1]); }
+                if ( args.length > 1 ) System.exit( 1 );
+                if ( args.length == 1 ) { times = Long.parseLong(args[0]); }
                 for (int n = Integer.parseInt("5"); --n >= 0 ; ) {
+                for (int n = Integer.parseInt("5"); --n >= 0 ; ) {
                         InnerMain();
                         Thread.sleep(2000);     // 2 seconds

configure.ac

-              r3c64c668
+              r58fe85a
 AC_PREREQ([2.68])
 AC_INIT([cfa-cc],[1.0.0.0],[cforall@plg.uwaterloo.ca])
+AC_INIT([cfa-cc],[1.0.0],[cforall@plg.uwaterloo.ca])
 AC_CONFIG_AUX_DIR([automake])
 AC_CONFIG_MACRO_DIRS([automake])
-#AC_CONFIG_SRCDIR([src/main.cc])
 AC_CONFIG_HEADERS([config.h:src/config.h.in])
 AM_SILENT_RULES([yes])
 m4_include([automake/cfa.m4])
+m4_include([tools/build/cfa.m4])
 # don't use the default CFLAGS as they unconditionally add -O2
 : ${CFLAGS=""}
+: ${CXXFLAGS=""}
 AM_INIT_AUTOMAKE([subdir-objects])
 …
 #Transforming cc1 will break compilation
 M4CFA_PROGRAM_NAME
+#==============================================================================
+# New AST toggling support
+AH_TEMPLATE([CFA_USE_NEW_AST],[Sets whether or not to use the new-ast, this is a default value and can be overridden by --old-ast and --new-ast])
+DEFAULT_NEW_AST="True"
+AC_ARG_ENABLE(new-ast,
+        [  --enable-new-ast     whether or not to use new ast as the default AST algorithm],
+        [case "${enableval}" in
+                yes) newast=true ; DEFAULT_NEW_AST="True"  ;;
+                no)  newast=false; DEFAULT_NEW_AST="False" ;;
+                *) AC_MSG_ERROR([bad value ${enableval} for --enable-new-ast]) ;;
+        esac],[newast=true])
+AC_DEFINE_UNQUOTED([CFA_USE_NEW_AST], $newast)
+AC_SUBST(DEFAULT_NEW_AST)
 #==============================================================================
 …
         enable_distcc=$enableval, enable_distcc=no)
+AC_ARG_WITH(bwlimit,
+        [  --with-bwlimit=RATE     RATE the maximum rate at which rsync will be limited when using distributed builds],
+        [], [])
 AM_CONDITIONAL([ENABLE_DISTCC], [test x$enable_distcc = xyes])
 HAS_DISTCC="False"
 …
 # Create variables for commonly used targets
 TOP_SRCDIR="$(readlink -m $ac_confdir/)/"
 TOP_BUILDDIR="$(readlink -m $ac_pwd/)/"
+TOP_SRCDIR="$(readlink -e $ac_abs_confdir/)/"
+TOP_BUILDDIR="$(readlink -e $ac_pwd/)/"
 AC_DEFINE_UNQUOTED(TOP_SRCDIR, "$TOP_SRCDIR", [Top src directory])
 …
                 \'--enable-gprofiler=*) ;;
                 \'--disable-gprofiler) ;;
+                # skip the new-ast toggles
+                \'--enable-new-ast=*) ;;
+                \'--disable-new-ast) ;;
+                # skip this, it only causes problems
+                \'--srcdir=*) ;;
                 # append all other arguments to the sub configure arguments
 …
         LIBCFA_TARGET_DIRS="${LIBCFA_TARGET_DIRS} ${lib_dir}"
+        LIBCFA_1TARGET_DIR="${lib_dir}"
         LIBCFA_TARGET_MAKEFILES="${LIBCFA_TARGET_MAKEFILES} ${lib_dir}/Makefile"
 …
 AC_SUBST(LIBCFA_TARGET_DIRS)
+AC_SUBST(LIBCFA_1TARGET_DIR)
 AC_SUBST(LIBCFA_TARGET_MAKEFILES)
 …
         driver/Makefile
         src/Makefile
         benchmark/Makefile
+        libcfa/Makefile:libcfa/Makefile.dist.in
         tests/Makefile
-        longrun_tests/Makefile
-        tools/Makefile
-        tools/prettyprinter/Makefile
         ])
+# Some of our makefiles don't need to be distributed
+AM_CONDITIONAL([CFORALL_DISTRIBUTE], [test -e $TOP_SRCDIR/autogen.sh])
+AM_COND_IF([CFORALL_DISTRIBUTE], [
+        AC_CONFIG_FILES([
+                longrun_tests/Makefile
+                benchmark/Makefile
+                benchmark/io/http/Makefile
+                tools/Makefile
+                tools/prettyprinter/Makefile
+        ])
+        AC_OUTPUT(benchmark/Cargo.toml)
+])
 AC_CONFIG_LINKS([tests/test.py:tests/test.py])

doc/LaTeXmacros/common.tex

-              r3c64c668
+              r58fe85a
 %% Created On       : Sat Apr  9 10:06:17 2016
 %% Last Modified By : Peter A. Buhr
 %% Last Modified On : Fri May 24 07:59:54 2019
 %% Update Count     : 382
+%% Last Modified On : Mon Oct  5 09:34:46 2020
+%% Update Count     : 464
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 …
 % Names used in the document.
+\usepackage{xspace}
 \newcommand{\CFAIcon}{\textsf{C}\raisebox{\depth}{\rotatebox{180}{\textsf{A}}}\xspace} % Cforall symbolic name
 \newcommand{\CFA}{\protect\CFAIcon}             % safe for section/caption
 …
 \newlength{\parindentlnth}
 \setlength{\parindentlnth}{\parindent}
-\newcommand{\LstBasicStyle}[1]{{\lst@basicstyle{#1}}}
-\newcommand{\LstKeywordStyle}[1]{{\lst@basicstyle{\lst@keywordstyle{#1}}}}
-\newcommand{\LstCommentStyle}[1]{{\lst@basicstyle{\lst@commentstyle{#1}}}}
-\newlength{\gcolumnposn}                                % temporary hack because lstlisting does not handle tabs correctly
-\newlength{\columnposn}
-\setlength{\gcolumnposn}{2.75in}
-\setlength{\columnposn}{\gcolumnposn}
-\newcommand{\C}[2][\@empty]{\ifx#1\@empty\else\global\setlength{\columnposn}{#1}\global\columnposn=\columnposn\fi\hfill\makebox[\textwidth-\columnposn][l]{\lst@basicstyle{\LstCommentStyle{#2}}}}
-\newcommand{\CRT}{\global\columnposn=\gcolumnposn}
-% allow escape sequence in lstinline
-%\usepackage{etoolbox}
-%\patchcmd{\lsthk@TextStyle}{\let\lst@DefEsc\@empty}{}{}{\errmessage{failed to patch}}
 \usepackage{pslatex}                                    % reduce size of san serif font
 …
 }%
+\usepackage{listings}                                                                   % format program code
 \usepackage{lstlang}
+\newcommand{\CFADefaults}{%
+\makeatletter
+\newcommand{\LstBasicStyle}[1]{{\lst@basicstyle{#1}}}
+\newcommand{\LstKeywordStyle}[1]{{\lst@basicstyle{\lst@keywordstyle{#1}}}}
+\newcommand{\LstCommentStyle}[1]{{\lst@basicstyle{\lst@commentstyle{#1}}}}
+\newlength{\gcolumnposn}                                % temporary hack because lstlisting does not handle tabs correctly
+\newlength{\columnposn}
+\setlength{\gcolumnposn}{2.75in}
+\setlength{\columnposn}{\gcolumnposn}
+\newcommand{\C}[2][\@empty]{\ifx#1\@empty\else\global\setlength{\columnposn}{#1}\global\columnposn=\columnposn\fi\hfill\makebox[\textwidth-\columnposn][l]{\lst@basicstyle{\LstCommentStyle{#2}}}}
+\newcommand{\CRT}{\global\columnposn=\gcolumnposn}
+% allow escape sequence in lstinline
+%\usepackage{etoolbox}
+%\patchcmd{\lsthk@TextStyle}{\let\lst@DefEsc\@empty}{}{}{\errmessage{failed to patch}}
+% allow adding to lst literate
+\def\addToLiterate#1{\protect\edef\lst@literate{\unexpanded\expandafter{\lst@literate}\unexpanded{#1}}}
+\lst@Key{add to literate}{}{\addToLiterate{#1}}
+\makeatother
+\newcommand{\CFAStyle}{%
 \lstset{
-language=CFA,
 columns=fullflexible,
 basicstyle=\linespread{0.9}\sf,                 % reduce line spacing and use sanserif font
 …
 belowskip=3pt,
 % replace/adjust listing characters that look bad in sanserif
 literate={-}{\makebox[1ex][c]{\raisebox{0.4ex}{\rule{0.8ex}{0.1ex}}}}1 {^}{\raisebox{0.6ex}{$\scriptscriptstyle\land\,$}}1
+literate={-}{\makebox[1ex][c]{\raisebox{0.4ex}{\rule{0.75ex}{0.1ex}}}}1 {^}{\raisebox{0.6ex}{$\scriptscriptstyle\land\,$}}1
         {~}{\raisebox{0.3ex}{$\scriptstyle\sim\,$}}1 {`}{\ttfamily\upshape\hspace*{-0.1ex}`}1
         {<-}{$\leftarrow$}2 {=>}{$\Rightarrow$}2 {->}{\makebox[1ex][c]{\raisebox{0.4ex}{\rule{0.8ex}{0.075ex}}}\kern-0.2ex\textgreater}2,
+}% lstset
+}% CFAStyle
+\ifdefined\CFALatin% extra Latin-1 escape characters
+\lstnewenvironment{cfa}[1][]{
+\lstset{
+language=CFA,
 moredelim=**[is][\color{red}]{®}{®},    % red highlighting ®...® (registered trademark symbol) emacs: C-q M-.
 moredelim=**[is][\color{blue}]{ß}{ß},   % blue highlighting ß...ß (sharp s symbol) emacs: C-q M-_
 moredelim=**[is][\color{OliveGreen}]{¢}{¢}, % green highlighting ¢...¢ (cent symbol) emacs: C-q M-"
 moredelim=[is][\lstset{keywords={}}]{¶}{¶}, % keyword escape ¶...¶ (pilcrow symbol) emacs: C-q M-^
+% replace/adjust listing characters that look bad in sanserif
+add to literate={`}{\ttfamily\upshape\hspace*{-0.1ex}`}1
 }% lstset
+}% CFADefaults
+\newcommand{\CFAStyle}{%
+\CFADefaults
+\lstset{#1}
+}{}
 % inline code ©...© (copyright symbol) emacs: C-q M-)
 \lstMakeShortInline©                                    % single-character for \lstinline
+}% CFAStyle
+\lstnewenvironment{cfa}[1][]
+{\CFADefaults\lstset{#1}}
+{}
+\else% regular ASCII characters
+\lstnewenvironment{cfa}[1][]{
+\lstset{
+language=CFA,
+escapechar=\$,                                                  % LaTeX escape in CFA code
+moredelim=**[is][\color{red}]{@}{@},    % red highlighting @...@
+}% lstset
+\lstset{#1}
+}{}
+% inline code @...@ (at symbol)
+\lstMakeShortInline@                                    % single-character for \lstinline
+\fi%
 % Local Variables: %

doc/LaTeXmacros/lstlang.sty

-              r3c64c668
+              r58fe85a
 %% Created On       : Sat May 13 16:34:42 2017
 %% Last Modified By : Peter A. Buhr
 %% Last Modified On : Tue Jan  8 14:40:33 2019
 %% Update Count     : 21
+%% Last Modified On : Wed Sep 23 22:40:04 2020
+%% Update Count     : 24
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 …
                 auto, _Bool, catch, catchResume, choose, _Complex, __complex, __complex__, __const, __const__,
                 coroutine, disable, dtype, enable, exception, __extension__, fallthrough, fallthru, finally,
                 __float80, float80, __float128, float128, forall, ftype, _Generic, _Imaginary, __imag, __imag__,
+                __float80, float80, __float128, float128, forall, ftype, generator, _Generic, _Imaginary, __imag, __imag__,
                 inline, __inline, __inline__, __int128, int128, __label__, monitor, mutex, _Noreturn, one_t, or,
                 otype, restrict, __restrict, __restrict__, __signed, __signed__, _Static_assert, thread,
+                otype, restrict, __restrict, __restrict__, __signed, __signed__, _Static_assert, suspend, thread,
                 _Thread_local, throw, throwResume, timeout, trait, try, ttype, typeof, __typeof, __typeof__,
                 virtual, __volatile, __volatile__, waitfor, when, with, zero_t,
 …
 % C++ programming language
+\lstdefinelanguage{C++}[ANSI]{C++}{}
+\lstdefinelanguage{C++}[ANSI]{C++}{
+        morekeywords={nullptr,}
+}
 % uC++ programming language, based on ANSI C++

doc/bibliography/pl.bib

-              r3c64c668
+              r58fe85a
 %    Predefined journal names:
 %  acmcs: Computing Surveys             acta: Acta Informatica
-@string{acta="Acta Infomatica"}
 %  cacm: Communications of the ACM
 %  ibmjrd: IBM J. Research & Development ibmsj: IBM Systems Journal
 …
 %  tcs: Theoretical Computer Science
+@string{acta="Acta Informatica"}
 string{ieeepds="IEEE Transactions on Parallel and Distributed Systems"}
 @string{ieeepds="IEEE Trans. Parallel Distrib. Syst."}
 …
     series      = {ACM Distinguished Dissertations},
     year        = 1983,
+}
+@article{Zhang19,
+    keywords    = {Algebraic effects, dynamic scoping, exceptions, parametricity, type systems},
+    author      = {Zhang, Yizhou and Myers, Andrew C.},
+    title       = {Abstraction-safe Effect Handlers via Tunneling},
+    journal     = {Proc. ACM Program. Lang.},
+    issue_date  = {January 2019},
+    volume      = {3},
+    number      = {POPL},
+    month       = jan,
+    year        = {2019},
+    issn        = {2475-1421},
+    pages       = {5:1--5:29},
+    articleno   = {5},
+    publisher   = {ACM},
+    address     = {New York, NY, USA},
+}
+@inproceedings{Zhang16,
+    keywords    = {Exception tunneling, Genus, exception handling},
+    author      = {Zhang, Yizhou and Salvaneschi, Guido and Beightol, Quinn and Liskov, Barbara and Myers, Andrew C.},
+    title       = {Accepting Blame for Safe Tunneled Exceptions},
+    booktitle   = {Proceedings of the 37th ACM SIGPLAN Conference on Programming Language Design and Implementation},
+    series      = {PLDI'16},
+    year        = {2016},
+    location    = {Santa Barbara, CA, USA},
+    pages       = {281--295},
+    publisher   = {ACM},
+    address     = {New York, NY, USA},
+}
 …
     journal     = sigplan,
     year        = 1981,
+    month       = feb, volume = 16, number = 2, pages = {48-52},
+    month       = feb,
+    volume      = 16,
+    number      = 2,
+    pages       = {48-52},
     comment     = {
         A one-pass, top-down algorithm for overload resolution.  Input is a
 …
     title       = {An Alternative to Subclassing},
     journal     = sigplan,
+    volume      = {21},    number = {11},
+    volume      = {21},
+    number      = {11},
     pages       = {424-428},
+    month       = nov, year = 1986,
+    month       = nov,
+    year        = 1986,
     comment     = {
         The Smalltalk class hierarchy has three uses: factoring out code;
 …
     isbn        = {3-540-66538-2},
     location    = {Toulouse, France},
-    doi         = {http://doi.acm.org/10.1145/318773.319251},
     publisher   = {Springer},
     address     = {London, UK},
 …
     year        = 2010,
     pages       = {39--50},
-    numpages    = {12},
     publisher   = {IEEE Computer Society},
     address     = {Washington, DC, USA},
 …
+}
+@manual{C99,
+    keywords    = {ISO/IEC C 9899},
+    contributer = {pabuhr@plg},
+    key         = {C99},
+    title       = {C Programming Language {ISO/IEC} 9899:1999(E)},
+    edition     = {2nd},
+    organization= {International Standard Organization},
+    address     = {Geneva, Switzerland},
+    year        = 1999,
+    note        = {\href{https://webstore.ansi.org/Standards/INCITS/INCITSISOIEC98991999R2005}{https://webstore.ansi.org/\-Standards/\-INCITS/\-INCITSISOIEC98991999R2005}},
+}
 @manual{C11,
     keywords    = {ISO/IEC C 11},
 …
     title       = {C Programming Language {ISO/IEC} 9899:2011-12},
     edition     = {3rd},
     publisher   = {International Standard Organization},
     address     = {\href{https://www.iso.org/standard/57853.html}{https://\-www.iso.org/\-standard/\-57853.html}},
+    organization= {International Standard Organization},
+    address     = {Geneva, Switzerland},
     year        = 2012,
+    note        = {\href{https://www.iso.org/standard/57853.html}{https://\-www.iso.org/\-standard/\-57853.html}},
+}
 …
     key         = {Concepts},
     title       = {{C}{\kern-.1em\hbox{\large\texttt{+\kern-.25em+}}} Programming language -- Extensions for concepts {ISO/IEC} {TS} 19217:2015},
     publisher   = {International Standard Organization},
     address     = {\href{https://www.iso.org/standard/64031.html}{https://\-www.iso.org/\-standard/\-64031.html}},
+    organization= {International Standard Organization},
+    address     = {Geneva, Switzerland},
     year        = 2015,
+    note        = {\href{https://www.iso.org/standard/64031.html}{https://\-www.iso.org/\-standard/\-64031.html}},
+}
 …
+}
 @misc{CforallBenchMarks,
+@misc{CforallConcurrentBenchmarks,
     contributer = {pabuhr@plg},
     key         = {Cforall Benchmarks},
     author      = {{\textsf{C}{$\mathbf{\forall}$} Benchmarks}},
     howpublished= {\href{https://plg.uwaterloo.ca/~cforall/benchmark.tar}{https://\-plg.uwaterloo.ca/\-$\sim$cforall/\-benchmark.tar}},
+    howpublished= {\href{https://github.com/cforall/ConcurrentBenchmarks_SPE20}{https://\-github.com/\-cforall/\-ConcurrentBenchmarks\_SPE20}},
+}
 …
     title       = {C\# Language Specification, Standard ECMA-334},
     organization= {ECMA International Standardizing Information and Communication Systems},
+    address     = {Geneva, Switzerland},
     month       = jun,
     year        = 2006,
 …
     title       = {Programming Languages -- {Cobol} ISO/IEC 1989:2014},
     edition     = {2nd},
     institution = {International Standard Organization},
     address     = {\href{https://www.iso.org/standard/51416.html}{https://\-www.iso.org/\-standard/\-51416.html}},
+    organization= {International Standard Organization},
+    address     = {Geneva, Switzerland},
     year        = 2014,
+    note        = {\href{https://www.iso.org/standard/51416.html}{https://\-www.iso.org/\-standard/\-51416.html}},
+}
 …
     location    = {London, United Kingdom},
     pages       = {41--53},
-    numpages    = {13},
-    url         = {http://doi.acm.org/10.1145/360204.360207},
-    doi         = {10.1145/360204.360207},
-    acmid       = {360207},
     publisher   = {ACM},
     address     = {New York, NY, USA},
 …
     title       = {$\mu${C}{\kern-.1em\hbox{\large\texttt{+\kern-.25em+}}} Annotated Reference Manual, Version 7.0.0},
     organization= {University of Waterloo},
+    address     = {Waterloo Ontario, Canada},
     month       = sep,
     year        = 2018,
 …
     title       = {Cooperating Sequential Processes},
     institution = {Technological University},
     address     = {Eindhoven, Netherlands},
+    address     = {Eindhoven, Neth.},
     year        = 1965,
     note        = {Reprinted in \cite{Genuys68} pp. 43--112.}
 …
     author      = {Adya, Atul and Howell, Jon and Theimer, Marvin and Bolosky, William J. and Douceur, John R.},
     title       = {Cooperative Task Management Without Manual Stack Management},
     booktitle   = {Proceedings of the General Track of the Annual Conference on USENIX Annual Technical Conference},
+    booktitle   = {Proc. of the General Track USENIX Tech. Conf.},
     series      = {ATEC '02},
     year        = {2002},
 …
     author      = {Walter Bright and Andrei Alexandrescu},
     organization= {Digital Mars},
+    address     = {Vienna Virginia, U.S.A.},
     year        = 2016,
     note        = {\href{http://dlang.org/spec/spec.html}{http://\-dlang.org/\-spec/\-spec.html}},
 …
     year        = 1993,
     pages       = {201--208},
-    url         = {http://doi.acm.org/10.1145/155360.155580},
     publisher   = {ACM},
     address     = {New York, NY, USA},
 …
     location    = {Boulder, Colorado, USA},
     pages       = {91--97},
-    numpages    = {7},
     publisher   = {ACM},
     address     = {New York, NY, USA},
 …
     issn        = {0004-5411},
     pages       = {215--225},
-    numpages    = {11},
-    url         = {http://doi.acm.org/10.1145/321879.321884},
-    doi         = {10.1145/321879.321884},
-    acmid       = {321884},
     publisher   = {ACM},
     address     = {New York, NY, USA},
 …
+}
+@misc{Drepper13,
+    keywords    = {thread-local storage},
+    contributer = {pabuhr@plg},
+    author      = {Ulrich Drepper},
+    title       = {{ELF} Handling For Thread-Local Storage},
+    year        = 2013,
+    month       = aug,
+    note        = {WikipediA},
+    howpublished= {\href{http://www.akkadia.org/drepper/tls.pdf}
+                  {http://\-www.akkadia.org/\-drepper/\-tls.pdf}},
+}
 @misc{Turley99,
     keywords    = {embedded system, microprocessor},
 …
     howpublished= {\href{https://www.eetimes.com/author.asp?sectionid=36&doc_id=1287712}
                   {https://\-www.eetimes.com/\-author.asp?sectionid=\-36&doc_id=1287712}},
+}
+@article{Xiao19,
+    keywords    = {bug classification, fault trigger, Linux operating system, regression bug},
+    contributer = {pabuhr@plg},
+    author      = {Guanping Xiao and Zheng Zheng and Beibei Yin and Kishor S. Trivedi and Xiaoting Du and Kai-Yuan Cai},
+    title       = {An Empirical Study of Fault Triggers in the Linux Operating System: An Evolutionary Perspective},
+    journal     = {IEEE Transactions on Reliability},
+    month       = dec,
+    year        = 2019,
+    volume      = 68,
+    number      = 4,
+    pages       = {1356-1383},
+}
 …
+}
+@inproceedings{Palix11,
+    keywords    = {Linux, fault-finding tools},
+    contributer = {pabuhr@plg},
+    author      = {Nicolas Palix and Ga\"el Thomas and Suman Saha and Christophe Calv\`es and Julia Lawall and Gilles Muller},
+    title       = {Faults in Linux: Ten Years Later},
+    booktitle   = {Proc. of the 16th International Conf. on Arch. Support for Prog. Lang. and Oper. Sys.},
+    series      = {ASPLOS'11},
+    month       = mar,
+    year        = 2011,
+    location    = {Newport Beach, California, USA},
+    pages       = {305-318},
+    publisher   = {ACM},
+    address     = {New York, NY, USA},
+}
 @article{Lamport87,
     keywords    = {software solutions, mutual exclusion, fast},
 …
     issn        = {0001-0782},
     pages       = {107--115},
-    numpages    = {9},
-    url         = {http://doi.acm.org/10.1145/1538788.1538814},
-    doi         = {10.1145/1538788.1538814},
-    acmid       = {1538814},
     publisher   = {ACM},
     address     = {New York, NY, USA},
 …
     title       = {Programming Languages -- {Fortran} Part 1:Base Language ISO/IEC 1539-1:2010},
     edition     = {3rd},
     publisher   = {International Standard Organization},
     address     = {\href{https://www.iso.org/standard/50459.html}{https://\-www.iso.org/\-standard/\-50459.html}},
+    organization= {International Standard Organization},
+    address     = {Geneva, Switzerland},
     year        = 2010,
+    note        = {\href{https://www.iso.org/standard/50459.html}{https://\-www.iso.org/\-standard/\-50459.html}},
+}
 …
     title       = {Programming Languages -- {Fortran} Part 1:Base Language ISO/IEC 1539-1:2018},
     edition     = {4th},
     publisher   = {International Standard Organization},
     address     = {\href{https://www.iso.org/standard/72320.html}{https://\-www.iso.org/\-standard/\-72320.html}},
+    organization= {International Standard Organization},
+    address     = {Geneva, Switzerland},
     year        = 2018,
+    note        = {\href{https://www.iso.org/standard/72320.html}{https://\-www.iso.org/\-standard/\-72320.html}},
+}
 …
+}
+@mastersthesis{Radhakrishnan19,
+    author      = {Srihari Radhakrishnan},
+    title       = {High Performance Web Servers: A Study In Concurrent Programming Models},
+    school      = {School of Computer Sc., University of Waterloo},
+    year        = 2019,
+    optaddress  = {Waterloo, Ontario, Canada, N2L 3G1},
+    note        = {\href{https://uwspace.uwaterloo.ca/handle/10012/14706}{https://\-uwspace.uwaterloo.ca/\-handle/\-10012/\-14706}},
+}
 @article{katzenelson83b,
     contributer = {gjditchfield@plg},
 …
     pages       = {115-138},
     year        = 1971,
+}
+@inproceedings{Hagersten03,
+    keywords    = {cache storage, parallel architectures, performance evaluation, shared memory systems},
+    author      = {Zoran Radovi\'{c} and Erik Hagersten},
+    title       = {Hierarchical backoff locks for nonuniform communication architectures},
+    booktitle   = {Proceedings of the Ninth International Symposium on High-Performance Computer Architecture},
+    year        = {2003},
+    location    = {Anaheim, CA, USA},
+    pages       = {241-252},
+    publisher   = {IEEE},
+}
 …
+}
+@misc{gccValueLabels,
+    keywords    = {gcc extension, value labels},
+    contributer = {pabuhr@plg},
+    key         = {Labels as Values},
+    author      = {{gcc Extension}},
+    title       = {Labels as Values},
+    year        = {since gcc-3},
+    howpublished= {\href{https://gcc.gnu.org/onlinedocs/gcc/Labels-as-Values.html}
+                  {https:\-//gcc.gnu.org/\-onlinedocs/\-gcc/\-Labels-as-Values.html}},
+}
 @mastersthesis{Clarke90,
     keywords    = {concurrency, postponing requests},
 …
+}
+@misc{libfibre,
+    key         = {libfibre},
+    author      = {Martin Karsten},
+    title       = {{libfibre:~User-Level Threading Runtime}},
+    howpublished= {\href{https://git.uwaterloo.ca/mkarsten/libfibre}
+                  {https://\-git.uwaterloo.ca/\-mkarsten/\-libfibre}},
+    note        = {[Online; accessed 2020-04-15]},
+}
 @article{Linda,
     keywords    = {Linda, concurrency},
 …
+}
+@inproceedings{Fang06,
+    author      = {Fang, Yi and McMillan, Kenneth L. and Pnueli, Amir and Zuck, Lenore D.},
+    editor      = {Najm, Elie and Pradat-Peyre, Jean-Fran{\c{c}}ois and Donzeau-Gouge, V{\'e}ronique Vigui{\'e}},
+    title       = {Liveness by Invisible Invariants},
+    booktitle   = {Formal Techniques for Networked and Distributed Systems - FORTE 2006},
+    year        = 2006,
+    publisher   = {Springer Berlin Heidelberg},
+    address     = {Berlin, Heidelberg},
+    pages       = {356--371},
+}
 @article{Pierce00,
     keywords    = {Scala},
+    keywords    = {Scala, polymorphism, subtyping, type inference},
     contributer = {a3moss@uwaterloo.ca},
     author      = {Pierce, Benjamin C. and Turner, David N.},
 …
     issn        = {0164-0925},
     pages       = {1--44},
-    numpages    = {44},
-    url         = {http://doi.acm.org/10.1145/345099.345100},
-    doi         = {10.1145/345099.345100},
-    acmid       = {345100},
     publisher   = {ACM},
     address     = {New York, NY, USA},
-    keywords    = {polymorphism, subtyping, type inference},
+}
+@article{Dice15,
+    keywords    = {Concurrency, NUMA, hierarchical locks, locks, multicore, mutex, mutual exclusion, spin locks},
+    author      = {Dice, David and Marathe, Virendra J. and Shavit, Nir},
+    title       = {Lock Cohorting: A General Technique for Designing NUMA Locks},
+    journal     = {ACM Trans. Parallel Comput.},
+    issue_date  = {January 2015},
+    volume      = 1,
+    number      = 2,
+    month       = feb,
+    year        = 2015,
+    pages       = {13:1--13:42},
+    publisher   = {ACM},
+    address     = {New York, NY, USA},
+}
 @article{Sundell08,
 …
     journal     = sigplan,
     year        = 1989,
+    month       = jun, volume = 24, number = 6, pages = {37-48},
+    month       = jun,
+    volume      = 24,
+    number      = 6,
+    pages       = {37-48},
     abstract    = {
         This paper describes a scheme we have used to manage a large
 …
     address     = {New York, NY, USA},
+}
 @techreport{Mesa,
     keywords    = {monitors, packages},
 …
     title       = {Mesa Language Manual},
     institution = {Xerox Palo Alto Research Center},
+    address     = {Palo Alto, California, U.S.A.},
     number      = {CSL--79--3},
     month       = apr,
 …
     contributer = {pabuhr@plg},
     author      = {Gregory R. Andrews},
     title       = {A Method for Solving Synronization Problems},
+    title       = {A Method for Solving Synchronization Problems},
     journal     = scp,
     volume      = 13,
 …
     title       = {Multiple Inheritance for {C}{\kern-.1em\hbox{\large\texttt{+\kern-.25em+}}}},
     booktitle   = {Proceedings of the Spring '87 EUUG Conference},
+    month       = may, year = 1987
+    month       = may,
+    year        = 1987,
+}
 …
     year        = 1986,
     pages       = {313--326},
-    numpages    = {14},
     publisher   = {ACM},
     address     = {New York, NY, USA},
 …
     year        = 1986,
     pages       = {327--348},
-    numpages    = {22},
     publisher   = {ACM},
     address     = {New York, NY, USA},
 …
     year        = 2005,
     pages       = {146-196},
-    numpages    = {51},
     publisher   = {ACM},
     address     = {New York, NY, USA},
 …
     year        = 2000,
     pages       = {29-46},
     note        = {OOPSLA'00, Oct. 15--19, 2000, Minneapolis, Minnesota, U.S.A.},
+    note        = {OOPSLA'00, Oct. 15--19, 2000, Minneapolis, Minn., U.S.A.},
+}
 …
     location    = {San Diego, California, USA},
     pages       = {101--112},
-    numpages    = {12},
-    url         = {http://doi.acm.org/10.1145/2535838.2535878},
-    doi         = {10.1145/2535838.2535878},
-    acmid       = {2535878},
     publisher   = {ACM},
     address     = {New York, NY, USA},
 …
     issn        = {0362-1340},
     pages       = {30--42},
-    numpages    = {13},
-    url         = {http://doi.acm.org/10.1145/947586.947589},
-    doi         = {10.1145/947586.947589},
     publisher   = {ACM},
     address     = {New York, NY, USA}
 …
+}
+@article{Bauer15,
+    keywords    = {resumption exceptions, theory},
+    contributer = {pabuhr@plg},
+    author      = {Andrej Bauer and Matija Pretnar},
+    title       = {Programming with Algebraic Effects and Handlers},
+    journal     = {Journal of Logical and Algebraic Methods in Programming},
+    publisher   = {Elsevier BV},
+    volume      = 84,
+    number      = 1,
+    month       = jan,
+    year        = 2015,
+    pages       = {108-123},
+}
 @book{Butenhof97,
     keywords    = {PThreads, concurrency},
 …
     title       = {{C}{\kern-.1em\hbox{\large\texttt{+\kern-.25em+}}} Programming Language ISO/IEC 14882:1998},
     edition     = {1st},
     publisher   = {International Standard Organization},
     address     = {\href{https://www.iso.org/standard/25845.html}{https://\-www.iso.org/\-standard/\-25845.html}},
+    organization  = {International Standard Organization},
+    address     = {Geneva, Switzerland},
     year        = 1998,
+    note        = {\href{https://www.iso.org/standard/25845.html}{https://\-www.iso.org/\-standard/\-25845.html}},
+}
 …
     title       = {{C}{\kern-.1em\hbox{\large\texttt{+\kern-.25em+}}} Programming Language ISO/IEC 14882:2014},
     edition     = {4th},
     publisher   = {International Standard Organization},
     address     = {\href{https://www.iso.org/standard/64029.html}{https://\-www.iso.org/\-standard/\-64029.html}},
+    organization= {International Standard Organization},
+    address     = {Geneva, Switzerland},
     year        = 2014,
+    note        = {\href{https://www.iso.org/standard/64029.html}{https://\-www.iso.org/\-standard/\-64029.html}},
+}
 …
     title       = {{C}{\kern-.1em\hbox{\large\texttt{+\kern-.25em+}}} Programming Language ISO/IEC 14882:2017},
     edition     = {5th},
     publisher   = {International Standard Organization},
     address     = {\href{https://www.iso.org/standard/68564.html}{https://\-www.iso.org/\-standard/\-68564.html}},
+    organization= {International Standard Organization},
+    address     = {Geneva, Switzerland},
     year        = 2017,
+    note        = {\href{https://www.iso.org/standard/68564.html}{https://\-www.iso.org/\-standard/\-68564.html}},
+}
 …
     title       = {The Programming Language Concurrent Pascal},
     journal     = ieeese,
+    volume      = 2,
+    volume      = {SE-1},
+    number      = 2,
     month       = jun,
     year        = 1975,
     pages       = {199-206}
+    pages       = {199-207}
+}
 …
     issn        = {0164-0925},
     pages       = {429-475},
-    url         = {http://doi.acm.org/10.1145/1133651.1133653},
-    doi         = {10.1145/1133651.1133653},
-    acmid       = {1133653},
     publisher   = {ACM},
     address     = {New York, NY, USA},
 …
+}
+@article{Aravind09,
+    author      = {Alex A. Aravind and Wim H. Hesselink},
+    title       = {A Queue Based Mutual Exclusion Algorithm},
+    journal     = acta,
+    volume      = 46,
+    pages       = {73--86},
+    year        = 2009,
+}
 % R
 …
     title       = {Programming languages -- {Ada} ISO/IEC 8652:2012},
     edition     = {3rd},
     publisher   = {International Standard Organization},
     address     = {\href{https://www.iso.org/standard/61507.html}{https://\-www.iso.org/\-standard/\-61507.html}},
+    organization= {International Standard Organization},
+    address     = {Geneva, Switzerland},
     year        = 2012,
+    note        = {\href{https://www.iso.org/standard/61507.html}{https://\-www.iso.org/\-standard/\-61507.html}},
+}
 …
     issn        = {0001-0782},
     pages       = {565--569},
-    numpages    = {5},
-    url         = {http://doi.acm.org/10.1145/359545.359566},
-    doi         = {10.1145/359545.359566},
-    acmid       = {359566},
     publisher   = {ACM},
     address     = {New York, NY, USA}
 …
     issn        = {0362-1340},
     pages       = {145--147},
-    numpages    = {3},
-    url         = {http://doi.acm.org/10.1145/122598.122614},
-    doi         = {10.1145/122598.122614},
-    acmid       = {122614},
     publisher   = {ACM},
     address     = {New York, NY, USA},
 …
     issn        = {0362-1340},
     pages       = {82--87},
-    numpages    = {6},
-    url         = {http://doi.acm.org/10.1145/947680.947688},
-    doi         = {10.1145/947680.947688},
     publisher   = {ACM},
     address     = {New York, NY, USA},
 …
+}
+@article{Cascaval08,
+    author      = {Cascaval, Calin and Blundell, Colin and Michael, Maged and Cain, Harold W. and Wu, Peng and Chiras, Stefanie and Chatterjee, Siddhartha},
+    title       = {Software Transactional Memory: Why Is It Only a Research Toy?},
+    journal     = {Queue},
+    volume      = {6},
+    number      = {5},
+    month       = sep,
+    year        = {2008},
+    pages       = {40:46--40:58},
+    publisher   = {ACM},
+    address     = {New York, NY, USA},
+}
 @article{Dijkstra65a,
     keywords    = {N-thread software-solution mutual exclusion},
 …
     year        = 1974,
     pages       = {261-301},
-    issn        = {0360-0300},
-    doi         = {http://doi.acm.org/10.1145/356635.356640},
     publisher   = {ACM},
     address     = {New York, NY, USA},
 …
     publisher   = {ACM Press},
     address     = {New York, NY, USA},
-    doi         = {http://doi.acm.org/10.1145/356586.356588},
+}
 …
     title       = {The Thoth System: Multi-Process Structuring and Portability},
     publisher   = {American Elsevier},
+    address     = {New York, New York, U.S.A.},
     year        = 1982
+}
 …
     howpublished= {\href{https://projects.eclipse.org/proposals/trace-compass}{https://\-projects.eclipse.org/\-proposals/\-trace-compass}},
+}
+@inproceedings{Boehm09,
+    author      = {Boehm, Hans-J.},
+    title       = {Transactional Memory Should Be an Implementation Technique, Not a Programming Interface},
+    booktitle   = {Proceedings of the First USENIX Conference on Hot Topics in Parallelism},
+    series      = {HotPar'09},
+    year        = {2009},
+    location    = {Berkeley, California},
+    publisher   = {USENIX Association},
+    address     = {Berkeley, CA, USA},
+}
 @article{Leroy00,
     keywords    = {type-systems, exceptions},
 …
     number      = {2},
     pages       = {204-214},
+    month       = apr, year = 1988,
+    month       = apr,
+    year        = 1988,
     comment     = {
         Extended record types add fields to their base record.  Assignment
 …
+}
+@article{Karsten20,
+    author      = {Karsten, Martin and Barghi, Saman},
+    title       = {{User-level Threading: Have Your Cake and Eat It Too}},
+    year        = {2020},
+    issue_date  = {March 2020},
+    publisher   = {Association for Computing Machinery},
+    address     = {New York, NY, USA},
+    volume      = {4},
+    number      = {1},
+    url         = {https://doi.org/10.1145/3379483},
+    doi         = {10.1145/3379483},
+    journal     = {Proc. ACM Meas. Anal. Comput. Syst.},
+    month       = mar,
+    numpages    = {30},
+}
 @techreport{Harmony,
     keywords    = {messages, concurrency},
 …
     contributer = {gjditchfield@plg},
     author      = {Henry Lieberman},
+    title       = {Using Prototypical Objects to Implement Shared Behavior in
+                  Object Oriented Systems},
+    title       = {Using Prototypical Objects to Implement Shared Behavior in Object Oriented Systems},
     journal     = sigplan,
+    month       = nov, year = 1986,
+    volume      = 21, number = 11, pages = {214-223}
+    month       = nov,
+    year        = 1986,
+    volume      = 21,
+    number      = 11,
+    pages       = {214-223}
+}
 …
     issn        = {0004-5411},
     pages       = {245--281},
-    numpages    = {37},
-    url         = {http://doi.acm.org/10.1145/62.2160},
-    doi         = {10.1145/62.2160},
-    acmid       = {2160},
     publisher   = {ACM},
     address     = {New York, NY, USA},
 …
     contributer = {pabuhr@plg},
     author      = {Boehm, Hans-J. and Adve, Sarita V.},
     title       = {You Don'T Know Jack About Shared Variables or Memory Models},
+    title       = {You Don't Know Jack About Shared Variables or Memory Models},
     journal     = cacm,
     volume      = 55,

doc/man/cfa.1

-              r3c64c668
+              r58fe85a
 .\" Created On       : Wed Jul 26 22:34:47 2017
 .\" Last Modified By : Peter A. Buhr
 .\" Last Modified On : Thu Jul 27 10:29:29 2017
 .\" Update Count     : 44
+.\" Last Modified On : Wed Sep  2 17:59:53 2020
+.\" Update Count     : 78
 .\"
 .\" nroff -man cfa.1
 …
 .ds Cf "Cforall
 .\"
 .TH cfa 1 2017-07-27 cfa-\*(Mg
+.TH CFA 1 "2020-09-02" cfa-\*(Mg "\*(Cf Project"
 .SH NAME
 cfa \- \*(Cf Translator and Runtime Library
+cfa \- \*(Cf project translator and runtime library to enhance C
 .SH SYNOPSIS
+cfa [gcc-options] [C/\*(Cf source-files] [assembler/loader files]
+cfa [cfa/gcc-options]
+    [cfa/c source-files]
+    [assembler/loader files]
 .SH DESCRIPTION
+\*(Cf (C-for-all) is an open-source project extending ISO C with modern safety and productivity features, while still ensuring backwards compatibility with C and its programmers.
 The cfa command compiles C and \*(Cf source files and links C/\*(Cf object
 files named on the command line.
 …
 The cfa command introduces a translator pass over the specified source files
 after the C preprocessor but before the C compilation.  The translator converts
+new \*(Cf constructs into C statements.  The cfa command also provides the
+runtime library, which is linked with each \*(Cf application.
+new \*(Cf constructs into C statements.  The cfa command also provides a fully
+concurrent (user-level threads) runtime library, which is linked with the
+\*(Cf application.
 The command line options depend on the particular C compiler used (gcc/clang
 supported).  As with most C compilers, the output is sent to the file a.out(5)
 unless the -o option is present on the command line.  See the reference pages
 for gcc(1) for more information.
+for gcc(1) for more information on command line options.
 .SH OPTIONS
 When multiple conflicting options appear on the command line, e.g.,
 …
 All of the options available to the gcc compiler are available to the cfa
 translator.  The following gcc flags are implicitly turned on:
+.IP -std=gnu99 3
+The 1999 C standard plus GNU extensions.
+.IP -fgnu89-inline
+Use the traditional GNU semantics for inline routines in C99 mode, which allows inline routines in header files.
+.IP "-std=gnu11" 3
+The 2011 C standard plus GNU extensions.
+.IP "-fgnu89-inline"
+Use the traditional GNU semantics for inline routines in C11 mode, which allows inline routines in header files.
+.IP "-imacros stdbool.h"
+Include stdbool.h to get defines for bool/true/false.
+.IP "-latomic -lm"
+Provide access to double-wide CAS instruction and math library.
 .LP
 The following additional options are available:
 .IP -CFA 3
+.IP "-CFA" 3
 Only the C preprocessor and the \*(Cf translator steps are performed and the transformed program is written to standard output, which makes it possible to examine the code generated by the \*(Cf translator.
 The generated code starts with the standard \*(Cf prelude.
 .IP -debug
+.IP "-debug"
 The program is linked with the debugging version of the runtime system.
 The debug version performs runtime checks to help during the debugging phase of a \*(Cf program, but can substantially slow program execution.
 The runtime checks should only be removed after the program is completely debugged.
 .B This option is the default.
 .IP -nodebug
+.IP "-nodebug"
 The program is linked with the non-debugging version of the runtime system, so the execution of the program is faster.
 .I However, no runtime checks or asserts are performed so errors usually result in abnormal program behaviour or termination.
 .IP -help
+.IP "-help"
 Information about the set of \*(Cf compilation flags is printed.
 .IP -nohelp
+.IP "-nohelp"
 Information about the set of \*(Cf compilation flags is not printed.
 .B This option is the default.
 .IP -quiet
+.IP "-quiet"
 The \*(Cf compilation message is not printed at the beginning of a compilation.
 .IP -noquiet
+.IP "-noquiet"
 The \*(Cf compilation message is printed at the beginning of a compilation.
 .B This option is the default.
 …
 available.  These variables allow conditional compilation of programs that must
 work differently in these situations.
 .IP __CFA_MAJOR__ 3
+.IP "__CFA_MAJOR__" 3
 is available during preprocessing and its value is the major version number of \*(Cf.
 .IP __CFA_MINOR__
+.IP "__CFA_MINOR__"
 is available during preprocessing and its value is the minor version number of \*(Cf.
 .IP __CFA_PATCH__
+.IP "__CFA_PATCH__"
 is available during preprocessing and its value is the patch level number of \*(Cf.
 .IP "__CFA__, __CFORALL__, and __cforall"
 are always available during preprocessing and have no value.
 .IP __CFA_DEBUG__
+.IP "__CFA_DEBUG__"
 is available during preprocessing if the -debug compilation option is
 specified.
 …
 .SH REFERENCES
 .HP 3
+\*(Cf Reference and Rationale Manual
+.I \*(Cf Home Page
 .br
 http://plg.uwaterloo.ca/~cforall/refrat.pdf
+https://cforall.uwaterloo.ca
 .HP
 .I \*(Cf User Manual
 .br
+http://plg.uwaterloo.ca/~cforall/user.pdf
+https://cforall.uwaterloo.ca/doc/user.pdf
+.SH BUILDS
+Nightly builds are available here https://cforall.uwaterloo.ca/jenkins
 .SH BUGS
 Bugs should be reported to trac@plg.cs.uwaterloo.ca.
+Bug reports are available here https://cforall.uwaterloo.ca/trac
 .SH COPYRIGHT
 \*(Cf is covered under the licence agreement in the distribution.
 .SH AUTHORS
 Andrew Beach, Richard Bilson, Peter A. Buhr, Thierry Delisle, Glen Ditchfield,
 Rodolfo G. Esteves, Aaron Moss, Rob Schluntz
+Rodolfo G. Esteves, Aaron Moss, Rob Schluntz, Mubeen Zulfiqar

doc/papers/AMA/AMA-stix/ama/WileyNJD-v2.cls

-              r3c64c668
+              r58fe85a
      \@afterheading}
 \renewcommand\section{\@startsection{section}{1}{\z@}{-25pt \@plus -2pt \@minus -2pt}{12\p@}{\sectionfont}}%
 \renewcommand\subsection{\@startsection{subsection}{2}{\z@}{-22pt \@plus -2pt \@minus -2pt}{5\p@}{\subsectionfont}}%
 \renewcommand\subsubsection{\@startsection{subsubsection}{3}{\z@}{-20pt \@plus -2pt \@minus -2pt}{2\p@}{\subsubsectionfont}}%
+\renewcommand\section{\@startsection{section}{1}{\z@}{-20pt \@plus -2pt \@minus -2pt}{7\p@}{\sectionfont}}%
+\renewcommand\subsection{\@startsection{subsection}{2}{\z@}{-18pt \@plus -2pt \@minus -2pt}{5\p@}{\subsectionfont}}%
+\renewcommand\subsubsection{\@startsection{subsubsection}{3}{\z@}{-16pt \@plus -2pt \@minus -2pt}{2\p@}{\subsubsectionfont}}%
+%
 \newskip\secruleskip\secruleskip8.5\p@%

doc/papers/concurrency/Paper.tex

-              r3c64c668
+              r58fe85a
 \newcommand{\CCseventeen}{\textrm{C}\kern-.1em\hbox{+\kern-.25em+}17\xspace} % C++17 symbolic name
 \newcommand{\CCtwenty}{\textrm{C}\kern-.1em\hbox{+\kern-.25em+}20\xspace} % C++20 symbolic name
 \newcommand{\Csharp}{C\raisebox{-0.7ex}{\Large$^\sharp$}\xspace} % C# symbolic name
+\newcommand{\Csharp}{C\raisebox{-0.7ex}{\large$^\sharp$}\xspace} % C# symbolic name
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 …
 \newcommand{\CRT}{\global\columnposn=\gcolumnposn}
 % Denote newterms in particular font and index them without particular font and in lowercase, e.g., \newterm{abc}.
 % The option parameter provides an index term different from the new term, e.g., \newterm[\texttt{abc}]{abc}
+% Denote newterms in particular font and index them without particular font and in lowercase, \eg \newterm{abc}.
+% The option parameter provides an index term different from the new term, \eg \newterm[\texttt{abc}]{abc}
 % The star version does not lowercase the index information, e.g., \newterm*{IBM}.
 \newcommand{\newtermFontInline}{\emph}
 …
 \newcommand{\abbrevFont}{\textit}                       % set empty for no italics
 \@ifundefined{eg}{
+\newcommand{\EG}{\abbrevFont{e}\abbrevFont{g}}
+%\newcommand{\EG}{\abbrevFont{e}\abbrevFont{g}}
+\newcommand{\EG}{for example}
 \newcommand*{\eg}{%
         \@ifnextchar{,}{\EG}%
 …
 }}{}%
 \@ifundefined{ie}{
+\newcommand{\IE}{\abbrevFont{i}\abbrevFont{e}}
+%\newcommand{\IE}{\abbrevFont{i}\abbrevFont{e}}
+\newcommand{\IE}{that is}
 \newcommand*{\ie}{%
         \@ifnextchar{,}{\IE}%
 …
 \newcommand*{\etc}{%
         \@ifnextchar{.}{\ETC}%
         {\ETC.\xspace}%
+                {\ETC.\xspace}%
 }}{}%
 \@ifundefined{etal}{
 \newcommand{\ETAL}{\abbrevFont{et}~\abbrevFont{al}}
 \newcommand*{\etal}{%
         \@ifnextchar{.}{\protect\ETAL}%
                 {\protect\ETAL.\xspace}%
+        \@ifnextchar{.}{\ETAL}%
+                {\ETAL.\xspace}%
 }}{}%
 \@ifundefined{viz}{
 …
                 __float80, float80, __float128, float128, forall, ftype, generator, _Generic, _Imaginary, __imag, __imag__,
                 inline, __inline, __inline__, __int128, int128, __label__, monitor, mutex, _Noreturn, one_t, or,
                 otype, restrict, __restrict, __restrict__, __signed, __signed__, _Static_assert, thread,
+                otype, restrict, resume, __restrict, __restrict__, __signed, __signed__, _Static_assert, suspend, thread,
                 _Thread_local, throw, throwResume, timeout, trait, try, ttype, typeof, __typeof, __typeof__,
                 virtual, __volatile, __volatile__, waitfor, when, with, zero_t},
         moredirectives={defined,include_next},
         % replace/adjust listing characters that look bad in sanserif
         literate={-}{\makebox[1ex][c]{\raisebox{0.4ex}{\rule{0.8ex}{0.1ex}}}}1 {^}{\raisebox{0.6ex}{$\scriptstyle\land\,$}}1
+        literate={-}{\makebox[1ex][c]{\raisebox{0.5ex}{\rule{0.8ex}{0.1ex}}}}1 {^}{\raisebox{0.6ex}{$\scriptstyle\land\,$}}1
                 {~}{\raisebox{0.3ex}{$\scriptstyle\sim\,$}}1 % {`}{\ttfamily\upshape\hspace*{-0.1ex}`}1
                 {<}{\textrm{\textless}}1 {>}{\textrm{\textgreater}}1
 …
                 _Else, _Enable, _Event, _Finally, _Monitor, _Mutex, _Nomutex, _PeriodicTask, _RealTimeTask,
                 _Resume, _Select, _SporadicTask, _Task, _Timeout, _When, _With, _Throw},
+}
-\lstdefinelanguage{Golang}{
-        morekeywords=[1]{package,import,func,type,struct,return,defer,panic,recover,select,var,const,iota,},
-        morekeywords=[2]{string,uint,uint8,uint16,uint32,uint64,int,int8,int16,int32,int64,
-                bool,float32,float64,complex64,complex128,byte,rune,uintptr, error,interface},
-        morekeywords=[3]{map,slice,make,new,nil,len,cap,copy,close,true,false,delete,append,real,imag,complex,chan,},
-        morekeywords=[4]{for,break,continue,range,goto,switch,case,fallthrough,if,else,default,},
-        morekeywords=[5]{Println,Printf,Error,},
-        sensitive=true,
-        morecomment=[l]{//},
-        morecomment=[s]{/*}{*/},
-        morestring=[b]',
-        morestring=[b]",
-        morestring=[s]{`}{`},
+}
 …
 {}
 \lstnewenvironment{C++}[1][]                            % use C++ style
 {\lstset{language=C++,moredelim=**[is][\protect\color{red}]{`}{`},#1}\lstset{#1}}
+{\lstset{language=C++,moredelim=**[is][\protect\color{red}]{`}{`}}\lstset{#1}}
 {}
 \lstnewenvironment{uC++}[1][]
 {\lstset{#1}}
+{\lstset{language=uC++,moredelim=**[is][\protect\color{red}]{`}{`}}\lstset{#1}}
 {}
 \lstnewenvironment{Go}[1][]
 {\lstset{language=Golang,moredelim=**[is][\protect\color{red}]{`}{`},#1}\lstset{#1}}
+{\lstset{language=Golang,moredelim=**[is][\protect\color{red}]{`}{`}}\lstset{#1}}
 {}
 \lstnewenvironment{python}[1][]
+{\lstset{language=python,moredelim=**[is][\protect\color{red}]{`}{`},#1}\lstset{#1}}
+{\lstset{language=python,moredelim=**[is][\protect\color{red}]{`}{`}}\lstset{#1}}
+{}
+\lstnewenvironment{java}[1][]
+{\lstset{language=java,moredelim=**[is][\protect\color{red}]{`}{`}}\lstset{#1}}
 {}
 …
+}
 \newbox\myboxA
 \newbox\myboxB
 \newbox\myboxC
 \newbox\myboxD
+\newsavebox{\myboxA}
+\newsavebox{\myboxB}
+\newsavebox{\myboxC}
+\newsavebox{\myboxD}
 \title{\texorpdfstring{Advanced Control-flow and Concurrency in \protect\CFA}{Advanced Control-flow in Cforall}}
 …
 \address[1]{\orgdiv{Cheriton School of Computer Science}, \orgname{University of Waterloo}, \orgaddress{\state{Waterloo, ON}, \country{Canada}}}
 \corres{*Peter A. Buhr, Cheriton School of Computer Science, University of Waterloo, 200 University Avenue West, Waterloo, ON, N2L 3G1, Canada. \email{pabuhr{\char`\@}uwaterloo.ca}}
+\corres{*Peter A. Buhr, Cheriton School of Computer Science, University of Waterloo, 200 University Avenue West, Waterloo, ON N2L 3G1, Canada. \email{pabuhr{\char`\@}uwaterloo.ca}}
 % \fundingInfo{Natural Sciences and Engineering Research Council of Canada}
 \abstract[Summary]{
 \CFA is a polymorphic, non-object-oriented, concurrent, backwards-compatible extension of the C programming language.
+\CFA is a polymorphic, nonobject-oriented, concurrent, backwards compatible extension of the C programming language.
 This paper discusses the design philosophy and implementation of its advanced control-flow and concurrent/parallel features, along with the supporting runtime written in \CFA.
 These features are created from scratch as ISO C has only low-level and/or unimplemented concurrency, so C programmers continue to rely on library features like pthreads.
+These features are created from scratch as ISO C has only low-level and/or unimplemented concurrency, so C programmers continue to rely on library approaches like pthreads.
 \CFA introduces modern language-level control-flow mechanisms, like generators, coroutines, user-level threading, and monitors for mutual exclusion and synchronization.
 % Library extension for executors, futures, and actors are built on these basic mechanisms.
 The runtime provides significant programmer simplification and safety by eliminating spurious wakeup and monitor barging.
 The runtime also ensures multiple monitors can be safely acquired \emph{simultaneously} (deadlock free), and this feature is fully integrated with all monitor synchronization mechanisms.
+The runtime also ensures multiple monitors can be safely acquired in a deadlock-free way, and this feature is fully integrated with all monitor synchronization mechanisms.
 All control-flow features integrate with the \CFA polymorphic type-system and exception handling, while respecting the expectations and style of C programmers.
 Experimental results show comparable performance of the new features with similar mechanisms in other concurrent programming languages.
 }%
 \keywords{generator, coroutine, concurrency, parallelism, thread, monitor, runtime, C, \CFA (Cforall)}
+\keywords{C \CFA (Cforall) coroutine concurrency generator monitor parallelism runtime thread}
 \begin{document}
 \linenumbers                                            % comment out to turn off line numbering
+%\linenumbers                           % comment out to turn off line numbering
 \maketitle
 …
 \section{Introduction}
+This paper discusses the design philosophy and implementation of advanced language-level control-flow and concurrent/parallel features in \CFA~\cite{Moss18,Cforall} and its runtime, which is written entirely in \CFA.
+\CFA is a modern, polymorphic, non-object-oriented\footnote{
+\CFA has features often associated with object-oriented programming languages, such as constructors, destructors, virtuals and simple inheritance.
+However, functions \emph{cannot} be nested in structures, so there is no lexical binding between a structure and set of functions (member/method) implemented by an implicit \lstinline@this@ (receiver) parameter.},
+backwards-compatible extension of the C programming language.
+In many ways, \CFA is to C as Scala~\cite{Scala} is to Java, providing a \emph{research vehicle} for new typing and control-flow capabilities on top of a highly popular programming language allowing immediate dissemination.
+Within the \CFA framework, new control-flow features are created from scratch because ISO \Celeven defines only a subset of the \CFA extensions, where the overlapping features are concurrency~\cite[\S~7.26]{C11}.
+However, \Celeven concurrency is largely wrappers for a subset of the pthreads library~\cite{Butenhof97,Pthreads}, and \Celeven and pthreads concurrency is simple, based on thread fork/join in a function and mutex/condition locks, which is low-level and error-prone;
+no high-level language concurrency features are defined.
+Interestingly, almost a decade after publication of the \Celeven standard, neither gcc-8, clang-9 nor msvc-19 (most recent versions) support the \Celeven include @threads.h@, indicating little interest in the C11 concurrency approach (possibly because of the effort to add concurrency to \CC).
+Finally, while the \Celeven standard does not state a threading model, the historical association with pthreads suggests implementations would adopt kernel-level threading (1:1)~\cite{ThreadModel}.
+\CFA~\cite{Moss18,Cforall} is a modern, polymorphic, nonobject-oriented\footnote{
+\CFA has object-oriented features, such as constructors, destructors, and simple trait/interface inheritance.
+% Go interfaces, Rust traits, Swift Protocols, Haskell Type Classes and Java Interfaces.
+% "Trait inheritance" works for me. "Interface inheritance" might also be a good choice, and distinguish clearly from implementation inheritance.
+% You'll want to be a little bit careful with terms like "structural" and "nominal" inheritance as well. CFA has structural inheritance (I think Go as well) -- it's inferred based on the structure of the code.
+% Java, Rust, and Haskell (not sure about Swift) have nominal inheritance, where there needs to be a specific statement that "this type inherits from this type".
+However, functions \emph{cannot} be nested in structures and there is no mechanism to designate a function parameter as a receiver, \lstinline@this@, parameter.},
+backward-compatible extension of the C programming language.
+In many ways, \CFA is to C as Scala~\cite{Scala} is to Java, providing a vehicle for new typing and control-flow capabilities on top of a highly popular programming language\footnote{
+The TIOBE index~\cite{TIOBE} for May 2020 ranks the top five \emph{popular} programming languages as C 17\%, Java 16\%, Python 9\%, \CC 6\%, and \Csharp 4\% = 52\%, and over the past 30 years, C has always ranked either first or second in popularity.}
+allowing immediate dissemination.
+This paper discusses the design philosophy and implementation of \CFA's advanced control-flow and concurrent/parallel features, along with the supporting runtime written in \CFA.
+% The call/return extensions retain state between callee and caller versus losing the callee's state on return;
+% the concurrency extensions allow high-level management of threads.
+The \CFA control-flow framework extends ISO \Celeven~\cite{C11} with new call/return and concurrent/parallel control-flow.
+Call/return control-flow with argument and parameter passing appeared in the first programming languages.
+Over the past 50 years, call/return has been augmented with features like static and dynamic call, exceptions (multilevel return) and generators/coroutines (see Section~\ref{s:StatefulFunction}).
+While \CFA has mechanisms for dynamic call (algebraic effects~\cite{Zhang19}) and exceptions\footnote{
+\CFA exception handling will be presented in a separate paper.
+The key feature that dovetails with this paper is nonlocal exceptions allowing exceptions to be raised across stacks, with synchronous exceptions raised among coroutines and asynchronous exceptions raised among threads, similar to that in \uC~\cite[\S~5]{uC++}.}
+, this work only discusses retaining state between calls via generators and coroutines.
+\newterm{Coroutining} was introduced by Conway~\cite{Conway63}, discussed by Knuth~\cite[\S~1.4.2]{Knuth73V1}, implemented in Simula67~\cite{Simula67}, formalized by Marlin~\cite{Marlin80}, and is now popular and appears in old and new programming languages: CLU~\cite{CLU}, \Csharp~\cite{Csharp}, Ruby~\cite{Ruby}, Python~\cite{Python}, JavaScript~\cite{JavaScript}, Lua~\cite{Lua}, \CCtwenty~\cite{C++20Coroutine19}.
+Coroutining is sequential execution requiring direct handoff among coroutines, \ie only the programmer is controlling execution order.
+If coroutines transfer to an internal event-engine for scheduling the next coroutines (as in async-await), the program transitions into the realm of concurrency~\cite[\S~3]{Buhr05a}.
+Coroutines are only a stepping stone toward concurrency where the commonality is that coroutines and threads retain state between calls.
+\Celeven and \CCeleven define concurrency~\cite[\S~7.26]{C11}, but it is largely wrappers for a subset of the pthreads library~\cite{Pthreads}.\footnote{Pthreads concurrency is based on simple thread fork and join in a function and mutex or condition locks, which is low-level and error-prone.}
+Interestingly, almost a decade after the \Celeven standard, the most recent versions of gcc, clang, and msvc do not support the \Celeven include @threads.h@, indicating no interest in the C11 concurrency approach (possibly because of the recent effort to add concurrency to \CC).
+While the \Celeven standard does not state a threading model, the historical association with pthreads suggests implementations would adopt kernel-level threading (1:1)~\cite{ThreadModel}, as for \CC.
 In contrast, there has been a renewed interest during the past decade in user-level (M:N, green) threading in old and new programming languages.
 As multi-core hardware became available in the 1980/90s, both user and kernel threading were examined.
+As multicore hardware became available in the 1980/1990s, both user and kernel threading were examined.
 Kernel threading was chosen, largely because of its simplicity and fit with the simpler operating systems and hardware architectures at the time, which gave it a performance advantage~\cite{Drepper03}.
 Libraries like pthreads were developed for C, and the Solaris operating-system switched from user (JDK 1.1~\cite{JDK1.1}) to kernel threads.
 As a result, languages like Java, Scala, Objective-C~\cite{obj-c-book}, \CCeleven~\cite{C11}, and C\#~\cite{Csharp} adopt the 1:1 kernel-threading model, with a variety of presentation mechanisms.
 From 2000 onwards, languages like Go~\cite{Go}, Erlang~\cite{Erlang}, Haskell~\cite{Haskell}, D~\cite{D}, and \uC~\cite{uC++,uC++book} have championed the M:N user-threading model, and many user-threading libraries have appeared~\cite{Qthreads,MPC,Marcel}, including putting green threads back into Java~\cite{Quasar}.
 The main argument for user-level threading is that it is lighter weight than kernel threading (locking and context switching do not cross the kernel boundary), so there is less restriction on programming styles that encourage large numbers of threads performing medium work units to facilitate load balancing by the runtime~\cite{Verch12}.
+As a result, many languages adopt the 1:1 kernel-threading model, like Java (Scala), Objective-C~\cite{obj-c-book}, \CCeleven~\cite{C11}, C\#~\cite{Csharp} and Rust~\cite{Rust}, with a variety of presentation mechanisms.
+From 2000 onward, several language implementations have championed the M:N user-threading model, like Go~\cite{Go}, Erlang~\cite{Erlang}, Haskell~\cite{Haskell}, D~\cite{D}, and \uC~\cite{uC++,uC++book}, including putting green threads back into Java~\cite{Quasar}, and many user-threading libraries have appeared~\cite{Qthreads,MPC,Marcel}.
+The main argument for user-level threading is that it is lighter weight than kernel threading because locking and context switching do not cross the kernel boundary, so there is less restriction on programming styles that encourage large numbers of threads performing medium-sized work to facilitate load balancing by the runtime~\cite{Verch12}.
 As well, user-threading facilitates a simpler concurrency approach using thread objects that leverage sequential patterns versus events with call-backs~\cite{Adya02,vonBehren03}.
+Finally, performant user-threading implementations (both time and space) meet or exceed direct kernel-threading implementations, while achieving the programming advantages of high concurrency levels and safety.
+A further effort over the past two decades is the development of language memory models to deal with the conflict between language features and compiler/hardware optimizations, \ie some language features are unsafe in the presence of aggressive sequential optimizations~\cite{Buhr95a,Boehm05}.
+The consequence is that a language must provide sufficient tools to program around safety issues, as inline and library code is all sequential to the compiler.
+One solution is low-level qualifiers and functions (\eg @volatile@ and atomics) allowing \emph{programmers} to explicitly write safe (race-free~\cite{Boehm12}) programs.
+A safer solution is high-level language constructs so the \emph{compiler} knows the optimization boundaries, and hence, provides implicit safety.
+This problem is best known with respect to concurrency, but applies to other complex control-flow, like exceptions\footnote{
+\CFA exception handling will be presented in a separate paper.
+The key feature that dovetails with this paper is nonlocal exceptions allowing exceptions to be raised across stacks, with synchronous exceptions raised among coroutines and asynchronous exceptions raised among threads, similar to that in \uC~\cite[\S~5]{uC++}
+} and coroutines.
+Finally, language solutions allow matching constructs with language paradigm, \ie imperative and functional languages often have different presentations of the same concept to fit their programming model.
+Finally, performant user-threading implementations, both in time and space, meet or exceed direct kernel-threading implementations, while achieving the programming advantages of high concurrency levels and safety.
+A further effort over the past two decades is the development of language memory models to deal with the conflict between language features and compiler/hardware optimizations, \eg some language features are unsafe in the presence of aggressive sequential optimizations~\cite{Buhr95a,Boehm05}.
+The consequence is that a language must provide sufficient tools to program around safety issues, as inline and library code is compiled as sequential without any explicit concurrent directive.
+One solution is low-level qualifiers and functions, \eg @volatile@ and atomics, allowing \emph{programmers} to explicitly write safe, race-free~\cite{Boehm12} programs.
+A safer solution is high-level language constructs so the \emph{compiler} knows the concurrency boundaries, \ie where mutual exclusion and synchronization are acquired and released, and provide implicit safety at and across these boundaries.
+While the optimization problem is best known with respect to concurrency, it applies to other complex control-flows like exceptions and coroutines.
+As well, language solutions allow matching the language paradigm with the approach, \eg matching the functional paradigm with data-flow programming or the imperative paradigm with thread programming.
 Finally, it is important for a language to provide safety over performance \emph{as the default}, allowing careful reduction of safety for performance when necessary.
+Two concurrency violations of this philosophy are \emph{spurious wakeup} (random wakeup~\cite[\S~8]{Buhr05a}) and \emph{barging}\footnote{
+The notion of competitive succession instead of direct handoff, \ie a lock owner releases the lock and an arriving thread acquires it ahead of preexisting waiter threads.
+} (signals-as-hints~\cite[\S~8]{Buhr05a}), where one is a consequence of the other, \ie once there is spurious wakeup, signals-as-hints follow.
+However, spurious wakeup is \emph{not} a foundational concurrency property~\cite[\S~8]{Buhr05a}, it is a performance design choice.
+Similarly, signals-as-hints are often a performance decision.
+We argue removing spurious wakeup and signals-as-hints makes concurrent programming significantly safer because it removes local non-determinism and matches with programmer expectation.
+(Author experience teaching concurrency is that students are highly confused by these semantics.)
+Clawing back performance, when local non-determinism is unimportant, should be an option not the default.
+\begin{comment}
+Most augmented traditional (Fortran 18~\cite{Fortran18}, Cobol 14~\cite{Cobol14}, Ada 12~\cite{Ada12}, Java 11~\cite{Java11}) and new languages (Go~\cite{Go}, Rust~\cite{Rust}, and D~\cite{D}), except \CC, diverge from C with different syntax and semantics, only interoperate indirectly with C, and are not systems languages, for those with managed memory.
+As a result, there is a significant learning curve to move to these languages, and C legacy-code must be rewritten.
+While \CC, like \CFA, takes an evolutionary approach to extend C, \CC's constantly growing complex and interdependent features-set (\eg objects, inheritance, templates, etc.) mean idiomatic \CC code is difficult to use from C, and C programmers must expend significant effort learning \CC.
+Hence, rewriting and retraining costs for these languages, even \CC, are prohibitive for companies with a large C software-base.
+\CFA with its orthogonal feature-set, its high-performance runtime, and direct access to all existing C libraries circumvents these problems.
+\end{comment}
+\CFA embraces user-level threading, language extensions for advanced control-flow, and safety as the default.
+We present comparative examples so the reader can judge if the \CFA control-flow extensions are better and safer than those in other concurrent, imperative programming languages, and perform experiments to show the \CFA runtime is competitive with other similar mechanisms.
+Two concurrency violations of this philosophy are \emph{spurious} or \emph{random wakeup}~\cite[\S~9]{Buhr05a}, and \emph{barging}\footnote{
+Barging is competitive succession instead of direct handoff, \ie after a lock is released both arriving and preexisting waiter threads compete to acquire the lock.
+Hence, an arriving thread can temporally \emph{barge} ahead of threads already waiting for an event, which can repeat indefinitely leading to starvation of waiter threads.
+} or signals-as-hints~\cite[\S~8]{Buhr05a}, where one is a consequence of the other, \ie once there is spurious wakeup, barging follows.
+(Author experience teaching concurrency is that students are confused by these semantics.)
+However, spurious wakeup is \emph{not} a foundational concurrency property~\cite[\S~9]{Buhr05a};
+it is a performance design choice.
+We argue removing spurious wakeup and signals-as-hints makes concurrent programming simpler and safer as there is less local nondeterminism to manage.
+If barging acquisition is allowed, its specialized performance advantage should be available as an option not the default.
+\CFA embraces language extensions for advanced control-flow, user-level threading, and safety as the default.
+We present comparative examples to support our argument that the \CFA control-flow extensions are as expressive and safe as those in other concurrent imperative programming languages, and perform experiments to show the \CFA runtime is competitive with other similar mechanisms.
 The main contributions of this work are:
 \begin{itemize}[topsep=3pt,itemsep=1pt]
+\begin{itemize}[topsep=3pt,itemsep=0pt]
 \item
+language-level generators, coroutines and user-level threading, which respect the expectations of C programmers.
+a set of fundamental execution properties that dictate which language-level control-flow features need to be supported,
 \item
+monitor synchronization without barging, and the ability to safely acquire multiple monitors \emph{simultaneously} (deadlock free), while seamlessly integrating these capabilities with all monitor synchronization mechanisms.
+integration of these language-level control-flow features, while respecting the style and expectations of C programmers,
 \item
+providing statically type-safe interfaces that integrate with the \CFA polymorphic type-system and other language features.
+monitor synchronization without barging, and the ability to safely acquire multiple monitors in a deadlock-free way, while seamlessly integrating these capabilities with all monitor synchronization mechanisms,
+\item
+providing statically type-safe interfaces that integrate with the \CFA polymorphic type-system and other language features,
 % \item
 % library extensions for executors, futures, and actors built on the basic mechanisms.
 \item
+a runtime system with no spurious wakeup.
+a runtime system without spurious wake-up and no performance loss,
 \item
+a dynamic partitioning mechanism to segregate the execution environment for specialized requirements.
+a dynamic partitioning mechanism to segregate groups of executing user and kernel threads performing specialized work, \eg web-server or compute engine, or requiring different scheduling, \eg NUMA or real-time.
 % \item
+% a non-blocking I/O library
+% a nonblocking I/O library
 \item
 experimental results showing comparable performance of the new features with similar mechanisms in other programming languages.
+experimental results showing comparable performance of the \CFA features with similar mechanisms in other languages.
 \end{itemize}
+Section~\ref{s:StatefulFunction} begins advanced control by introducing sequential functions that retain data and execution state between calls, which produces constructs @generator@ and @coroutine@.
+Section~\ref{s:Concurrency} begins concurrency, or how to create (fork) and destroy (join) a thread, which produces the @thread@ construct.
+Section~\ref{s:MutualExclusionSynchronization} discusses the two mechanisms to restrict nondeterminism when controlling shared access to resources (mutual exclusion) and timing relationships among threads (synchronization).
+Section~\ref{s:FundamentalExecutionProperties} presents the compositional hierarchy of execution properties directing the design of control-flow features in \CFA.
+Section~\ref{s:StatefulFunction} begins advanced control by introducing sequential functions that retain data and execution state between calls producing constructs @generator@ and @coroutine@.
+Section~\ref{s:Concurrency} begins concurrency, or how to create (fork) and destroy (join) a thread producing the @thread@ construct.
+Section~\ref{s:MutualExclusionSynchronization} discusses the two mechanisms to restrict nondeterminism when controlling shared access to resources, called mutual exclusion, and timing relationships among threads, called synchronization.
 Section~\ref{s:Monitor} shows how both mutual exclusion and synchronization are safely embedded in the @monitor@ and @thread@ constructs.
+Section~\ref{s:CFARuntimeStructure} describes the large-scale mechanism to structure (cluster) threads and virtual processors (kernel threads).
+Section~\ref{s:Performance} uses a series of microbenchmarks to compare \CFA threading with pthreads, Java OpenJDK-9, Go 1.12.6 and \uC 7.0.0.
+Section~\ref{s:CFARuntimeStructure} describes the large-scale mechanism to structure threads and virtual processors (kernel threads).
+Section~\ref{s:Performance} uses microbenchmarks to compare \CFA threading with pthreads, Java 11.0.6, Go 1.12.6, Rust 1.37.0, Python 3.7.6, Node.js v12.18.0, and \uC 7.0.0.
+\section{Fundamental Execution Properties}
+\label{s:FundamentalExecutionProperties}
+The features in a programming language should be composed of a set of fundamental properties rather than an ad hoc collection chosen by the designers.
+To this end, the control-flow features created for \CFA are based on the fundamental properties of any language with function-stack control-flow (see also \uC~\cite[pp.~140--142]{uC++}).
+The fundamental properties are execution state, thread, and mutual-exclusion/synchronization.
+These independent properties can be used to compose different language features, forming a compositional hierarchy, where the combination of all three is the most advanced feature, called a thread.
+While it is possible for a language to only provide threads for composing programs~\cite{Hermes90}, this unnecessarily complicates and makes inefficient solutions to certain classes of problems.
+As is shown, each of the non-rejected composed language features solves a particular set of problems, and hence, has a defensible position in a programming language.
+If a compositional feature is missing, a programmer has too few fundamental properties resulting in a complex and/or inefficient solution.
+In detail, the fundamental properties are:
+\begin{description}[leftmargin=\parindent,topsep=3pt,parsep=0pt]
+\item[\newterm{execution state}:]
+It is the state information needed by a control-flow feature to initialize and manage both compute data and execution location(s), and de-initialize.
+For example, calling a function initializes a stack frame including contained objects with constructors, manages local data in blocks and return locations during calls, and de-initializes the frame by running any object destructors and management operations.
+State is retained in fixed-sized aggregate structures (objects) and dynamic-sized stack(s), often allocated in the heap(s) managed by the runtime system.
+The lifetime of state varies with the control-flow feature, where longer life-time and dynamic size provide greater power but also increase usage complexity and cost.
+Control-flow transfers among execution states in multiple ways, such as function call, context switch, asynchronous await, etc.
+Because the programming language determines what constitutes an execution state, implicitly manages this state, and defines movement mechanisms among states, execution state is an elementary property of the semantics of a programming language.
+% An execution-state is related to the notion of a process continuation \cite{Hieb90}.
+\item[\newterm{threading}:]
+It is execution of code that occurs independently of other execution, where an individual thread's execution is sequential.
+Multiple threads provide \emph{concurrent execution};
+concurrent execution becomes parallel when run on multiple processing units, \eg hyper-threading, cores, or sockets.
+A programmer needs mechanisms to create, block and unblock, and join with a thread, even if these basic mechanisms are supplied indirectly through high-level features.
+\item[\newterm{mutual-exclusion / synchronization (MES)}:]
+It is the concurrency mechanism to perform an action without interruption and establish timing relationships among multiple threads.
+We contend these two properties are independent, \ie mutual exclusion cannot provide synchronization and vice versa without introducing additional threads~\cite[\S~4]{Buhr05a}.
+Limiting MES functionality results in contrived solutions and inefficiency on multicore von Neumann computers where shared memory is a foundational aspect of its design.
+\end{description}
+These properties are fundamental as they cannot be built from existing language features, \eg a basic programming language like C99~\cite{C99} cannot create new control-flow features, concurrency, or provide MES without (atomic) hardware mechanisms.
+\subsection{Structuring execution properties}
+Programming languages seldom present the fundamental execution properties directly to programmers.
+Instead, the properties are packaged into higher-level constructs that encapsulate details and provide safety to these low-level mechanisms.
+Interestingly, language designers often pick and choose among these execution properties providing a varying subset of constructs.
+Table~\ref{t:ExecutionPropertyComposition} shows all combinations of the three fundamental execution properties available to language designers.
+(When doing combination case-analysis, not all combinations are meaningful.)
+The combinations of state, thread, and MES compose a hierarchy of control-flow features all of which have appeared in prior programming languages, where each of these languages has found the feature useful.
+To understand the table, it is important to review the basic von Neumann execution requirement of at least one thread and execution state providing some form of call stack.
+For table entries missing these minimal components, the property is borrowed from the invoker (caller).
+Each entry in the table, numbered \textbf{1}--\textbf{12}, is discussed with respect to how the execution properties combine to generate a high-level language construct.
+\begin{table}
+\caption{Execution property composition}
+\centering
+\label{t:ExecutionPropertyComposition}
+\renewcommand{\arraystretch}{1.25}
+%\setlength{\tabcolsep}{5pt}
+\vspace*{-5pt}
+\begin{tabular}{c|c||l|l}
+\multicolumn{2}{c||}{Execution properties} & \multicolumn{2}{c}{Mutual exclusion / synchronization} \\
+\hline
+stateful                        & thread        & \multicolumn{1}{c|}{No} & \multicolumn{1}{c}{Yes} \\
+\hline
+\hline
+No                                      & No            & \textbf{1}\ \ \ @struct@                              & \textbf{2}\ \ \ @mutex@ @struct@              \\
+\hline
+Yes (stackless)         & No            & \textbf{3}\ \ \ @generator@                   & \textbf{4}\ \ \ @mutex@ @generator@   \\
+\hline
+Yes (stackful)          & No            & \textbf{5}\ \ \ @coroutine@                   & \textbf{6}\ \ \ @mutex@ @coroutine@   \\
+\hline
+No                                      & Yes           & \textbf{7}\ \ \ {\color{red}rejected} & \textbf{8}\ \ \ {\color{red}rejected} \\
+\hline
+Yes (stackless)         & Yes           & \textbf{9}\ \ \ {\color{red}rejected} & \textbf{10}\ \ \ {\color{red}rejected} \\
+\hline
+Yes (stackful)          & Yes           & \textbf{11}\ \ \ @thread@                             & \textbf{12}\ \ @mutex@ @thread@               \\
+\end{tabular}
+\vspace*{-8pt}
+\end{table}
+Case 1 is a structure where access functions borrow local state (stack frame/activation) and thread from the invoker and retain this state across \emph{callees}, \ie function local-variables are retained on the borrowed stack during calls.
+Structures are a foundational mechanism for data organization, and access functions provide interface abstraction and code sharing in all programming languages.
+Case 2 is case 1 with thread safety to a structure's state where access functions provide serialization (mutual exclusion) and scheduling among calling threads (synchronization).
+A @mutex@ structure, often called a \newterm{monitor}, provides a high-level interface for race-free access of shared data in concurrent programming languages.
+Case 3 is case 1 where the structure can implicitly retain execution state and access functions use this execution state to resume/suspend across \emph{callers}, but resume/suspend does not retain a function's local state.
+A stackless structure, often called a \newterm{generator} or \emph{iterator}, is \newterm{stackless} because it still borrows the caller's stack and thread, but the stack is used only to preserve state across its callees not callers.
+Generators provide the first step toward directly solving problems like finite-state machines (FSMs) that retain data and execution state between calls, whereas normal functions restart on each call.
+Case 4 is cases 2 and 3 with thread safety during execution of the generator's access functions.
+A @mutex@ generator extends generators into the concurrent domain.
+Cases 5 and 6 are like cases 3 and 4 where the structure is extended with an implicit separate stack, so only the thread is borrowed by access functions.
+A stackful generator, often called a \newterm{coroutine}, is \newterm{stackful} because resume/suspend now context switch to/from the caller's and coroutine's stack.
+A coroutine extends the state retained between calls beyond the generator's structure to arbitrary call depth in the access functions.
+Cases 7, 8, 9 and 10 are rejected because a new thread must have its own stack, where the thread begins and stack frames are stored for calls, \ie it is unrealistic for a thread to borrow a stack.
+For cases 9 and 10, the stackless frame is not growable, precluding accepting nested calls, making calls, blocking as it requires calls, or preemption as it requires pushing an interrupt frame, all of which compound to require an unknown amount of execution state.
+Hence, if this kind of uninterruptible thread exists, it must execute to completion, \ie computation only, which severely restricts runtime management.
+Cases 11 and 12 are a stackful thread with and without safe access to shared state.
+A thread is the language mechanism to start another thread of control in a program with growable execution state for call/return execution.
+In general, language constructs with more execution properties increase the cost of creation and execution along with complexity of usage.
+Given the execution-properties taxonomy, programmers now ask three basic questions: is state necessary across callers and how much, is a separate thread necessary, is thread safety necessary.
+Table~\ref{t:ExecutionPropertyComposition} then suggests the optimal language feature needed for implementing a programming problem.
+The following sections describe how \CFA fills in \emph{all} the nonrejected table entries with language features, while other programming languages may only provide a subset of the table.
+\subsection{Design requirements}
+The following design requirements largely stem from building \CFA on top of C.
+\begin{itemize}[topsep=3pt,parsep=0pt]
+\item
+All communication must be statically type checkable for early detection of errors and efficient code generation.
+This requirement is consistent with the fact that C is a statically typed programming language.
+\item
+Direct interaction among language features must be possible allowing any feature to be selected without restricting comm\-unication.
+For example, many concurrent languages do not provide direct communication calls among threads, \ie threads only communicate indirectly through monitors, channels, messages, and/or futures.
+Indirect communication increases the number of objects, consuming more resources, and requires additional synchronization and possibly data transfer.
+\item
+All communication is performed using function calls, \ie data are transmitted from argument to parameter and results are returned from function calls.
+Alternative forms of communication, such as call-backs, message passing, channels, or communication ports, step outside of C's normal form of communication.
+\item
+All stateful features must follow the same declaration scopes and lifetimes as other language data.
+For C that means at program startup, during block and function activation, and on demand using dynamic allocation.
+\item
+MES must be available implicitly in language constructs, \eg Java built-in monitors, as well as explicitly for specialized requirements, \eg @java.util.concurrent@, because requiring programmers to build MES using low-level locks often leads to incorrect programs.
+Furthermore, reducing synchronization scope by encapsulating it within language constructs further reduces errors in concurrent programs.
+\item
+Both synchronous and asynchronous communication are needed.
+However, we believe the best way to provide asynchrony, such as call-buffering/chaining and/or returning futures~\cite{multilisp}, is building it from expressive synchronous features.
+\item
+Synchronization must be able to control the service order of requests including prioritizing selection from different kinds of outstanding requests, and postponing a request for an unspecified time while continuing to accept new requests.
+Otherwise, certain concurrency problems are difficult, \eg web server, disk scheduling, and the amount of concurrency is inhibited~\cite{Gentleman81}.
+\end{itemize}
+We have satisfied these requirements in \CFA while maintaining backwards compatibility with the huge body of legacy C programs.
+% In contrast, other new programming languages must still access C programs (\eg operating-system service routines), but do so through fragile C interfaces.
+\subsection{Asynchronous await / call}
+Asynchronous await/call is a caller mechanism for structuring programs and/or increasing concurrency, where the caller (client) postpones an action into the future, which is subsequently executed by a callee (server).
+The caller detects the action's completion through a \newterm{future} or \newterm{promise}.
+The benefit is asynchronous caller execution with respect to the callee until future resolution.
+For single-threaded languages like JavaScript, an asynchronous call passes a callee action, which is queued in the event-engine, and continues execution with a promise.
+When the caller needs the promise to be fulfilled, it executes @await@.
+A promise-completion call-back can be part of the callee action or the caller is rescheduled;
+in either case, the call-back is executed after the promise is fulfilled.
+While asynchronous calls generate new callee (server) events, we contend this mechanism is insufficient for advanced control-flow mechanisms like generators or coroutines, which are discussed next.
+Specifically, control between caller and callee occurs indirectly through the event-engine precluding direct handoff and cycling among events, and requires complex resolution of a control promise and data.
+Note, @async-await@ is just syntactic-sugar over the event engine so it does not solve these deficiencies.
+For multithreaded languages like Java, the asynchronous call queues a callee action with an executor (server), which subsequently executes the work by a thread in the executor thread-pool.
+The problem is when concurrent work-units need to interact and/or block as this affects the executor by stopping threads.
+While it is possible to extend this approach to support the necessary mechanisms, \eg message passing in Actors, we show monitors and threads provide an equally competitive approach that does not deviate from normal call communication and can be used to build asynchronous call, as is done in Java.
 …
 \label{s:StatefulFunction}
+The stateful function is an old idea~\cite{Conway63,Marlin80} that is new again~\cite{C++20Coroutine19}, where execution is temporarily suspended and later resumed, \eg plugin, device driver, finite-state machine.
+Hence, a stateful function may not end when it returns to its caller, allowing it to be restarted with the data and execution location present at the point of suspension.
+This capability is accomplished by retaining a data/execution \emph{closure} between invocations.
+If the closure is fixed size, we call it a \emph{generator} (or \emph{stackless}), and its control flow is restricted, \eg suspending outside the generator is prohibited.
+If the closure is variable size, we call it a \emph{coroutine} (or \emph{stackful}), and as the name implies, it is often implemented with a separate stack with no programming restrictions.
+Hence, refactoring a stackless coroutine may require changing it to stackful.
+A foundational property of all \emph{stateful functions} is that resume/suspend \emph{do not} cause incremental stack growth, \ie resume/suspend operations are remembered through the closure not the stack.
+As well, activating a stateful function is \emph{asymmetric} or \emph{symmetric}, identified by resume/suspend (no cycles) and resume/resume (cycles).
+A fixed closure activated by modified call/return is faster than a variable closure activated by context switching.
+Additionally, any storage management for the closure (especially in unmanaged languages, \ie no garbage collection) must also be factored into design and performance.
+Therefore, selecting between stackless and stackful semantics is a tradeoff between programming requirements and performance, where stackless is faster and stackful is more general.
+Note, creation cost is amortized across usage, so activation cost is usually the dominant factor.
+A \emph{stateful function} has the ability to remember state between calls, where state can be either data or execution, \eg plugin, device driver, FSM.
+A simple technique to retain data state between calls is @static@ declarations within a function, which is often implemented by hoisting the declarations to the global scope but hiding the names within the function using name mangling.
+However, each call starts the function at the top making it difficult to determine the last point of execution in an algorithm, and requiring multiple flag variables and testing to reestablish the continuation point.
+Hence, the next step of generalizing function state is implicitly remembering the return point between calls and reentering the function at this point rather than the top, called \emph{generators}\,/\,\emph{iterators} or \emph{stackless coroutines}.
+For example, a Fibonacci generator retains data and execution state allowing it to remember prior values needed to generate the next value and the location in the algorithm to compute that value.
+The next step of generalization is instantiating the function to allow multiple named instances, \eg multiple Fibonacci generators, where each instance has its own state, and hence, can generate an independent sequence of values.
+Note, a subset of generator state is a function \emph{closure}, \ie the technique of capturing lexical references when returning a nested function.
+A further generalization is adding a stack to a generator's state, called a \emph{coroutine}, so it can suspend outside of itself, \eg call helper functions to arbitrary depth before suspending back to its resumer without unwinding these calls.
+For example, a coroutine iterator for a binary tree can stop the traversal at the visit point (pre, infix, post traversal), return the node value to the caller, and then continue the recursive traversal from the current node on the next call.
+There are two styles of activating a stateful function, \emph{asymmetric} or \emph{symmetric}, identified by resume/suspend (no cycles) and resume/resume (cycles).
+These styles \emph{do not} cause incremental stack growth, \eg a million resume/suspend or resume/resume cycles do not remember each cycle, just the last resumer for each cycle.
+Selecting between stackless/stackful semantics and asymmetric/symmetric style is a tradeoff between programming requirements, performance, and design, where stackless is faster and smaller using modified call/return between closures, stackful is more general but slower and larger using context switching between distinct stacks, and asymmetric is simpler control-flow than symmetric.
+Additionally, storage management for the closure/stack must be factored into design and performance, especially in unmanaged languages without garbage collection.
+Note, creation cost (closure/stack) is amortized across usage, so activation cost (resume/suspend) is usually the dominant factor.
+% The stateful function is an old idea~\cite{Conway63,Marlin80} that is new again~\cite{C++20Coroutine19}, where execution is temporarily suspended and later resumed, \eg plugin, device driver, finite-state machine.
+% Hence, a stateful function may not end when it returns to its caller, allowing it to be restarted with the data and execution location present at the point of suspension.
+% If the closure is fixed size, we call it a \emph{generator} (or \emph{stackless}), and its control flow is restricted, \eg suspending outside the generator is prohibited.
+% If the closure is variable size, we call it a \emph{coroutine} (or \emph{stackful}), and as the names implies, often implemented with a separate stack with no programming restrictions.
+% Hence, refactoring a stackless coroutine may require changing it to stackful.
+% A foundational property of all \emph{stateful functions} is that resume/suspend \emph{do not} cause incremental stack growth, \ie resume/suspend operations are remembered through the closure not the stack.
+% As well, activating a stateful function is \emph{asymmetric} or \emph{symmetric}, identified by resume/suspend (no cycles) and resume/resume (cycles).
+% A fixed closure activated by modified call/return is faster than a variable closure activated by context switching.
+% Additionally, any storage management for the closure (especially in unmanaged languages, \ie no garbage collection) must also be factored into design and performance.
+% Therefore, selecting between stackless and stackful semantics is a tradeoff between programming requirements and performance, where stackless is faster and stackful is more general.
+% Note, creation cost is amortized across usage, so activation cost is usually the dominant factor.
+For example, Python presents asymmetric generators as a function object, \uC presents symmetric coroutines as a \lstinline[language=C++]|class|-like object, and many languages present threading using function pointers, @pthreads@~\cite{Butenhof97}, \Csharp~\cite{Csharp}, Go~\cite{Go}, and Scala~\cite{Scala}.
+\begin{center}
+\begin{tabular}{@{}l|l|l@{}}
+\multicolumn{1}{@{}c|}{Python asymmetric generator} & \multicolumn{1}{c|}{\uC symmetric coroutine} & \multicolumn{1}{c@{}}{Pthreads thread} \\
+\hline
+\begin{python}
+`def Gen():` $\LstCommentStyle{\color{red}// function}$
+        ... yield val ...
+gen = Gen()
+for i in range( 10 ):
+        print( next( gen ) )
+\end{python}
+&
+\begin{uC++}
+`_Coroutine Cycle {` $\LstCommentStyle{\color{red}// class}$
+        Cycle * p;
+        void main() { p->cycle(); }
+        void cycle() { resume(); }  `};`
+Cycle c1, c2; c1.p=&c2; c2.p=&c1; c1.cycle();
+\end{uC++}
+&
+\begin{cfa}
+void * `rtn`( void * arg ) { ... }
+int i = 3, rc;
+pthread_t t; $\C{// thread id}$
+$\LstCommentStyle{\color{red}// function pointer}$
+rc=pthread_create(&t, `rtn`, (void *)i);
+\end{cfa}
+\end{tabular}
+\end{center}
+\CFA's preferred presentation model for generators/coroutines/threads is a hybrid of functions and classes, giving an object-oriented flavor.
+Essentially, the generator/coroutine/thread function is semantically coupled with a generator/coroutine/thread custom type via the type's name.
+The custom type solves several issues, while accessing the underlying mechanisms used by the custom types is still allowed for flexibility reasons.
+Each custom type is discussed in detail in the following sections.
+\subsection{Generator}
+Stackless generators (Table~\ref{t:ExecutionPropertyComposition} case 3) have the potential to be very small and fast, \ie as small and fast as function call/return for both creation and execution.
+The \CFA goal is to achieve this performance target, possibly at the cost of some semantic complexity.
+A series of different kinds of generators and their implementation demonstrate how this goal is accomplished.\footnote{
+The \CFA operator syntax uses \lstinline|?| to denote operands, which allows precise definitions for pre, post, and infix operators, \eg \lstinline|?++|, \lstinline|++?|, and \lstinline|?+?|, in addition \lstinline|?\{\}| denotes a constructor, as in \lstinline|foo `f` = `\{`...`\}`|, \lstinline|^?\{\}| denotes a destructor, and \lstinline|?()| is \CC function call \lstinline|operator()|.
+Operator \lstinline+|+ is overloaded for printing, like bit-shift \lstinline|<<| in \CC.
+The \CFA \lstinline|with| clause opens an aggregate scope making its fields directly accessible, like Pascal \lstinline|with|, but using parallel semantics;
+multiple aggregates may be opened.
+\CFA has rebindable references \lstinline|int i, & ip = i, j; `&ip = &j;`| and nonrebindable references \lstinline|int i, & `const` ip = i, j; `&ip = &j;` // disallowed|.
+}%
 \begin{figure}
 …
         int fn = f->fn; f->fn = f->fn1;
                 f->fn1 = f->fn + fn;
         return fn;
+}
 int main() {
 …
 void `main(Fib & fib)` with(fib) {
         [fn1, fn] = [1, 0];
         for () {
 …
 \begin{cfa}[aboveskip=0pt,belowskip=0pt]
 typedef struct {
         int fn1, fn;  void * `next`;
+        int `restart`, fn1, fn;
 } Fib;
 #define FibCtor { 1, 0, NULL }
+#define FibCtor { `0`, 1, 0 }
 Fib * comain( Fib * f ) {
+        if ( f->next ) goto *f->next;
+        f->next = &&s1;
+        `static void * states[] = {&&s0, &&s1};`
+        `goto *states[f->restart];`
+  s0: f->`restart` = 1;
         for ( ;; ) {
                 return f;
           s1:; int fn = f->fn + f->fn1;
                         f->fn1 = f->fn; f->fn = fn;
+                f->fn1 = f->fn; f->fn = fn;
+        }
+}
 …
 \end{lrbox}
 \subfloat[C asymmetric generator]{\label{f:CFibonacci}\usebox\myboxA}
+\subfloat[C]{\label{f:CFibonacci}\usebox\myboxA}
 \hspace{3pt}
 \vrule
 \hspace{3pt}
 \subfloat[\CFA asymmetric generator]{\label{f:CFAFibonacciGen}\usebox\myboxB}
+\subfloat[\CFA]{\label{f:CFAFibonacciGen}\usebox\myboxB}
 \hspace{3pt}
 \vrule
 \hspace{3pt}
 \subfloat[C generator implementation]{\label{f:CFibonacciSim}\usebox\myboxC}
 \caption{Fibonacci (output) asymmetric generator}
+\subfloat[C generated code for \CFA version]{\label{f:CFibonacciSim}\usebox\myboxC}
+\caption{Fibonacci output asymmetric generator}
 \label{f:FibonacciAsymmetricGenerator}
 …
 };
 void ?{}( Fmt & fmt ) { `resume(fmt);` } // constructor
 void ^?{}( Fmt & f ) with(f) { $\C[1.75in]{// destructor}$
+void ^?{}( Fmt & f ) with(f) { $\C[2.25in]{// destructor}$
         if ( g != 0 || b != 0 ) sout | nl; }
 void `main( Fmt & f )` with(f) {
 …
                 for ( ; g < 5; g += 1 ) { $\C{// groups}$
                         for ( ; b < 4; b += 1 ) { $\C{// blocks}$
                                 `suspend;` $\C{// wait for character}$
                                 while ( ch == '\n' ) `suspend;` // ignore
                                 sout | ch;                                              // newline
                         } sout | " ";  // block spacer
                 } sout | nl; // group newline
+                                do { `suspend;` $\C{// wait for character}$
+                                while ( ch == '\n' ); // ignore newline
+                                sout | ch;                      $\C{// print character}$
+                        } sout | " ";  $\C{// block separator}$
+                } sout | nl; $\C{// group separator}$
+        }
+}
 …
 \begin{cfa}[aboveskip=0pt,belowskip=0pt]
 typedef struct {
         void * next;
+        int `restart`, g, b;
         char ch;
-        int g, b;
 } Fmt;
 void comain( Fmt * f ) {
+        if ( f->next ) goto *f->next;
+        f->next = &&s1;
+        `static void * states[] = {&&s0, &&s1};`
+        `goto *states[f->restart];`
+  s0: f->`restart` = 1;
         for ( ;; ) {
                 for ( f->g = 0; f->g < 5; f->g += 1 ) {
                         for ( f->b = 0; f->b < 4; f->b += 1 ) {
                                 return;
                           s1:;  while ( f->ch == '\n' ) return;
+                                do { return;  s1: ;
+                                } while ( f->ch == '\n' );
                                 printf( "%c", f->ch );
                         } printf( " " );
 …
+}
 int main() {
         Fmt fmt = { NULL };  comain( &fmt ); // prime
+        Fmt fmt = { `0` };  comain( &fmt ); // prime
         for ( ;; ) {
                 scanf( "%c", &fmt.ch );
 …
 \end{lrbox}
 \subfloat[\CFA asymmetric generator]{\label{f:CFAFormatGen}\usebox\myboxA}
 \hspace{3pt}
+\subfloat[\CFA]{\label{f:CFAFormatGen}\usebox\myboxA}
+\hspace{35pt}
 \vrule
 \hspace{3pt}
 \subfloat[C generator simulation]{\label{f:CFormatSim}\usebox\myboxB}
+\subfloat[C generated code for \CFA version]{\label{f:CFormatGenImpl}\usebox\myboxB}
 \hspace{3pt}
 \caption{Formatter (input) asymmetric generator}
+\caption{Formatter input asymmetric generator}
 \label{f:FormatterAsymmetricGenerator}
 \end{figure}
+Stateful functions appear as generators, coroutines, and threads, where presentations are based on function objects or pointers~\cite{Butenhof97, C++14, MS:VisualC++, BoostCoroutines15}.
+For example, Python presents generators as a function object:
+\begin{python}
+def Gen():
+        ... `yield val` ...
+gen = Gen()
+for i in range( 10 ):
+        print( next( gen ) )
+\end{python}
+Boost presents coroutines in terms of four functor object-types:
+\begin{cfa}
+asymmetric_coroutine<>::pull_type
+asymmetric_coroutine<>::push_type
+symmetric_coroutine<>::call_type
+symmetric_coroutine<>::yield_type
+\end{cfa}
+and many languages present threading using function pointers, @pthreads@~\cite{Butenhof97}, \Csharp~\cite{Csharp}, Go~\cite{Go}, and Scala~\cite{Scala}, \eg pthreads:
+\begin{cfa}
+void * rtn( void * arg ) { ... }
+int i = 3, rc;
+pthread_t t; $\C{// thread id}$
+`rc = pthread_create( &t, rtn, (void *)i );` $\C{// create and initialize task, type-unsafe input parameter}$
+\end{cfa}
+% void mycor( pthread_t cid, void * arg ) {
+%       int * value = (int *)arg;                               $\C{// type unsafe, pointer-size only}$
+%       // thread body
+% }
+% int main() {
+%       int input = 0, output;
+%       coroutine_t cid = coroutine_create( &mycor, (void *)&input ); $\C{// type unsafe, pointer-size only}$
+%       coroutine_resume( cid, (void *)input, (void **)&output ); $\C{// type unsafe, pointer-size only}$
+% }
+\CFA's preferred presentation model for generators/coroutines/threads is a hybrid of objects and functions, with an object-oriented flavour.
+Essentially, the generator/coroutine/thread function is semantically coupled with a generator/coroutine/thread custom type.
+The custom type solves several issues, while accessing the underlying mechanisms used by the custom types is still allowed.
+\subsection{Generator}
+Stackless generators have the potential to be very small and fast, \ie as small and fast as function call/return for both creation and execution.
+The \CFA goal is to achieve this performance target, possibly at the cost of some semantic complexity.
+A series of different kinds of generators and their implementation demonstrate how this goal is accomplished.
+Figure~\ref{f:FibonacciAsymmetricGenerator} shows an unbounded asymmetric generator for an infinite sequence of Fibonacci numbers written in C and \CFA, with a simple C implementation for the \CFA version.
+Figure~\ref{f:FibonacciAsymmetricGenerator} shows an unbounded asymmetric generator for an infinite sequence of Fibonacci numbers, presented left to right in C, in \CFA, and as the underlying C implementation for the \CFA version.
 This generator is an \emph{output generator}, producing a new result on each resumption.
 To compute Fibonacci, the previous two values in the sequence are retained to generate the next value, \ie @fn1@ and @fn@, plus the execution location where control restarts when the generator is resumed, \ie top or middle.
 An additional requirement is the ability to create an arbitrary number of generators (of any kind), \ie retaining one state in global variables is insufficient;
+An additional requirement is the ability to create an arbitrary number of generators of any kind, \ie retaining one state in global variables is insufficient;
 hence, state is retained in a closure between calls.
 Figure~\ref{f:CFibonacci} shows the C approach of manually creating the closure in structure @Fib@, and multiple instances of this closure provide multiple Fibonacci generators.
 The C version only has the middle execution state because the top execution state is declaration initialization.
 Figure~\ref{f:CFAFibonacciGen} shows the \CFA approach, which also has a manual closure, but replaces the structure with a custom \CFA @generator@ type.
+This generator type is then connected to a function that \emph{must be named \lstinline|main|},\footnote{
+The name \lstinline|main| has special meaning in C, specifically the function where a program starts execution.
+Hence, overloading this name for other starting points (generator/coroutine/thread) is a logical extension.}
+called a \emph{generator main}, which takes as its only parameter a reference to the generator type.
+Each generator type must have a function named \lstinline|main|,
+% \footnote{
+% The name \lstinline|main| has special meaning in C, specifically the function where a program starts execution.
+% Leveraging starting semantics to this name for generator/coroutine/thread is a logical extension.}
+called a \emph{generator main} (leveraging the starting semantics for program @main@ in C), which is connected to the generator type via its single reference parameter.
 The generator main contains @suspend@ statements that suspend execution without ending the generator versus @return@.
+For the Fibonacci generator-main,\footnote{
+The \CFA \lstinline|with| opens an aggregate scope making its fields directly accessible, like Pascal \lstinline|with|, but using parallel semantics.
+Multiple aggregates may be opened.}
+the top initialization state appears at the start and the middle execution state is denoted by statement @suspend@.
+For the Fibonacci generator-main, the top initialization state appears at the start and the middle execution state is denoted by statement @suspend@.
 Any local variables in @main@ \emph{are not retained} between calls;
 hence local variables are only for temporary computations \emph{between} suspends.
 …
 Resuming an ended (returned) generator is undefined.
 Function @resume@ returns its argument generator so it can be cascaded in an expression, in this case to print the next Fibonacci value @fn@ computed in the generator instance.
+Figure~\ref{f:CFibonacciSim} shows the C implementation of the \CFA generator only needs one additional field, @next@, to handle retention of execution state.
+The computed @goto@ at the start of the generator main, which branches after the previous suspend, adds very little cost to the resume call.
+Finally, an explicit generator type provides both design and performance benefits, such as multiple type-safe interface functions taking and returning arbitrary types.\footnote{
+The \CFA operator syntax uses \lstinline|?| to denote operands, which allows precise definitions for pre, post, and infix operators, \eg \lstinline|++?|, \lstinline|?++|, and \lstinline|?+?|, in addition \lstinline|?\{\}| denotes a constructor, as in \lstinline|foo `f` = `\{`...`\}`|, \lstinline|^?\{\}| denotes a destructor, and \lstinline|?()| is \CC function call \lstinline|operator()|.
+}%
+Figure~\ref{f:CFibonacciSim} shows the C implementation of the \CFA asymmetric generator.
+Only one execution-state field, @restart@, is needed to subscript the suspension points in the generator.
+At the start of the generator main, the @static@ declaration, @states@, is initialized to the N suspend points in the generator, where unary operator @&&@ takes the address of a label~\cite{gccValueLabels}.
+Next, the computed @goto@ selects the last suspend point and branches to it.
+Setting @restart@ and branching via the computed @goto@ adds very little cost to the suspend and resume calls.
+An advantage of the \CFA explicit generator type is the ability to allow multiple type-safe interface functions taking and returning arbitrary types.
 \begin{cfa}
 int ?()( Fib & fib ) { return `resume( fib )`.fn; } $\C[3.9in]{// function-call interface}$
+int ?()( Fib & fib, int N ) { for ( N - 1 ) `fib()`; return `fib()`; } $\C{// use function-call interface to skip N values}$
+double ?()( Fib & fib ) { return (int)`fib()` / 3.14159; } $\C{// different return type, cast prevents recursive call}\CRT$
+sout | (int)f1() | (double)f1() | f2( 2 ); // alternative interface, cast selects call based on return type, step 2 values
+int ?()( Fib & fib, int N ) { for ( N - 1 ) `fib()`; return `fib()`; } $\C{// add parameter to skip N values}$
+double ?()( Fib & fib ) { return (int)`fib()` / 3.14159; } $\C{// different return type, cast prevents recursive call}$
+Fib f;  int i;  double d;
+i = f();  i = f( 2 );  d = f();                                         $\C{// alternative interfaces}\CRT$
 \end{cfa}
 Now, the generator can be a separately compiled opaque-type only accessed through its interface functions.
 For contrast, Figure~\ref{f:PythonFibonacci} shows the equivalent Python Fibonacci generator, which does not use a generator type, and hence only has a single interface, but an implicit closure.
+Having to manually create the generator closure by moving local-state variables into the generator type is an additional programmer burden.
+(This restriction is removed by the coroutine in Section~\ref{s:Coroutine}.)
+This requirement follows from the generality of variable-size local-state, \eg local state with a variable-length array requires dynamic allocation because the array size is unknown at compile time.
+\begin{figure}
+%\centering
+\newbox\myboxA
+\begin{lrbox}{\myboxA}
+\begin{python}[aboveskip=0pt,belowskip=0pt]
+def Fib():
+        fn1, fn = 0, 1
+        while True:
+                `yield fn1`
+                fn1, fn = fn, fn1 + fn
+f1 = Fib()
+f2 = Fib()
+for i in range( 10 ):
+        print( next( f1 ), next( f2 ) )
+\end{python}
+\end{lrbox}
+\newbox\myboxB
+\begin{lrbox}{\myboxB}
+\begin{python}[aboveskip=0pt,belowskip=0pt]
+def Fmt():
+        try:
+                while True:                                             $\C[2.5in]{\# until destructor call}$
+                        for g in range( 5 ):            $\C{\# groups}$
+                                for b in range( 4 ):    $\C{\# blocks}$
+                                        while True:
+                                                ch = (yield)    $\C{\# receive from send}$
+                                                if '\n' not in ch: $\C{\# ignore newline}$
+                                                        break
+                                        print( ch, end='' )     $\C{\# print character}$
+                                print( '  ', end='' )   $\C{\# block separator}$
+                        print()                                         $\C{\# group separator}$
+        except GeneratorExit:                           $\C{\# destructor}$
+                if g != 0 | b != 0:                             $\C{\# special case}$
+                        print()
+fmt = Fmt()
+`next( fmt )`                                                   $\C{\# prime, next prewritten}$
+for i in range( 41 ):
+        `fmt.send( 'a' );`                                      $\C{\# send to yield}$
+\end{python}
+\end{lrbox}
+\hspace{30pt}
+\subfloat[Fibonacci]{\label{f:PythonFibonacci}\usebox\myboxA}
+\hspace{3pt}
+\vrule
+\hspace{3pt}
+\subfloat[Formatter]{\label{f:PythonFormatter}\usebox\myboxB}
+\caption{Python generator}
+\label{f:PythonGenerator}
+\end{figure}
+Having to manually create the generator closure by moving local-state variables into the generator type is an additional programmer burden (removed by the coroutine in Section~\ref{s:Coroutine}).
+This manual requirement follows from the generality of allowing variable-size local-state, \eg local state with a variable-length array requires dynamic allocation as the array size is unknown at compile time.
 However, dynamic allocation significantly increases the cost of generator creation/destruction and is a showstopper for embedded real-time programming.
 But more importantly, the size of the generator type is tied to the local state in the generator main, which precludes separate compilation of the generator main, \ie a generator must be inlined or local state must be dynamically allocated.
 With respect to safety, we believe static analysis can discriminate local state from temporary variables in a generator, \ie variable usage spanning @suspend@, and generate a compile-time error.
 Finally, our current experience is that most generator problems have simple data state, including local state, but complex execution state, so the burden of creating the generator type is small.
 As well, C programmers are not afraid of this kind of semantic programming requirement, if it results in very small, fast generators.
+With respect to safety, we believe static analysis can discriminate persistent generator state from temporary generator-main state and raise a compile-time error for temporary usage spanning suspend points.
+Our experience using generators is that the problems have simple data state, including local state, but complex execution state, so the burden of creating the generator type is small.
+As well, C programmers are not afraid of this kind of semantic programming requirement, if it results in very small and fast generators.
 Figure~\ref{f:CFAFormatGen} shows an asymmetric \newterm{input generator}, @Fmt@, for restructuring text into groups of characters of fixed-size blocks, \ie the input on the left is reformatted into the output on the right, where newlines are ignored.
 …
 The example takes advantage of resuming a generator in the constructor to prime the loops so the first character sent for formatting appears inside the nested loops.
 The destructor provides a newline, if formatted text ends with a full line.
+Figure~\ref{f:CFormatSim} shows the C implementation of the \CFA input generator with one additional field and the computed @goto@.
+For contrast, Figure~\ref{f:PythonFormatter} shows the equivalent Python format generator with the same properties as the Fibonacci generator.
+Figure~\ref{f:DeviceDriverGen} shows a \emph{killer} asymmetric generator, a device-driver, because device drivers caused 70\%-85\% of failures in Windows/Linux~\cite{Swift05}.
+Device drivers follow the pattern of simple data state but complex execution state, \ie a finite state-machine (FSM) parsing a protocol.
+For example, the following protocol:
+Figure~\ref{f:CFormatGenImpl} shows the C implementation of the \CFA input generator with one additional field and the computed @goto@.
+For contrast, Figure~\ref{f:PythonFormatter} shows the equivalent Python format generator with the same properties as the \CFA format generator.
+% https://dl-acm-org.proxy.lib.uwaterloo.ca/
+An important application for the asymmetric generator is a device-driver, because device drivers are a significant source of operating-system errors: 85\% in Windows XP~\cite[p.~78]{Swift05} and 51.6\% in Linux~\cite[p.~1358]{Xiao19}. %\cite{Palix11}
+Swift \etal~\cite[p.~86]{Swift05} restructure device drivers using the Extension Procedure Call (XPC) within the kernel via functions @nooks_driver_call@ and @nooks_kernel_call@, which have coroutine properties context switching to separate stacks with explicit hand-off calls;
+however, the calls do not retain execution state, and hence always start from the top.
+The alternative approach for implementing device drivers is using stack-ripping.
+However, Adya \etal~\cite{Adya02} argue against stack ripping in Section 3.2 and suggest a hybrid approach in Section 4 using cooperatively scheduled \emph{fibers}, which is coroutining.
+Figure~\ref{f:DeviceDriverGen} shows the generator advantages in implementing a simple network device-driver with the following protocol:
 \begin{center}
 \ldots\, STX \ldots\, message \ldots\, ESC ETX \ldots\, message \ldots\, ETX 2-byte crc \ldots
 \end{center}
 is a network message beginning with the control character STX, ending with an ETX, and followed by a 2-byte cyclic-redundancy check.
+where the network message begins with the control character STX, ends with an ETX, and is followed by a two-byte cyclic-redundancy check.
 Control characters may appear in a message if preceded by an ESC.
 When a message byte arrives, it triggers an interrupt, and the operating system services the interrupt by calling the device driver with the byte read from a hardware register.
+The device driver returns a status code of its current state, and when a complete message is obtained, the operating system knows the message is in the message buffer.
+Hence, the device driver is an input/output generator.
+Note, the cost of creating and resuming the device-driver generator, @Driver@, is virtually identical to call/return, so performance in an operating-system kernel is excellent.
+As well, the data state is small, where variables @byte@ and @msg@ are communication variables for passing in message bytes and returning the message, and variables @lnth@, @crc@, and @sum@ are local variables that must be retained between calls and are manually hoisted into the generator type.
+% Manually, detecting and hoisting local-state variables is easy when the number is small.
+In contrast, the execution state is large, with one @resume@ and seven @suspend@s.
+Hence, the key benefits of the generator are correctness, safety, and maintenance because the execution states are transcribed directly into the programming language rather than using a table-driven approach.
+Because FSMs can be complex and frequently occur in important domains, direct generator support is important in a system programming language.
+The device driver returns a status code of its current state, and when a complete message is obtained, the operating system reads the message accumulated in the supplied buffer.
+Hence, the device driver is an input/output generator, where the cost of resuming the device-driver generator is the same as call and return, so performance in an operating-system kernel is excellent.
+The key benefits of using a generator are correctness, safety, and maintenance because the execution states are transcribed directly into the programming language rather than table lookup or stack ripping.
+% The conclusion is that FSMs are complex and occur in important domains, so direct generator support is important in a system programming language.
 \begin{figure}
 \centering
-\newbox\myboxA
-\begin{lrbox}{\myboxA}
-\begin{python}[aboveskip=0pt,belowskip=0pt]
-def Fib():
-        fn1, fn = 0, 1
-        while True:
-                `yield fn1`
-                fn1, fn = fn, fn1 + fn
-f1 = Fib()
-f2 = Fib()
-for i in range( 10 ):
-        print( next( f1 ), next( f2 ) )
-\end{python}
-\end{lrbox}
-\newbox\myboxB
-\begin{lrbox}{\myboxB}
-\begin{python}[aboveskip=0pt,belowskip=0pt]
-def Fmt():
-        try:
-                while True:
-                        for g in range( 5 ):
-                                for b in range( 4 ):
-                                        print( `(yield)`, end='' )
-                                print( '  ', end='' )
-                        print()
-        except GeneratorExit:
-                if g != 0 | b != 0:
-                        print()
-fmt = Fmt()
-`next( fmt )`                    # prime, next prewritten
-for i in range( 41 ):
-        `fmt.send( 'a' );`      # send to yield
-\end{python}
-\end{lrbox}
-\subfloat[Fibonacci]{\label{f:PythonFibonacci}\usebox\myboxA}
-\hspace{3pt}
-\vrule
-\hspace{3pt}
-\subfloat[Formatter]{\label{f:PythonFormatter}\usebox\myboxB}
-\caption{Python generator}
-\label{f:PythonGenerator}
-\bigskip
 \begin{tabular}{@{}l|l@{}}
 \begin{cfa}[aboveskip=0pt,belowskip=0pt]
 …
 `generator` Driver {
         Status status;
         unsigned char byte, * msg; // communication
         unsigned int lnth, sum;      // local state
         unsigned short int crc;
+        char byte, * msg; // communication
+        int lnth, sum;      // local state
+        short int crc;
 };
 void ?{}( Driver & d, char * m ) { d.msg = m; }
 …
 \end{figure}
 Figure~\ref{f:CFAPingPongGen} shows a symmetric generator, where the generator resumes another generator, forming a resume/resume cycle.
+Generators can also have symmetric activation using resume/resume to create control-flow cycles among generators.
 (The trivial cycle is a generator resuming itself.)
 This control flow is similar to recursion for functions but without stack growth.
+The steps for symmetric control-flow are creating, executing, and terminating the cycle.
+Figure~\ref{f:PingPongFullCoroutineSteps} shows the steps for symmetric control-flow for the ping/pong program in Figure~\ref{f:CFAPingPongGen}.
+The program starts by creating the generators, @ping@ and @pong@, and then assigns the partners that form the cycle.
 Constructing the cycle must deal with definition-before-use to close the cycle, \ie, the first generator must know about the last generator, which is not within scope.
 (This issue occurs for any cyclic data structure.)
+% The example creates all the generators and then assigns the partners that form the cycle.
+% Alternatively, the constructor can assign the partners as they are declared, except the first, and the first-generator partner is set after the last generator declaration to close the cycle.
+Once the cycle is formed, the program main resumes one of the generators, and the generators can then traverse an arbitrary cycle using @resume@ to activate partner generator(s).
+% (Alternatively, the constructor can assign the partners as they are declared, except the first, and the first-generator partner is set after the last generator declaration to close the cycle.)
+Once the cycle is formed, the program main resumes one of the generators, @ping@, and the generators can then traverse an arbitrary number of cycles using @resume@ to activate partner generator(s).
 Terminating the cycle is accomplished by @suspend@ or @return@, both of which go back to the stack frame that started the cycle (program main in the example).
+Note, the creator and starter may be different, \eg if the creator calls another function that starts the cycle.
 The starting stack-frame is below the last active generator because the resume/resume cycle does not grow the stack.
+Also, since local variables are not retained in the generator function, it does not contain any objects with destructors that must be called, so the  cost is the same as a function return.
+Destructor cost occurs when the generator instance is deallocated, which is easily controlled by the programmer.
+Figure~\ref{f:CPingPongSim} shows the implementation of the symmetric generator, where the complexity is the @resume@, which needs an extension to the calling convention to perform a forward rather than backward jump.
+This jump-starts at the top of the next generator main to re-execute the normal calling convention to make space on the stack for its local variables.
+However, before the jump, the caller must reset its stack (and any registers) equivalent to a @return@, but subsequently jump forward.
+This semantics is basically a tail-call optimization, which compilers already perform.
+The example shows the assembly code to undo the generator's entry code before the direct jump.
+This assembly code depends on what entry code is generated, specifically if there are local variables and the level of optimization.
+To provide this new calling convention requires a mechanism built into the compiler, which is beyond the scope of \CFA at this time.
+Nevertheless, it is possible to hand generate any symmetric generators for proof of concept and performance testing.
+A compiler could also eliminate other artifacts in the generator simulation to further increase performance, \eg LLVM has various coroutine support~\cite{CoroutineTS}, and \CFA can leverage this support should it fork @clang@.
+Also, since local variables are not retained in the generator function, there are no objects with destructors to be called, so the cost is the same as a function return.
+Destructor cost occurs when the generator instance is deallocated by the creator.
+\begin{figure}
+\centering
+\input{FullCoroutinePhases.pstex_t}
+\vspace*{-10pt}
+\caption{Symmetric coroutine steps: Ping / Pong}
+\label{f:PingPongFullCoroutineSteps}
+\end{figure}
 \begin{figure}
 …
 \begin{cfa}[aboveskip=0pt,belowskip=0pt]
 `generator PingPong` {
+        int N, i;                               // local state
         const char * name;
-        int N;
-        int i;                          // local state
         PingPong & partner; // rebindable reference
 };
 void `main( PingPong & pp )` with(pp) {
         for ( ; i < N; i += 1 ) {
                 sout | name | i;
 …
 \begin{cfa}[escapechar={},aboveskip=0pt,belowskip=0pt]
 typedef struct PingPong {
+        int restart, N, i;
         const char * name;
-        int N, i;
         struct PingPong * partner;
-        void * next;
 } PingPong;
 #define PPCtor(name, N) {name,N,0,NULL,NULL}
+#define PPCtor(name, N) {0, N, 0, name, NULL}
 void comain( PingPong * pp ) {
+        if ( pp->next ) goto *pp->next;
+        pp->next = &&cycle;
+        static void * states[] = {&&s0, &&s1};
+        goto *states[pp->restart];
+  s0: pp->restart = 1;
         for ( ; pp->i < pp->N; pp->i += 1 ) {
                 printf( "%s %d\n", pp->name, pp->i );
                 asm( "mov  %0,%%rdi" : "=m" (pp->partner) );
                 asm( "mov  %rdi,%rax" );
+                asm( "popq %rbx" );
+                asm( "add  $16, %rsp" );
+                asm( "popq %rbp" );
                 asm( "jmp  comain" );
           cycle: ;
+          s1: ;
+        }
+}
 …
 \end{figure}
+Finally, part of this generator work was inspired by the recent \CCtwenty generator proposal~\cite{C++20Coroutine19} (which they call coroutines).
+Figure~\ref{f:CPingPongSim} shows the C implementation of the \CFA symmetric generator, where there is still only one additional field, @restart@, but @resume@ is more complex because it does a forward rather than backward jump.
+Before the jump, the parameter for the next call @partner@ is placed into the register used for the first parameter, @rdi@, and the remaining registers are reset for a return.
+The @jmp comain@ restarts the function but with a different parameter, so the new call's behavior depends on the state of the coroutine type, \ie branch to restart location with different data state.
+While the semantics of call forward is a tail-call optimization, which compilers perform, the generator state is different on each call rather than a common state for a tail-recursive function (\ie the parameter to the function never changes during the forward calls).
+However, this assembler code depends on what entry code is generated, specifically if there are local variables and the level of optimization.
+Hence, internal compiler support is necessary for any forward call or backwards return, \eg LLVM has various coroutine support~\cite{CoroutineTS}, and \CFA can leverage this support should it eventually fork @clang@.
+For this reason, \CFA does not support general symmetric generators at this time, but it is possible to hand generate any symmetric generators, as in Figure~\ref{f:CPingPongSim}, for proof of concept and performance testing.
+Finally, part of this generator work was inspired by the recent \CCtwenty coroutine proposal~\cite{C++20Coroutine19}, which uses the general term coroutine to mean generator.
 Our work provides the same high-performance asymmetric generators as \CCtwenty, and extends their work with symmetric generators.
 An additional \CCtwenty generator feature allows @suspend@ and @resume@ to be followed by a restricted compound statement that is executed after the current generator has reset its stack but before calling the next generator, specified with \CFA syntax:
 …
 \label{s:Coroutine}
+Stackful coroutines extend generator semantics, \ie there is an implicit closure and @suspend@ may appear in a helper function called from the coroutine main.
+A coroutine is specified by replacing @generator@ with @coroutine@ for the type.
+Coroutine generality results in higher cost for creation, due to dynamic stack allocation, execution, due to context switching among stacks, and terminating, due to possible stack unwinding and dynamic stack deallocation.
+Stackful coroutines (Table~\ref{t:ExecutionPropertyComposition} case 5) extend generator semantics with an implicit closure and @suspend@ may appear in a helper function called from the coroutine main because of the separate stack.
+Note, simulating coroutines with stacks of generators, \eg Python with @yield from@, cannot handle symmetric control-flow.
+Furthermore, all stack components must be generators, so it is impossible to call a library function passing a generator that yields.
+Creating a generator copy of the library function may be impossible because the library function is opaque.
+A \CFA coroutine is specified by replacing @generator@ with @coroutine@ for the type.
+Coroutine generality results in higher cost for creation, due to dynamic stack allocation, for execution, due to context switching among stacks, and for terminating, due to possible stack unwinding and dynamic stack deallocation.
 A series of different kinds of coroutines and their implementations demonstrate how coroutines extend generators.
 First, the previous generator examples are converted to their coroutine counterparts, allowing local-state variables to be moved from the generator type into the coroutine main.
+\begin{description}
+\item[Fibonacci]
+Move the declaration of @fn1@ to the start of coroutine main.
+Now the coroutine type only contains communication variables between interface functions and the coroutine main.
+\begin{center}
+\begin{tabular}{@{}l|l|l|l@{}}
+\multicolumn{1}{c|}{Fibonacci} & \multicolumn{1}{c|}{Formatter} & \multicolumn{1}{c|}{Device Driver} & \multicolumn{1}{c}{PingPong} \\
+\hline
 \begin{cfa}[xleftmargin=0pt]
 void main( Fib & fib ) with(fib) {
+void main( Fib & fib ) ...
         `int fn1;`
+\end{cfa}
+\item[Formatter]
+Move the declaration of @g@ and @b@ to the for loops in the coroutine main.
+\end{cfa}
+&
 \begin{cfa}[xleftmargin=0pt]
 for ( `g`; 5 ) {
         for ( `b`; 4 ) {
+\end{cfa}
+\item[Device Driver]
+Move the declaration of @lnth@ and @sum@ to their points of initialization.
+\end{cfa}
+&
 \begin{cfa}[xleftmargin=0pt]
+        status = CONT;
+        `unsigned int lnth = 0, sum = 0;`
+        ...
+        `unsigned short int crc = byte << 8;`
+\end{cfa}
+\item[PingPong]
+Move the declaration of @i@ to the for loop in the coroutine main.
+status = CONT;
+`int lnth = 0, sum = 0;`
+...
+`short int crc = byte << 8;`
+\end{cfa}
+&
 \begin{cfa}[xleftmargin=0pt]
 void main( PingPong & pp ) with(pp) {
+void main( PingPong & pp ) ...
         for ( `i`; N ) {
+\end{cfa}
+\end{description}
+\end{cfa}
+\end{tabular}
+\end{center}
 It is also possible to refactor code containing local-state and @suspend@ statements into a helper function, like the computation of the CRC for the device driver.
 \begin{cfa}
+unsigned int Crc() {
+        `suspend;`
+        unsigned short int crc = byte << 8;
+        `suspend;`
+        status = (crc | byte) == sum ? MSG : ECRC;
+int Crc() {
+        `suspend;`  short int crc = byte << 8;
+        `suspend;`  status = (crc | byte) == sum ? MSG : ECRC;
         return crc;
+}
 \end{cfa}
 A call to this function is placed at the end of the driver's coroutine-main.
 For complex finite-state machines, refactoring is part of normal program abstraction, especially when code is used in multiple places.
+A call to this function is placed at the end of the device driver's coroutine-main.
+For complex FSMs, refactoring is part of normal program abstraction, especially when code is used in multiple places.
 Again, this complexity is usually associated with execution state rather than data state.
 \begin{comment}
 Figure~\ref{f:Coroutine3States} creates a @coroutine@ type, @`coroutine` Fib { int fn; }@, which provides communication, @fn@, for the \newterm{coroutine main}, @main@, which runs on the coroutine stack, and possibly multiple interface functions, \eg @next@.
 Like the structure in Figure~\ref{f:ExternalState}, the coroutine type allows multiple instances, where instances of this type are passed to the (overloaded) coroutine main.
+Figure~\ref{f:Coroutine3States} creates a @coroutine@ type, @`coroutine` Fib { int fn; }@, which provides communication, @fn@, for the \newterm{coroutine main}, @main@, which runs on the coroutine stack, and possibly multiple interface functions, \eg @restart@.
+Like the structure in Figure~\ref{f:ExternalState}, the coroutine type allows multiple instances, where instances of this type are passed to the overloaded coroutine main.
 The coroutine main's stack holds the state for the next generation, @f1@ and @f2@, and the code represents the three states in the Fibonacci formula via the three suspend points, to context switch back to the caller's @resume@.
 The interface function @next@, takes a Fibonacci instance and context switches to it using @resume@;
+The interface function @restart@, takes a Fibonacci instance and context switches to it using @resume@;
 on restart, the Fibonacci field, @fn@, contains the next value in the sequence, which is returned.
 The first @resume@ is special because it allocates the coroutine stack and cocalls its coroutine main on that stack;
 …
 \begin{figure}
 \centering
-\lstset{language=CFA,escapechar={},moredelim=**[is][\protect\color{red}]{`}{`}}% allow $
 \begin{tabular}{@{}l@{\hspace{2\parindentlnth}}l@{}}
 \begin{cfa}
 `coroutine` Prod {
         Cons & c;                       // communication
+        Cons & c;                       $\C[1.5in]{// communication}$
         int N, money, receipt;
 };
 void main( Prod & prod ) with( prod ) {
+        // 1st resume starts here
+        for ( i; N ) {
+        for ( i; N ) {          $\C{// 1st resume}\CRT$
                 int p1 = random( 100 ), p2 = random( 100 );
-                sout | p1 | " " | p2;
                 int status = delivery( c, p1, p2 );
-                sout | " $" | money | nl | status;
                 receipt += 1;
+        }
         stop( c );
-        sout | "prod stops";
+}
 int payment( Prod & prod, int money ) {
 …
 \begin{cfa}
 `coroutine` Cons {
         Prod & p;                       // communication
+        Prod & p;                       $\C[1.5in]{// communication}$
         int p1, p2, status;
         bool done;
 };
 void ?{}( Cons & cons, Prod & p ) {
         &cons.p = &p; // reassignable reference
+        &cons.p = &p;           $\C{// reassignable reference}$
         cons.[status, done ] = [0, false];
+}
 void main( Cons & cons ) with( cons ) {
+        // 1st resume starts here
+        int money = 1, receipt;
+        int money = 1, receipt; $\C{// 1st resume}\CRT$
         for ( ; ! done; ) {
-                sout | p1 | " " | p2 | nl | " $" | money;
                 status += 1;
                 receipt = payment( p, money );
-                sout | " #" | receipt;
                 money += 1;
+        }
-        sout | "cons stops";
+}
 int delivery( Cons & cons, int p1, int p2 ) {
 …
 Figure~\ref{f:ProdCons} shows the ping-pong example in Figure~\ref{f:CFAPingPongGen} extended into a producer/consumer symmetric-coroutine performing bidirectional communication.
 This example is illustrative because both producer/consumer have two interface functions with @resume@s that suspend execution in these interface (helper) functions.
+This example is illustrative because both producer and consumer have two interface functions with @resume@s that suspend execution in these interface functions.
 The program main creates the producer coroutine, passes it to the consumer coroutine in its initialization, and closes the cycle at the call to @start@ along with the number of items to be produced.
+The first @resume@ of @prod@ creates @prod@'s stack with a frame for @prod@'s coroutine main at the top, and context switches to it.
+@prod@'s coroutine main starts, creates local-state variables that are retained between coroutine activations, and executes $N$ iterations, each generating two random values, calling the consumer to deliver the values, and printing the status returned from the consumer.
+The producer call to @delivery@ transfers values into the consumer's communication variables, resumes the consumer, and returns the consumer status.
+On the first resume, @cons@'s stack is created and initialized, holding local-state variables retained between subsequent activations of the coroutine.
+The consumer iterates until the @done@ flag is set, prints the values delivered by the producer, increments status, and calls back to the producer via @payment@, and on return from @payment@, prints the receipt from the producer and increments @money@ (inflation).
+The call from the consumer to @payment@ introduces the cycle between producer and consumer.
+When @payment@ is called, the consumer copies values into the producer's communication variable and a resume is executed.
+The context switch restarts the producer at the point where it last context switched, so it continues in @delivery@ after the resume.
+@delivery@ returns the status value in @prod@'s coroutine main, where the status is printed.
+The loop then repeats calling @delivery@, where each call resumes the consumer coroutine.
+The context switch to the consumer continues in @payment@.
+The consumer increments and returns the receipt to the call in @cons@'s coroutine main.
+The loop then repeats calling @payment@, where each call resumes the producer coroutine.
+The call to @start@ is the first @resume@ of @prod@, which remembers the program main as the starter and creates @prod@'s stack with a frame for @prod@'s coroutine main at the top, and context switches to it.
+@prod@'s coroutine main starts, creates local-state variables that are retained between coroutine activations, and executes $N$ iterations, each generating two random values, calling the consumer's @delivery@ function to transfer the values, and printing the status returned from the consumer.
+The producer's call to @delivery@ transfers values into the consumer's communication variables, resumes the consumer, and returns the consumer status.
+Similarly on the first resume, @cons@'s stack is created and initialized, holding local-state variables retained between subsequent activations of the coroutine.
+The symmetric coroutine cycle forms when the consumer calls the producer's @payment@ function, which resumes the producer in the consumer's delivery function.
+When the producer calls @delivery@ again, it resumes the consumer in the @payment@ function.
+Both interface functions then return to their corresponding coroutine-main functions for the next cycle.
 Figure~\ref{f:ProdConsRuntimeStacks} shows the runtime stacks of the program main, and the coroutine mains for @prod@ and @cons@ during the cycling.
+As a consequence of a coroutine retaining its last resumer for suspending back, these reverse pointers allow @suspend@ to cycle \emph{backwards} around a symmetric coroutine cycle.
 \begin{figure}
 …
 \caption{Producer / consumer runtime stacks}
 \label{f:ProdConsRuntimeStacks}
-\medskip
-\begin{center}
-\input{FullCoroutinePhases.pstex_t}
-\end{center}
-\vspace*{-10pt}
-\caption{Ping / Pong coroutine steps}
-\label{f:PingPongFullCoroutineSteps}
 \end{figure}
 Terminating a coroutine cycle is more complex than a generator cycle, because it requires context switching to the program main's \emph{stack} to shutdown the program, whereas generators started by the program main run on its stack.
+Furthermore, each deallocated coroutine must guarantee all destructors are run for objects allocated in the coroutine type \emph{and} allocated on the coroutine's stack at the point of suspension, which can be arbitrarily deep.
+When a coroutine's main ends, its stack is already unwound so any stack allocated objects with destructors have been finalized.
+The na\"{i}ve semantics for coroutine-cycle termination is to context switch to the last resumer, like executing a @suspend@/@return@ in a generator.
+Furthermore, each deallocated coroutine must execute all destructors for objects allocated in the coroutine type \emph{and} allocated on the coroutine's stack at the point of suspension, which can be arbitrarily deep.
+In the example, termination begins with the producer's loop stopping after N iterations and calling the consumer's @stop@ function, which sets the @done@ flag, resumes the consumer in function @payment@, terminating the call, and the consumer's loop in its coroutine main.
+% (Not shown is having @prod@ raise a nonlocal @stop@ exception at @cons@ after it finishes generating values and suspend back to @cons@, which catches the @stop@ exception to terminate its loop.)
+When the consumer's main ends, its stack is already unwound so any stack allocated objects with destructors are finalized.
+The question now is where does control continue?
+The na\"{i}ve semantics for coroutine-cycle termination is to context switch to the last resumer, like executing a @suspend@ or @return@ in a generator.
 However, for coroutines, the last resumer is \emph{not} implicitly below the current stack frame, as for generators, because each coroutine's stack is independent.
 Unfortunately, it is impossible to determine statically if a coroutine is in a cycle and unrealistic to check dynamically (graph-cycle problem).
 Hence, a compromise solution is necessary that works for asymmetric (acyclic) and symmetric (cyclic) coroutines.
 Our solution is to context switch back to the first resumer (starter) once the coroutine ends.
+Our solution is to retain a coroutine's starter (first resumer), and context switch back to the starter when the coroutine ends.
+Hence, the consumer restarts its first resumer, @prod@, in @stop@, and when the producer ends, it restarts its first resumer, program main, in @start@ (see dashed lines from the end of the coroutine mains in Figure~\ref{f:ProdConsRuntimeStacks}).
 This semantics works well for the most common asymmetric and symmetric coroutine usage patterns.
+For asymmetric coroutines, it is common for the first resumer (starter) coroutine to be the only resumer.
+All previous generators converted to coroutines have this property.
+For symmetric coroutines, it is common for the cycle creator to persist for the lifetime of the cycle.
+Hence, the starter coroutine is remembered on the first resume and ending the coroutine resumes the starter.
+Figure~\ref{f:ProdConsRuntimeStacks} shows this semantics by the dashed lines from the end of the coroutine mains: @prod@ starts @cons@ so @cons@ resumes @prod@ at the end, and the program main starts @prod@ so @prod@ resumes the program main at the end.
+For other scenarios, it is always possible to devise a solution with additional programming effort, such as forcing the cycle forward (backward) to a safe point before starting termination.
+The producer/consumer example does not illustrate the full power of the starter semantics because @cons@ always ends first.
+Assume generator @PingPong@ is converted to a coroutine.
+Figure~\ref{f:PingPongFullCoroutineSteps} shows the creation, starter, and cyclic execution steps of the coroutine version.
+The program main creates (declares) coroutine instances @ping@ and @pong@.
+Next, program main resumes @ping@, making it @ping@'s starter, and @ping@'s main resumes @pong@'s main, making it @pong@'s starter.
+Execution forms a cycle when @pong@ resumes @ping@, and cycles $N$ times.
+By adjusting $N$ for either @ping@/@pong@, it is possible to have either one finish first, instead of @pong@ always ending first.
+If @pong@ ends first, it resumes its starter @ping@ in its coroutine main, then @ping@ ends and resumes its starter the program main in function @start@.
+If @ping@ ends first, it resumes its starter the program main in function @start@.
+Regardless of the cycle complexity, the starter stack always leads back to the program main, but the stack can be entered at an arbitrary point.
+Once back at the program main, coroutines @ping@ and @pong@ are deallocated.
+For generators, deallocation runs the destructors for all objects in the generator type.
+For coroutines, deallocation deals with objects in the coroutine type and must also run the destructors for any objects pending on the coroutine's stack for any unterminated coroutine.
+Hence, if a coroutine's destructor detects the coroutine is not ended, it implicitly raises a cancellation exception (uncatchable exception) at the coroutine and resumes it so the cancellation exception can propagate to the root of the coroutine's stack destroying all local variables on the stack.
+So the \CFA semantics for the generator and coroutine ensure both can be safely deallocated at any time, regardless of their current state, like any other aggregate object.
+Explicitly raising normal exceptions at another coroutine can replace flag variables, like @stop@, \eg @prod@ raises a @stop@ exception at @cons@ after it finishes generating values and resumes @cons@, which catches the @stop@ exception to terminate its loop.
+Finally, there is an interesting effect for @suspend@ with symmetric coroutines.
+A coroutine must retain its last resumer to suspend back because the resumer is on a different stack.
+These reverse pointers allow @suspend@ to cycle \emph{backwards}, which may be useful in certain cases.
+However, there is an anomaly if a coroutine resumes itself, because it overwrites its last resumer with itself, losing the ability to resume the last external resumer.
+To prevent losing this information, a self-resume does not overwrite the last resumer.
+\subsection{Generator / Coroutine Implementation}
+A significant implementation challenge for generators/coroutines (and threads in Section~\ref{s:threads}) is adding extra fields to the custom types and related functions, \eg inserting code after/before the coroutine constructor/destructor and @main@ to create/initialize/de-initialize/destroy any extra fields, \eg stack.
+There are several solutions to this problem, which follow from the object-oriented flavour of adopting custom types.
+For asymmetric coroutines, it is common for the first resumer (starter) coroutine to be the only resumer;
+for symmetric coroutines, it is common for the cycle creator to persist for the lifetime of the cycle.
+For other scenarios, it is always possible to devise a solution with additional programming effort, such as forcing the cycle forward or backward to a safe point before starting termination.
+Note, the producer/consumer example does not illustrate the full power of the starter semantics because @cons@ always ends first.
+Assume generator @PingPong@ in Figure~\ref{f:PingPongSymmetricGenerator} is converted to a coroutine.
+Unlike generators, coroutines have a starter structure with multiple levels, where the program main starts @ping@ and @ping@ starts @pong@.
+By adjusting $N$ for either @ping@ or @pong@, it is possible to have either finish first.
+If @pong@ ends first, it resumes its starter @ping@ in its coroutine main, then @ping@ ends and resumes its starter the program main on return;
+if @ping@ ends first, it resumes its starter the program main on return.
+Regardless of the cycle complexity, the starter structure always leads back to the program main, but the path can be entered at an arbitrary point.
+Once back at the program main (creator), coroutines @ping@ and @pong@ are deallocated, running any destructors for objects within the coroutine and possibly deallocating any coroutine stacks for non-terminated coroutines, where stack deallocation implies stack unwinding to find destructors for allocated objects on the stack.
+Hence, the \CFA termination semantics for the generator and coroutine ensure correct deallocation semantics, regardless of the coroutine's state (terminated or active), like any other aggregate object.
+\subsection{Generator / coroutine implementation}
+A significant implementation challenge for generators and coroutines (and threads in Section~\ref{s:threads}) is adding extra fields to the custom types and related functions, \eg inserting code after/before the coroutine constructor/destructor and @main@ to create/initialize/de-initialize/destroy any extra fields, \eg the coroutine stack.
+There are several solutions to this problem, which follow from the object-oriented flavor of adopting custom types.
 For object-oriented languages, inheritance is used to provide extra fields and code via explicit inheritance:
 …
 \end{cfa}
 % The problem is that the programming language and its tool chain, \eg debugger, @valgrind@, need to understand @baseCoroutine@ because it infers special property, so type @baseCoroutine@ becomes a de facto keyword and all types inheriting from it are implicitly custom types.
 The problem is that some special properties are not handled by existing language semantics, \eg the execution of constructors/destructors is in the wrong order to implicitly start threads because the thread must start \emph{after} all constructors as it relies on a completely initialized object, but the inherited constructor runs \emph{before} the derived.
+The problem is that some special properties are not handled by existing language semantics, \eg the execution of constructors and destructors is in the wrong order to implicitly start threads because the thread must start \emph{after} all constructors as it relies on a completely initialized object, but the inherited constructor runs \emph{before} the derived.
 Alternatives, such as explicitly starting threads as in Java, are repetitive and forgetting to call start is a common source of errors.
 An alternative is composition:
 …
 Users wanting to extend custom types or build their own can only do so in ways offered by the language.
 Furthermore, implementing custom types without language support may display the power of a programming language.
 \CFA blends the two approaches, providing custom types for idiomatic \CFA code, while extending and building new custom types is still possible, similar to Java concurrency with builtin and library.
+\CFA blends the two approaches, providing custom types for idiomatic \CFA code, while extending and building new custom types is still possible, similar to Java concurrency with builtin and library (@java.util.concurrent@) monitors.
 Part of the mechanism to generalize custom types is the \CFA trait~\cite[\S~2.3]{Moss18}, \eg the definition for custom-type @coroutine@ is anything satisfying the trait @is_coroutine@, and this trait both enforces and restricts the coroutine-interface functions.
 …
 forall( `dtype` T | is_coroutine(T) ) void $suspend$( T & ), resume( T & );
 \end{cfa}
+Note, copying generators/coroutines/threads is not meaningful.
+For example, both the resumer and suspender descriptors can have bidirectional pointers;
+copying these coroutines does not update the internal pointers so behaviour of both copies would be difficult to understand.
+Furthermore, two coroutines cannot logically execute on the same stack.
+A deep coroutine copy, which copies the stack, is also meaningless in an unmanaged language (no garbage collection), like C, because the stack may contain pointers to objects within it that require updating for the copy.
+The \CFA @dtype@ property provides no \emph{implicit} copying operations and the @is_coroutine@ trait provides no \emph{explicit} copying operations, so all coroutines must be passed by reference (pointer).
+The function definitions ensure there is a statically typed @main@ function that is the starting point (first stack frame) of a coroutine, and a mechanism to get (read) the coroutine descriptor from its handle.
+The @main@ function has no return value or additional parameters because the coroutine type allows an arbitrary number of interface functions with corresponding arbitrary typed input/output values versus fixed ones.
+Note, copying generators, coroutines, and threads is undefined because multiple objects cannot execute on a shared stack and stack copying does not work in unmanaged languages (no garbage collection), like C, because the stack may contain pointers to objects within it that require updating for the copy.
+The \CFA @dtype@ property provides no \emph{implicit} copying operations and the @is_coroutine@ trait provides no \emph{explicit} copying operations, so all coroutines must be passed by reference or pointer.
+The function definitions ensure there is a statically typed @main@ function that is the starting point (first stack frame) of a coroutine, and a mechanism to read the coroutine descriptor from its handle.
+The @main@ function has no return value or additional parameters because the coroutine type allows an arbitrary number of interface functions with arbitrary typed input and output values versus fixed ones.
 The advantage of this approach is that users can easily create different types of coroutines, \eg changing the memory layout of a coroutine is trivial when implementing the @get_coroutine@ function, and possibly redefining \textsf{suspend} and @resume@.
 …
 The combination of custom types and fundamental @trait@ description of these types allows a concise specification for programmers and tools, while more advanced programmers can have tighter control over memory layout and initialization.
 Figure~\ref{f:CoroutineMemoryLayout} shows different memory-layout options for a coroutine (where a task is similar).
 The coroutine handle is the @coroutine@ instance containing programmer specified type global/communication variables across interface functions.
+Figure~\ref{f:CoroutineMemoryLayout} shows different memory-layout options for a coroutine (where a thread is similar).
+The coroutine handle is the @coroutine@ instance containing programmer specified type global and communication variables across interface functions.
 The coroutine descriptor contains all implicit declarations needed by the runtime, \eg @suspend@/@resume@, and can be part of the coroutine handle or separate.
 The coroutine stack can appear in a number of locations and be fixed or variable sized.
+Hence, the coroutine's stack could be a VLS\footnote{
+We are examining variable-sized structures (VLS), where fields can be variable-sized structures or arrays.
+Once allocated, a VLS is fixed sized.}
+Hence, the coroutine's stack could be a variable-length structure (VLS)
+% \footnote{
+% We are examining VLSs, where fields can be variable-sized structures or arrays.
+% Once allocated, a VLS is fixed sized.}
 on the allocating stack, provided the allocating stack is large enough.
 For a VLS stack allocation/deallocation is an inexpensive adjustment of the stack pointer, modulo any stack constructor costs (\eg initial frame setup).
 For heap stack allocation, allocation/deallocation is an expensive heap allocation (where the heap can be a shared resource), modulo any stack constructor costs.
 With heap stack allocation, it is also possible to use a split (segmented) stack calling convention, available with gcc and clang, so the stack is variable sized.
 Currently, \CFA supports stack/heap allocated descriptors but only fixed-sized heap allocated stacks.
+For a VLS stack, allocation and deallocation is an inexpensive adjustment of the stack pointer, modulo any stack constructor costs such as initial frame setup.
+For stack allocation in the heap, allocation and deallocation is an expensive allocation, where the heap can be a shared resource, modulo any stack constructor costs.
+It is also possible to use a split or segmented stack calling convention, available with gcc and clang, allowing a variable-sized stack via a set of connected blocks in the heap.
+Currently, \CFA supports stack and heap allocated descriptors but only fixed-sized heap allocated stacks.
 In \CFA debug-mode, the fixed-sized stack is terminated with a write-only page, which catches most stack overflows.
 Experience teaching concurrency with \uC~\cite{CS343} shows fixed-sized stacks are rarely an issue for students.
 Split-stack allocation is under development but requires recompilation of legacy code, which may be impossible.
+Split-stack allocation is under development but requires recompilation of legacy code, which is not always possible.
 \begin{figure}
 …
 Concurrency is nondeterministic scheduling of independent sequential execution paths (threads), where each thread has its own stack.
 A single thread with multiple call stacks, \newterm{coroutining}~\cite{Conway63,Marlin80}, does \emph{not} imply concurrency~\cite[\S~2]{Buhr05a}.
 In coroutining, coroutines self-schedule the thread across stacks so execution is deterministic.
+A single thread with multiple stacks, \ie coroutining, does \emph{not} imply concurrency~\cite[\S~3]{Buhr05a}.
+Coroutining self-schedules the thread across stacks so execution is deterministic.
 (It is \emph{impossible} to generate a concurrency error when coroutining.)
+However, coroutines are a stepping stone towards concurrency.
+The transition to concurrency, even for a single thread with multiple stacks, occurs when coroutines context switch to a \newterm{scheduling coroutine}, introducing non-determinism from the coroutine perspective~\cite[\S~3]{Buhr05a}.
+The transition to concurrency, even for a single thread with multiple stacks, occurs when coroutines context switch to a \newterm{scheduling coroutine}, introducing non-determinism from the coroutine perspective~\cite[\S~3]{Buhr05a}.
 Therefore, a minimal concurrency system requires coroutines \emph{in conjunction with a nondeterministic scheduler}.
+The resulting execution system now follows a cooperative threading model~\cite{Adya02,libdill}, called \newterm{non-preemptive scheduling}.
+Adding \newterm{preemption} introduces non-cooperative scheduling, where context switching occurs randomly between any two instructions often based on a timer interrupt, called \newterm{preemptive scheduling}.
+While a scheduler introduces uncertain execution among explicit context switches, preemption introduces uncertainty by introducing implicit context switches.
+The resulting execution system now follows a cooperative threading-model~\cite{Adya02,libdill} because context-switching points to the scheduler are known, but the next unblocking point is unknown due to the scheduler.
+Adding \newterm{preemption} introduces \newterm{non-cooperative} or \newterm{preemptive} scheduling, where context switching points to the scheduler are unknown as they can occur randomly between any two instructions often based on a timer interrupt.
 Uncertainty gives the illusion of parallelism on a single processor and provides a mechanism to access and increase performance on multiple processors.
 The reason is that the scheduler/runtime have complete knowledge about resources and how to best utilize them.
 However, the introduction of unrestricted nondeterminism results in the need for \newterm{mutual exclusion} and \newterm{synchronization}, which restrict nondeterminism for correctness;
+The reason is that the scheduler and runtime have complete knowledge about resources and how to best utilize them.
+However, the introduction of unrestricted nondeterminism results in the need for \newterm{mutual exclusion} and \newterm{synchronization}~\cite[\S~4]{Buhr05a}, which restrict nondeterminism for correctness;
 otherwise, it is impossible to write meaningful concurrent programs.
 Optimal concurrent performance is often obtained by having as much nondeterminism as mutual exclusion and synchronization correctness allow.
 A scheduler can be either stackless or stackful.
+A scheduler can also be stackless or stackful.
 For stackless, the scheduler performs scheduling on the stack of the current coroutine and switches directly to the next coroutine, so there is one context switch.
 For stackful, the current coroutine switches to the scheduler, which performs scheduling, and it then switches to the next coroutine, so there are two context switches.
 …
 \label{s:threads}
+Threading needs the ability to start a thread and wait for its completion.
+A common API for this ability is @fork@ and @join@.
+\begin{cquote}
+\begin{tabular}{@{}lll@{}}
+\multicolumn{1}{c}{\textbf{Java}} & \multicolumn{1}{c}{\textbf{\Celeven}} & \multicolumn{1}{c}{\textbf{pthreads}} \\
+\begin{cfa}
+class MyTask extends Thread {...}
+mytask t = new MyTask(...);
+Threading (Table~\ref{t:ExecutionPropertyComposition} case 11) needs the ability to start a thread and wait for its completion, where a common API is @fork@ and @join@.
+\vspace{4pt}
+\par\noindent
+\begin{tabular}{@{}l|l|l@{}}
+\multicolumn{1}{c|}{\textbf{Java}} & \multicolumn{1}{c|}{\textbf{\Celeven}} & \multicolumn{1}{c}{\textbf{pthreads}} \\
+\hline
+\begin{cfa}
+class MyThread extends Thread {...}
+mythread t = new MyThread(...);
 `t.start();` // start
 // concurrency
 …
+&
 \begin{cfa}
 class MyTask { ... } // functor
 MyTask mytask;
 `thread t( mytask, ... );` // start
+class MyThread { ... } // functor
+MyThread mythread;
+`thread t( mythread, ... );` // start
 // concurrency
 `t.join();` // wait
 …
 \end{cfa}
 \end{tabular}
+\end{cquote}
+\CFA has a simpler approach using a custom @thread@ type and leveraging declaration semantics (allocation/deallocation), where threads implicitly @fork@ after construction and @join@ before destruction.
+\begin{cfa}
+thread MyTask {};
+void main( MyTask & this ) { ... }
+\vspace{1pt}
+\par\noindent
+\CFA has a simpler approach using a custom @thread@ type and leveraging declaration semantics, allocation and deallocation, where threads implicitly @fork@ after construction and @join@ before destruction.
+\begin{cfa}
+thread MyThread {};
+void main( MyThread & this ) { ... }
 int main() {
         MyTask team`[10]`; $\C[2.5in]{// allocate stack-based threads, implicit start after construction}$
+        MyThread team`[10]`; $\C[2.5in]{// allocate stack-based threads, implicit start after construction}$
         // concurrency
 } $\C{// deallocate stack-based threads, implicit joins before destruction}$
 \end{cfa}
 This semantic ensures a thread is started and stopped exactly once, eliminating some programming errors, and scales to multiple threads for basic (termination) synchronization.
 For block allocation to arbitrary depth, including recursion, threads are created/destroyed in a lattice structure (tree with top and bottom).
+This semantic ensures a thread is started and stopped exactly once, eliminating some programming errors, and scales to multiple threads for basic termination synchronization.
+For block allocation to arbitrary depth, including recursion, threads are created and destroyed in a lattice structure (tree with top and bottom).
 Arbitrary topologies are possible using dynamic allocation, allowing threads to outlive their declaration scope, identical to normal dynamic allocation.
 \begin{cfa}
 MyTask * factory( int N ) { ... return `anew( N )`; } $\C{// allocate heap-based threads, implicit start after construction}$
+MyThread * factory( int N ) { ... return `anew( N )`; } $\C{// allocate heap-based threads, implicit start after construction}$
 int main() {
         MyTask * team = factory( 10 );
+        MyThread * team = factory( 10 );
         // concurrency
         `delete( team );` $\C{// deallocate heap-based threads, implicit joins before destruction}\CRT$
+        `adelete( team );` $\C{// deallocate heap-based threads, implicit joins before destruction}\CRT$
+}
 \end{cfa}
 …
 \subsection{Thread Implementation}
+\subsection{Thread implementation}
 Threads in \CFA are user level run by runtime kernel threads (see Section~\ref{s:CFARuntimeStructure}), where user threads provide concurrency and kernel threads provide parallelism.
 Like coroutines, and for the same design reasons, \CFA provides a custom @thread@ type and a @trait@ to enforce and restrict the task-interface functions.
+Like coroutines, and for the same design reasons, \CFA provides a custom @thread@ type and a @trait@ to enforce and restrict the thread-interface functions.
 \begin{cquote}
 \begin{tabular}{@{}c@{\hspace{3\parindentlnth}}c@{}}
 …
 \end{tabular}
 \end{cquote}
 Like coroutines, the @dtype@ property prevents \emph{implicit} copy operations and the @is_thread@ trait provides no \emph{explicit} copy operations, so threads must be passed by reference (pointer).
 Similarly, the function definitions ensure there is a statically typed @main@ function that is the thread starting point (first stack frame), a mechanism to get (read) the thread descriptor from its handle, and a special destructor to prevent deallocation while the thread is executing.
+Like coroutines, the @dtype@ property prevents \emph{implicit} copy operations and the @is_thread@ trait provides no \emph{explicit} copy operations, so threads must be passed by reference or pointer.
+Similarly, the function definitions ensure there is a statically typed @main@ function that is the thread starting point (first stack frame), a mechanism to read the thread descriptor from its handle, and a special destructor to prevent deallocation while the thread is executing.
 (The qualifier @mutex@ for the destructor parameter is discussed in Section~\ref{s:Monitor}.)
 The difference between the coroutine and thread is that a coroutine borrows a thread from its caller, so the first thread resuming a coroutine creates the coroutine's stack and starts running the coroutine main on the stack;
 whereas, a thread is scheduled for execution in @main@ immediately after its constructor is run.
 No return value or additional parameters are necessary for this function because the @thread@ type allows an arbitrary number of interface functions with corresponding arbitrary typed input/output values.
+No return value or additional parameters are necessary for this function because the @thread@ type allows an arbitrary number of interface functions with corresponding arbitrary typed input and output values.
 …
 \label{s:MutualExclusionSynchronization}
 Unrestricted nondeterminism is meaningless as there is no way to know when the result is completed without synchronization.
+Unrestricted nondeterminism is meaningless as there is no way to know when a result is completed and safe to access.
 To produce meaningful execution requires clawing back some determinism using mutual exclusion and synchronization, where mutual exclusion provides access control for threads using shared data, and synchronization is a timing relationship among threads~\cite[\S~4]{Buhr05a}.
+Some concurrent systems eliminate mutable shared-state by switching to stateless communication like message passing~\cite{Thoth,Harmony,V-Kernel,MPI} (Erlang, MPI), channels~\cite{CSP} (CSP, Go), actors~\cite{Akka} (Akka, Scala), or functional techniques (Haskell).
+The shared data protected by mutual exclusion is called a \newterm{critical section}~\cite{Dijkstra65}, and the protection can be simple, only 1 thread, or complex, only N kinds of threads, \eg group~\cite{Joung00} or readers/writer~\cite{Courtois71} problems.
+Without synchronization control in a critical section, an arriving thread can barge ahead of preexisting waiter threads resulting in short/long-term starvation, staleness and freshness problems, and incorrect transfer of data.
+Preventing or detecting barging is a challenge with low-level locks, but made easier through higher-level constructs.
+This challenge is often split into two different approaches: barging \emph{avoidance} and \emph{prevention}.
+Approaches that unconditionally release a lock for competing threads to acquire must use barging avoidance with flag/counter variable(s) to force barging threads to wait;
+approaches that conditionally hold locks during synchronization, \eg baton-passing~\cite{Andrews89}, prevent barging completely.
+At the lowest level, concurrent control is provided by atomic operations, upon which different kinds of locking mechanisms are constructed, \eg spin locks, semaphores~\cite{Dijkstra68b}, barriers, and path expressions~\cite{Campbell74}.
+However, for productivity it is always desirable to use the highest-level construct that provides the necessary efficiency~\cite{Hochstein05}.
+A significant challenge with locks is composability because it takes careful organization for multiple locks to be used while preventing deadlock.
+Easing composability is another feature higher-level mutual-exclusion mechanisms can offer.
+Some concurrent systems eliminate mutable shared-state by switching to non-shared communication like message passing~\cite{Thoth,Harmony,V-Kernel,MPI} (Erlang, MPI), channels~\cite{CSP} (CSP, Go), actors~\cite{Akka} (Akka, Scala), or functional techniques (Haskell).
 However, these approaches introduce a new communication mechanism for concurrency different from the standard communication using function call/return.
 Hence, a programmer must learn and manipulate two sets of design/programming patterns.
+Hence, a programmer must learn and manipulate two sets of design and programming patterns.
 While this distinction can be hidden away in library code, effective use of the library still has to take both paradigms into account.
+In contrast, approaches based on stateful models more closely resemble the standard call/return programming model, resulting in a single programming paradigm.
+At the lowest level, concurrent control is implemented by atomic operations, upon which different kinds of locking mechanisms are constructed, \eg semaphores~\cite{Dijkstra68b}, barriers, and path expressions~\cite{Campbell74}.
+However, for productivity it is always desirable to use the highest-level construct that provides the necessary efficiency~\cite{Hochstein05}.
+A newer approach for restricting non-determinism is transactional memory~\cite{Herlihy93}.
+While this approach is pursued in hardware~\cite{Nakaike15} and system languages, like \CC~\cite{Cpp-Transactions}, the performance and feature set is still too restrictive to be the main concurrency paradigm for system languages, which is why it is rejected as the core paradigm for concurrency in \CFA.
+One of the most natural, elegant, and efficient mechanisms for mutual exclusion and synchronization for shared-memory systems is the \emph{monitor}.
+First proposed by Brinch Hansen~\cite{Hansen73} and later described and extended by C.A.R.~Hoare~\cite{Hoare74}, many concurrent programming languages provide monitors as an explicit language construct: \eg Concurrent Pascal~\cite{ConcurrentPascal}, Mesa~\cite{Mesa}, Modula~\cite{Modula-2}, Turing~\cite{Turing:old}, Modula-3~\cite{Modula-3}, NeWS~\cite{NeWS}, Emerald~\cite{Emerald}, \uC~\cite{Buhr92a} and Java~\cite{Java}.
+In addition, operating-system kernels and device drivers have a monitor-like structure, although they often use lower-level primitives such as mutex locks or semaphores to simulate monitors.
+For these reasons, \CFA selected monitors as the core high-level concurrency construct, upon which higher-level approaches can be easily constructed.
+\subsection{Mutual Exclusion}
+A group of instructions manipulating a specific instance of shared data that must be performed atomically is called a \newterm{critical section}~\cite{Dijkstra65}, which is enforced by \newterm{simple mutual-exclusion}.
+The generalization is called a \newterm{group critical-section}~\cite{Joung00}, where multiple tasks with the same session use the resource simultaneously and different sessions are segregated, which is enforced by \newterm{complex mutual-exclusion} providing the correct kind and number of threads using a group critical-section.
+The readers/writer problem~\cite{Courtois71} is an instance of a group critical-section, where readers share a session but writers have a unique session.
+However, many solutions exist for mutual exclusion, which vary in terms of performance, flexibility and ease of use.
+Methods range from low-level locks, which are fast and flexible but require significant attention for correctness, to higher-level concurrency techniques, which sacrifice some performance to improve ease of use.
+Ease of use comes by either guaranteeing some problems cannot occur, \eg deadlock free, or by offering a more explicit coupling between shared data and critical section.
+For example, the \CC @std::atomic<T>@ offers an easy way to express mutual-exclusion on a restricted set of operations, \eg reading/writing, for numerical types.
+However, a significant challenge with locks is composability because it takes careful organization for multiple locks to be used while preventing deadlock.
+Easing composability is another feature higher-level mutual-exclusion mechanisms can offer.
+\subsection{Synchronization}
+Synchronization enforces relative ordering of execution, and synchronization tools provide numerous mechanisms to establish these timing relationships.
+Low-level synchronization primitives offer good performance and flexibility at the cost of ease of use;
+higher-level mechanisms often simplify usage by adding better coupling between synchronization and data, \eg receive-specific versus receive-any thread in message passing or offering specialized solutions, \eg barrier lock.
+Often synchronization is used to order access to a critical section, \eg ensuring a waiting writer thread enters the critical section before a calling reader thread.
+If the calling reader is scheduled before the waiting writer, the reader has barged.
+Barging can result in staleness/freshness problems, where a reader barges ahead of a writer and reads temporally stale data, or a writer barges ahead of another writer overwriting data with a fresh value preventing the previous value from ever being read (lost computation).
+Preventing or detecting barging is an involved challenge with low-level locks, which is made easier through higher-level constructs.
+This challenge is often split into two different approaches: barging avoidance and prevention.
+Algorithms that unconditionally release a lock for competing threads to acquire use barging avoidance during synchronization to force a barging thread to wait;
+algorithms that conditionally hold locks during synchronization, \eg baton-passing~\cite{Andrews89}, prevent barging completely.
+In contrast, approaches based on shared-state models more closely resemble the standard call and return programming model, resulting in a single programming paradigm.
+Finally, a newer approach for restricting non-determinism is transactional memory~\cite{Herlihy93}.
+While this approach is pursued in hardware~\cite{Nakaike15} and system languages, like \CC~\cite{Cpp-Transactions}, the performance and feature set is still too restrictive~\cite{Cascaval08,Boehm09} to be the main concurrency paradigm for system languages.
 …
 \label{s:Monitor}
+A \textbf{monitor} is a set of functions that ensure mutual exclusion when accessing shared state.
+More precisely, a monitor is a programming technique that implicitly binds mutual exclusion to static function scope, as opposed to locks, where mutual-exclusion is defined by acquire/release calls, independent of lexical context (analogous to block and heap storage allocation).
+Restricting acquire/release points eases programming, comprehension, and maintenance, at a slight cost in flexibility and efficiency.
+\CFA uses a custom @monitor@ type and leverages declaration semantics (deallocation) to protect active or waiting threads in a monitor.
+The following is a \CFA monitor implementation of an atomic counter.
+\begin{cfa}[morekeywords=nomutex]
+`monitor` Aint { int cnt; }; $\C[4.25in]{// atomic integer counter}$
+int ++?( Aint & `mutex`$\(_{opt}\)$ this ) with( this ) { return ++cnt; } $\C{// increment}$
+int ?=?( Aint & `mutex`$\(_{opt}\)$ lhs, int rhs ) with( lhs ) { cnt = rhs; } $\C{// conversions with int}\CRT$
+int ?=?( int & lhs, Aint & `mutex`$\(_{opt}\)$ rhs ) with( rhs ) { lhs = cnt; }
+\end{cfa}
+% The @Aint@ constructor, @?{}@, uses the \lstinline[morekeywords=nomutex]@nomutex@ qualifier indicating mutual exclusion is unnecessary during construction because an object is inaccessible (private) until after it is initialized.
+% (While a constructor may publish its address into a global variable, doing so generates a race-condition.)
+The prefix increment operation, @++?@, is normally @mutex@, indicating mutual exclusion is necessary during function execution, to protect the incrementing from race conditions, unless there is an atomic increment instruction for the implementation type.
+The assignment operators provide bidirectional conversion between an atomic and normal integer without accessing field @cnt@;
+these operations only need @mutex@, if reading/writing the implementation type is not atomic.
+The atomic counter is used without any explicit mutual-exclusion and provides thread-safe semantics, which is similar to the \CC template @std::atomic@.
+\begin{cfa}
+One of the most natural, elegant, efficient, high-level mechanisms for mutual exclusion and synchronization for shared-memory systems is the \emph{monitor} (Table~\ref{t:ExecutionPropertyComposition} case 2).
+First proposed by Brinch Hansen~\cite{Hansen73} and later described and extended by C.A.R.~Hoare~\cite{Hoare74}, many concurrent programming languages provide monitors as an explicit language construct: \eg Concurrent Pascal~\cite{ConcurrentPascal}, Mesa~\cite{Mesa}, Modula~\cite{Modula-2}, Turing~\cite{Turing:old}, Modula-3~\cite{Modula-3}, NeWS~\cite{NeWS}, Emerald~\cite{Emerald}, \uC~\cite{Buhr92a} and Java~\cite{Java}.
+In addition, operating-system kernels and device drivers have a monitor-like structure, although they often use lower-level primitives such as mutex locks or semaphores to manually implement a monitor.
+For these reasons, \CFA selected monitors as the core high-level concurrency construct, upon which higher-level approaches can be easily constructed.
+Figure~\ref{f:AtomicCounter} compares a \CFA and Java monitor implementing an atomic counter.
+(Like other concurrent programming languages, \CFA and Java have performant specializations for the basic types using atomic instructions.)
+A \newterm{monitor} is a set of functions that ensure mutual exclusion when accessing shared state.
+(Note, in \CFA, @monitor@ is short-hand for @mutex struct@.)
+More precisely, a monitor is a programming technique that implicitly binds mutual exclusion to static function scope by call and return, as opposed to locks, where mutual exclusion is defined by acquire/release calls, independent of lexical context (analogous to block and heap storage allocation).
+Restricting acquire and release points eases programming, comprehension, and maintenance, at a slight cost in flexibility and efficiency.
+As for other special types, \CFA has a custom @monitor@ type.
+\begin{figure}
+\centering
+\begin{lrbox}{\myboxA}
+\begin{cfa}[aboveskip=0pt,belowskip=0pt]
+`monitor` Aint { // atomic integer counter
+        int cnt;
+};
+int ++?( Aint & `mutex` this ) with(this) { return ++cnt; }
+int ?=?( Aint & `mutex` lhs, int rhs ) with(lhs) { cnt = rhs; }
+int ?=?(int & lhs, Aint & rhs) with(rhs) { lhs = cnt; }
 int i = 0, j = 0, k = 5;
+Aint x = { 0 }, y = { 0 }, z = { 5 }; $\C{// no mutex required}$
+++x; ++y; ++z; $\C{// safe increment by multiple threads}$
+x = 2; y = i; z = k; $\C{// conversions}$
+i = x; j = y; k = z;
+\end{cfa}
+\CFA monitors have \newterm{multi-acquire} semantics so the thread in the monitor may acquire it multiple times without deadlock, allowing recursion and calling other interface functions.
+\begin{cfa}
+monitor M { ... } m;
+void foo( M & mutex m ) { ... } $\C{// acquire mutual exclusion}$
+void bar( M & mutex m ) { $\C{// acquire mutual exclusion}$
+        ... `bar( m );` ... `foo( m );` ... $\C{// reacquire mutual exclusion}$
+}
+\end{cfa}
+\CFA monitors also ensure the monitor lock is released regardless of how an acquiring function ends (normal or exceptional), and returning a shared variable is safe via copying before the lock is released.
+Similar safety is offered by \emph{explicit} mechanisms like \CC RAII;
+monitor \emph{implicit} safety ensures no programmer usage errors.
+Furthermore, RAII mechanisms cannot handle complex synchronization within a monitor, where the monitor lock may not be released on function exit because it is passed to an unblocking thread;
+Aint x = { 0 }, y = { 0 }, z = { 5 }; // no mutex
+++x; ++y; ++z;     // mutex
+x = 2; y = i; z = k;  // mutex
+i = x; j = y; k = z;  // no mutex
+\end{cfa}
+\end{lrbox}
+\begin{lrbox}{\myboxB}
+\begin{java}[aboveskip=0pt,belowskip=0pt]
+class Aint {
+    private int cnt;
+    public Aint( int init ) { cnt = init; }
+    `synchronized` public int inc() { return ++cnt; }
+    `synchronized` public void set( int rhs ) {cnt=rhs;}
+    public int get() { return cnt; }
+}
+int i = 0, j = 0, k = 5;
+Aint x=new Aint(0), y=new Aint(0), z=new Aint(5);
+x.inc(); y.inc(); z.inc();
+x.set( 2 ); y.set( i ); z.set( k );
+i = x.get(); j = y.get(); k = z.get();
+\end{java}
+\end{lrbox}
+\subfloat[\CFA]{\label{f:AtomicCounterCFA}\usebox\myboxA}
+\hspace{3pt}
+\vrule
+\hspace{3pt}
+\subfloat[Java]{\label{f:AtomicCounterJava}\usebox\myboxB}
+\caption{Atomic counter}
+\label{f:AtomicCounter}
+\end{figure}
+Like Java, \CFA monitors have \newterm{multi-acquire} semantics so the thread in the monitor may acquire it multiple times without deadlock, allowing recursion and calling other interface functions.
+% \begin{cfa}
+% monitor M { ... } m;
+% void foo( M & mutex m ) { ... } $\C{// acquire mutual exclusion}$
+% void bar( M & mutex m ) { $\C{// acquire mutual exclusion}$
+%       ... `bar( m );` ... `foo( m );` ... $\C{// reacquire mutual exclusion}$
+% }
+% \end{cfa}
+\CFA monitors also ensure the monitor lock is released regardless of how an acquiring function ends, normal or exceptional, and returning a shared variable is safe via copying before the lock is released.
+Similar safety is offered by \emph{explicit} opt-in disciplines like \CC RAII versus the monitor \emph{implicit} language-enforced safety guarantee ensuring no programmer usage errors.
+However, RAII mechanisms cannot handle complex synchronization within a monitor, where the monitor lock may not be released on function exit because it is passed to an unblocking thread;
 RAII is purely a mutual-exclusion mechanism (see Section~\ref{s:Scheduling}).
+\subsection{Monitor Implementation}
+Both Java and \CFA use a keyword @mutex@/\lstinline[language=java]|synchronized| to designate functions that implicitly acquire/release the monitor lock on call/return providing mutual exclusion to the shared data.
+Non-designated functions provide no mutual exclusion for read-only access or as an interface to a multi-step protocol requiring several steps of acquiring and releasing the monitor.
+Monitor objects can be passed through multiple helper functions without acquiring mutual exclusion, until a designated function associated with the object is called.
+\CFA designated functions are marked by an explicit parameter-only pointer/reference qualifier @mutex@ (discussed further in Section~\ref{s:MutexAcquisition}).
+Whereas, Java designated members are marked with \lstinline[language=java]|synchronized| that applies to the implicit reference parameter @this@.
+In the example, the increment and setter operations need mutual exclusion while the read-only getter operation can be nonmutex if reading the implementation is atomic.
+\subsection{Monitor implementation}
 For the same design reasons, \CFA provides a custom @monitor@ type and a @trait@ to enforce and restrict the monitor-interface functions.
 …
 \end{tabular}
 \end{cquote}
+The @dtype@ property prevents \emph{implicit} copy operations and the @is_monitor@ trait provides no \emph{explicit} copy operations, so monitors must be passed by reference (pointer).
+% Copying a lock is insecure because it is possible to copy an open lock and then use the open copy when the original lock is closed to simultaneously access the shared data.
+% Copying a monitor is secure because both the lock and shared data are copies, but copying the shared data is meaningless because it no longer represents a unique entity.
+Similarly, the function definitions ensure there is a mechanism to get (read) the monitor descriptor from its handle, and a special destructor to prevent deallocation if a thread is using the shared data.
+The @dtype@ property prevents \emph{implicit} copy operations and the @is_monitor@ trait provides no \emph{explicit} copy operations, so monitors must be passed by reference or pointer.
+Similarly, the function definitions ensure there is a mechanism to read the monitor descriptor from its handle, and a special destructor to prevent deallocation if a thread is using the shared data.
 The custom monitor type also inserts any locks needed to implement the mutual exclusion semantics.
+\subsection{Mutex Acquisition}
+\CFA relies heavily on traits as an abstraction mechanism, so the @mutex@ qualifier prevents coincidental matching of a monitor trait with a type that is not a monitor, similar to coincidental inheritance where a shape and playing card can both be drawable.
+\subsection{Mutex acquisition}
 \label{s:MutexAcquisition}
+While the monitor lock provides mutual exclusion for shared data, there are implementation options for when and where the locking/unlocking occurs.
+(Much of this discussion also applies to basic locks.)
+For example, a monitor may be passed through multiple helper functions before it is necessary to acquire the monitor's mutual exclusion.
+The benefit of mandatory monitor qualifiers is self-documentation, but requiring both @mutex@ and \lstinline[morekeywords=nomutex]@nomutex@ for all monitor parameters is redundant.
+Instead, the semantics has one qualifier as the default and the other required.
+For example, make the safe @mutex@ qualifier the default because assuming \lstinline[morekeywords=nomutex]@nomutex@ may cause subtle errors.
+Alternatively, make the unsafe \lstinline[morekeywords=nomutex]@nomutex@ qualifier the default because it is the \emph{normal} parameter semantics while @mutex@ parameters are rare.
+Providing a default qualifier implies knowing whether a parameter is a monitor.
+Since \CFA relies heavily on traits as an abstraction mechanism, types can coincidentally match the monitor trait but not be a monitor, similar to inheritance where a shape and playing card can both be drawable.
+For this reason, \CFA requires programmers to identify the kind of parameter with the @mutex@ keyword and uses no keyword to mean \lstinline[morekeywords=nomutex]@nomutex@.
+The next semantic decision is establishing which parameter \emph{types} may be qualified with @mutex@.
+The following has monitor parameter types that are composed of multiple objects.
+\begin{cfa}
+monitor M { ... }
+For object-oriented programming languages, the mutex property applies to one object, the implicit pointer/reference to the monitor type.
+Because \CFA uses a pointer qualifier, other possibilities exist, \eg:
+\begin{cfa}
+monitor M { ... };
 int f1( M & mutex m ); $\C{// single parameter object}$
 int f2( M * mutex m ); $\C{// single or multiple parameter object}$
 …
 int f4( stack( M * ) & mutex m ); $\C{// multiple parameters object}$
 \end{cfa}
+Function @f1@ has a single parameter object, while @f2@'s indirection could be a single or multi-element array, where static array size is often unknown in C.
+Function @f3@ has a multiple object matrix, and @f4@ a multiple object data structure.
+As shown shortly, multiple object acquisition is possible, but the number of objects must be statically known.
+Therefore, \CFA only acquires one monitor per parameter with at most one level of indirection, excluding pointers as it is impossible to statically determine the size.
+For object-oriented monitors, \eg Java, calling a mutex member \emph{implicitly} acquires mutual exclusion of the receiver object, @`rec`.foo(...)@.
+\CFA has no receiver, and hence, the explicit @mutex@ qualifier is used to specify which objects acquire mutual exclusion.
+A positive consequence of this design decision is the ability to support multi-monitor functions,\footnote{
+Function @f1@ has a single object parameter, while functions @f2@ to @f4@ can be a single or multi-element parameter with statically unknown size.
+Because of the statically unknown size, \CFA only supports a single reference @mutex@ parameter, @f1@.
+The \CFA @mutex@ qualifier does allow the ability to support multimonitor functions,\footnote{
 While object-oriented monitors can be extended with a mutex qualifier for multiple-monitor members, no prior example of this feature could be found.}
 called \newterm{bulk acquire}.
 \CFA guarantees acquisition order is consistent across calls to @mutex@ functions using the same monitors as arguments, so acquiring multiple monitors is safe from deadlock.
+where the number of acquisitions is statically known, called \newterm{bulk acquire}.
+\CFA guarantees bulk acquisition order is consistent across calls to @mutex@ functions using the same monitors as arguments, so acquiring multiple monitors in a bulk acquire is safe from deadlock.
 Figure~\ref{f:BankTransfer} shows a trivial solution to the bank transfer problem~\cite{BankTransfer}, where two resources must be locked simultaneously, using \CFA monitors with implicit locking and \CC with explicit locking.
 A \CFA programmer only has to manage when to acquire mutual exclusion;
 …
 void transfer( BankAccount & `mutex` my,
         BankAccount & `mutex` your, int me2you ) {
+        // bulk acquire
         deposit( my, -me2you ); // debit
         deposit( your, me2you ); // credit
 …
 void transfer( BankAccount & my,
                         BankAccount & your, int me2you ) {
         `scoped_lock lock( my.m, your.m );`
+        `scoped_lock lock( my.m, your.m );` // bulk acquire
         deposit( my, -me2you ); // debit
         deposit( your, me2you ); // credit
 …
 \end{figure}
 Users can still force the acquiring order by using @mutex@/\lstinline[morekeywords=nomutex]@nomutex@.
+Users can still force the acquiring order by using or not using @mutex@.
 \begin{cfa}
 void foo( M & mutex m1, M & mutex m2 ); $\C{// acquire m1 and m2}$
 void bar( M & mutex m1, M & /* nomutex */ m2 ) { $\C{// acquire m1}$
+void bar( M & mutex m1, M & m2 ) { $\C{// only acquire m1}$
         ... foo( m1, m2 ); ... $\C{// acquire m2}$
+}
 void baz( M & /* nomutex */ m1, M & mutex m2 ) { $\C{// acquire m2}$
+void baz( M & m1, M & mutex m2 ) { $\C{// only acquire m2}$
         ... foo( m1, m2 ); ... $\C{// acquire m1}$
+}
 …
 % There are many aspects of scheduling in a concurrency system, all related to resource utilization by waiting threads, \ie which thread gets the resource next.
 % Different forms of scheduling include access to processors by threads (see Section~\ref{s:RuntimeStructureCluster}), another is access to a shared resource by a lock or monitor.
+This section discusses monitor scheduling for waiting threads eligible for entry, \ie which thread gets the shared resource next. (See Section~\ref{s:RuntimeStructureCluster} for scheduling threads on virtual processors.)
+While monitor mutual-exclusion provides safe access to shared data, the monitor data may indicate that a thread accessing it cannot proceed, \eg a bounded buffer may be full/empty so producer/consumer threads must block.
+Leaving the monitor and trying again (busy waiting) is impractical for high-level programming.
+Monitors eliminate busy waiting by providing synchronization to schedule threads needing access to the shared data, where threads block versus spinning.
+This section discusses scheduling for waiting threads eligible for monitor entry~\cite{Buhr95b}, \ie which user thread gets the shared resource next.
+(See Section~\ref{s:RuntimeStructureCluster} for scheduling kernel threads on virtual processors.)
+While monitor mutual-exclusion provides safe access to its shared data, the data may indicate a thread cannot proceed, \eg a bounded buffer may be full/\-empty so producer/consumer threads must block.
+Leaving the monitor and retrying (busy waiting) is impractical for high-level programming.
+Monitors eliminate busy waiting by providing synchronization within the monitor critical-section to schedule threads needing access to the shared data, where threads block versus spin.
 Synchronization is generally achieved with internal~\cite{Hoare74} or external~\cite[\S~2.9.2]{uC++} scheduling.
+\newterm{Internal scheduling} is characterized by each thread entering the monitor and making an individual decision about proceeding or blocking, while \newterm{external scheduling} is characterized by an entering thread making a decision about proceeding for itself and on behalf of other threads attempting entry.
+Finally, \CFA monitors do not allow calling threads to barge ahead of signalled threads, which simplifies synchronization among threads in the monitor and increases correctness.
+If barging is allowed, synchronization between a signaller and signallee is difficult, often requiring additional flags and multiple unblock/block cycles.
+In fact, signals-as-hints is completely opposite from that proposed by Hoare in the seminal paper on monitors~\cite[p.~550]{Hoare74}.
+\newterm{Internal} largely schedules threads located \emph{inside} the monitor and is accomplished using condition variables with signal and wait.
+\newterm{External} largely schedules threads located \emph{outside} the monitor and is accomplished with the @waitfor@ statement.
+Note, internal scheduling has a small amount of external scheduling and vice versa, so the naming denotes where the majority of the blocked threads reside (inside or outside) for scheduling.
+For complex scheduling, the approaches can be combined, so there are threads waiting inside and outside.
+\CFA monitors do not allow calling threads to barge ahead of signaled threads via barging prevention, which simplifies synchronization among threads in the monitor and increases correctness.
+A direct consequence of this semantics is that unblocked waiting threads are not required to recheck the waiting condition, \ie waits are not in a starvation-prone busy-loop as required by the signals-as-hints style with barging.
+Preventing barging comes directly from Hoare's semantics in the seminal paper on monitors~\cite[p.~550]{Hoare74}.
 % \begin{cquote}
 % However, we decree that a signal operation be followed immediately by resumption of a waiting program, without possibility of an intervening procedure call from yet a third program.
 % It is only in this way that a waiting program has an absolute guarantee that it can acquire the resource just released by the signalling program without any danger that a third program will interpose a monitor entry and seize the resource instead.~\cite[p.~550]{Hoare74}
+% It is only in this way that a waiting program has an absolute guarantee that it can acquire the resource just released by the signaling program without any danger that a third program will interpose a monitor entry and seize the resource instead.~\cite[p.~550]{Hoare74}
 % \end{cquote}
+Furthermore, \CFA concurrency has no spurious wakeup~\cite[\S~9]{Buhr05a}, which eliminates an implicit form of self barging.
+Hence, a \CFA @wait@ statement is not enclosed in a @while@ loop retesting a blocking predicate, which can cause thread starvation due to barging.
+Figure~\ref{f:MonitorScheduling} shows general internal/external scheduling (for the bounded-buffer example in Figure~\ref{f:InternalExternalScheduling}).
+External calling threads block on the calling queue, if the monitor is occupied, otherwise they enter in FIFO order.
+Internal threads block on condition queues via @wait@ and reenter from the condition in FIFO order.
+Alternatively, internal threads block on urgent from the @signal_block@ or @waitfor@, and reenter implicitly when the monitor becomes empty, \ie, the thread in the monitor exits or waits.
+There are three signalling mechanisms to unblock waiting threads to enter the monitor.
+Note, signalling cannot have the signaller and signalled thread in the monitor simultaneously because of the mutual exclusion, so either the signaller or signallee can proceed.
+For internal scheduling, threads are unblocked from condition queues using @signal@, where the signallee is moved to urgent and the signaller continues (solid line).
+Multiple signals move multiple signallees to urgent until the condition is empty.
+When the signaller exits or waits, a thread blocked on urgent is processed before calling threads to prevent barging.
+(Java conceptually moves the signalled thread to the calling queue, and hence, allows barging.)
+The alternative unblock is in the opposite order using @signal_block@, where the signaller is moved to urgent and the signallee continues (dashed line), and is implicitly unblocked from urgent when the signallee exits or waits.
+For external scheduling, the condition queues are not used;
+instead threads are unblocked directly from the calling queue using @waitfor@ based on function names requesting mutual exclusion.
+(The linear search through the calling queue to locate a particular call can be reduced to $O(1)$.)
+The @waitfor@ has the same semantics as @signal_block@, where the signalled thread executes before the signaller, which waits on urgent.
+Executing multiple @waitfor@s from different signalled functions causes the calling threads to move to urgent.
+External scheduling requires urgent to be a stack, because the signaller expects to execute immediately after the specified monitor call has exited or waited.
+Internal scheduling behaves the same for an urgent stack or queue, except for multiple signalling, where the threads unblock from urgent in reverse order from signalling.
+If the restart order is important, multiple signalling by a signalling thread can be transformed into daisy-chain signalling among threads, where each thread signals the next thread.
+We tried both a stack for @waitfor@ and queue for signalling, but that resulted in complex semantics about which thread enters next.
+Hence, \CFA uses a single urgent stack to correctly handle @waitfor@ and adequately support both forms of signalling.
+Furthermore, \CFA concurrency has no spurious wakeup~\cite[\S~9]{Buhr05a}, which eliminates an implicit self barging.
+Monitor mutual-exclusion means signaling cannot have the signaller and signaled thread in the monitor simultaneously, so only the signaller or signallee can proceed and the other waits on an implicit urgent list~\cite[p.~551]{Hoare74}.
+Figure~\ref{f:MonitorScheduling} shows internal and external scheduling for the bounded-buffer examples in Figure~\ref{f:GenericBoundedBuffer}.
+For internal scheduling in Figure~\ref{f:BBInt}, the @signal@ moves the signallee, front thread of the specified condition queue, to the urgent list (see Figure~\ref{f:MonitorScheduling}) and the signaller continues (solid line).
+Multiple signals move multiple signallees to urgent until the condition queue is empty.
+When the signaller exits or waits, a thread is implicitly unblocked from urgent, if available, before unblocking a calling thread to prevent barging.
+(Java conceptually moves the signaled thread to the calling queue, and hence, allows barging.)
+Signal is used when the signaller is providing the cooperation needed by the signallee, \eg creating an empty slot in a buffer for a producer, and the signaller immediately exits the monitor to run concurrently consuming the buffer element, and passes control of the monitor to the signaled thread, which can immediately take advantage of the state change.
+Specifically, the @wait@ function atomically blocks the calling thread and implicitly releases the monitor lock(s) for all monitors in the function's parameter list.
+Signalling is unconditional because signaling an empty condition queue does nothing.
+It is common to declare condition queues as monitor fields to prevent shared access, hence no locking is required for access as the queues are protected by the monitor lock.
+In \CFA, a condition queue can be created and stored independently.
 \begin{figure}
 …
 \end{figure}
-Figure~\ref{f:BBInt} shows a \CFA generic bounded-buffer with internal scheduling, where producers/consumers enter the monitor, detect the buffer is full/empty, and block on an appropriate condition variable, @full@/@empty@.
-The @wait@ function atomically blocks the calling thread and implicitly releases the monitor lock(s) for all monitors in the function's parameter list.
-The appropriate condition variable is signalled to unblock an opposite kind of thread after an element is inserted/removed from the buffer.
-Signalling is unconditional, because signalling an empty condition variable does nothing.
-It is common to declare condition variables as monitor fields to prevent shared access, hence no locking is required for access as the conditions are protected by the monitor lock.
-In \CFA, a condition variable can be created/stored independently.
-% To still prevent expensive locking on access, a condition variable is tied to a \emph{group} of monitors on first use, called \newterm{branding}, resulting in a low-cost boolean test to detect sharing from other monitors.
-% Signalling semantics cannot have the signaller and signalled thread in the monitor simultaneously, which means:
-% \begin{enumerate}
-% \item
-% The signalling thread returns immediately and the signalled thread continues.
-% \item
-% The signalling thread continues and the signalled thread is marked for urgent unblocking at the next scheduling point (exit/wait).
-% \item
-% The signalling thread blocks but is marked for urgent unblocking at the next scheduling point and the signalled thread continues.
-% \end{enumerate}
-% The first approach is too restrictive, as it precludes solving a reasonable class of problems, \eg dating service (see Figure~\ref{f:DatingService}).
-% \CFA supports the next two semantics as both are useful.
 \begin{figure}
 \centering
 …
                 T elements[10];
         };
         void ?{}( Buffer(T) & buffer ) with(buffer) {
+        void ?{}( Buffer(T) & buf ) with(buf) {
                 front = back = count = 0;
+        }
+        void insert( Buffer(T) & mutex buffer, T elem )
                                 with(buffer) {
                 if ( count == 10 ) `wait( empty )`;
                 // insert elem into buffer
+        void insert(Buffer(T) & mutex buf, T elm) with(buf){
+                if ( count == 10 ) `wait( empty )`; // full ?
+                // insert elm into buf
                 `signal( full )`;
+        }
         T remove( Buffer(T) & mutex buffer ) with(buffer) {
                 if ( count == 0 ) `wait( full )`;
                 // remove elem from buffer
+        T remove( Buffer(T) & mutex buf ) with(buf) {
+                if ( count == 0 ) `wait( full )`; // empty ?
+                // remove elm from buf
                 `signal( empty )`;
                 return elem;
+                return elm;
+        }
+}
 \end{cfa}
 \end{lrbox}
-% \newbox\myboxB
-% \begin{lrbox}{\myboxB}
-% \begin{cfa}[aboveskip=0pt,belowskip=0pt]
-% forall( otype T ) { // distribute forall
-%       monitor Buffer {
+%
-%               int front, back, count;
-%               T elements[10];
-%       };
-%       void ?{}( Buffer(T) & buffer ) with(buffer) {
-%               [front, back, count] = 0;
-%       }
-%       T remove( Buffer(T) & mutex buffer ); // forward
-%       void insert( Buffer(T) & mutex buffer, T elem )
-%                               with(buffer) {
-%               if ( count == 10 ) `waitfor( remove, buffer )`;
-%               // insert elem into buffer
+%
-%       }
-%       T remove( Buffer(T) & mutex buffer ) with(buffer) {
-%               if ( count == 0 ) `waitfor( insert, buffer )`;
-%               // remove elem from buffer
+%
-%               return elem;
-%       }
-% }
-% \end{cfa}
-% \end{lrbox}
 \newbox\myboxB
 \begin{lrbox}{\myboxB}
 \begin{cfa}[aboveskip=0pt,belowskip=0pt]
+forall( otype T ) { // distribute forall
+        monitor Buffer {
+                int front, back, count;
+                T elements[10];
+        };
+        void ?{}( Buffer(T) & buf ) with(buf) {
+                front = back = count = 0;
+        }
+        T remove( Buffer(T) & mutex buf ); // forward
+        void insert(Buffer(T) & mutex buf, T elm) with(buf){
+                if ( count == 10 ) `waitfor( remove : buf )`;
+                // insert elm into buf
+        }
+        T remove( Buffer(T) & mutex buf ) with(buf) {
+                if ( count == 0 ) `waitfor( insert : buf )`;
+                // remove elm from buf
+                return elm;
+        }
+}
+\end{cfa}
+\end{lrbox}
+\subfloat[Internal scheduling]{\label{f:BBInt}\usebox\myboxA}
+\hspace{1pt}
+\vrule
+\hspace{3pt}
+\subfloat[External scheduling]{\label{f:BBExt}\usebox\myboxB}
+\caption{Generic bounded buffer}
+\label{f:GenericBoundedBuffer}
+\end{figure}
+The @signal_block@ provides the opposite unblocking order, where the signaller is moved to urgent and the signallee continues and a thread is implicitly unblocked from urgent when the signallee exits or waits (dashed line)~\cite[p.~551]{Hoare74}.
+Signal block is used when the signallee is providing the cooperation needed by the signaller, \eg if the buffer is removed and a producer hands off an item to a consumer as in Figure~\ref{f:DatingSignalBlock}, so the signaller must wait until the signallee unblocks, provides the cooperation, exits the monitor to run concurrently, and passes control of the monitor to the signaller, which can immediately take advantage of the state change.
+Using @signal@ or @signal_block@ can be a dynamic decision based on whether the thread providing the cooperation arrives before or after the thread needing the cooperation.
+For external scheduling in Figure~\ref{f:BBExt}, the internal scheduling is replaced, eliminating condition queues and @signal@/@wait@ (cases where it cannot are discussed shortly), and has existed in the programming language Ada for almost 40 years with variants in other languages~\cite{SR,ConcurrentC++,uC++}.
+While prior languages use external scheduling solely for thread interaction, \CFA generalizes it to both monitors and threads.
+External scheduling allows waiting for events from other threads while restricting unrelated events, that would otherwise have to wait on condition queues in the monitor.
+Scheduling is controlled by the @waitfor@ statement, which atomically blocks the calling thread, releases the monitor lock, and restricts the function calls that can next acquire mutual exclusion.
+Specifically, a thread calling the monitor is unblocked directly from the calling queue based on function names that can fulfill the cooperation required by the signaller.
+(The linear search through the calling queue to locate a particular call can be reduced to $O(1)$.)
+Hence, the @waitfor@ has the same semantics as @signal_block@, where the signallee thread from the calling queue executes before the signaller, which waits on urgent.
+Now when a producer/consumer detects a full/empty buffer, the necessary cooperation for continuation is specified by indicating the next function call that can occur.
+For example, a producer detecting a full buffer must have cooperation from a consumer to remove an item so function @remove@ is accepted, which prevents producers from entering the monitor, and after a consumer calls @remove@, the producer waiting on urgent is \emph{implicitly} unblocked because it can now continue its insert operation.
+Hence, this mechanism is done in terms of control flow, next call, versus in terms of data, channels, as in Go and Rust @select@.
+While both mechanisms have strengths and weaknesses, \CFA uses the control-flow mechanism to be consistent with other language features.
+Figure~\ref{f:ReadersWriterLock} shows internal and external scheduling for a readers/writer lock with no barging, where threads are serviced in FIFO order to eliminate staleness and freshness among the reader/writer threads.
+For internal scheduling in Figure~\ref{f:RWInt}, the readers and writers wait on the same condition queue in FIFO order, making it impossible to tell if a waiting thread is a reader or writer.
+To recover the kind of thread, a \CFA condition can store user data in the node for a blocking thread at the @wait@, \ie whether the thread is a @READER@ or @WRITER@.
+An unblocked reader thread checks if the thread at the front of the queue is a reader and, if so, unblocks it, \ie the readers daisy-chain signal the next group of readers demarcated by the next writer or end of the queue.
+For external scheduling in Figure~\ref{f:RWExt}, a waiting reader checks if a writer is using the resource, and if so, restricts further calls until the writer exits by calling @EndWrite@.
+The writer does a similar action for each reader or writer using the resource.
+Note, no new calls to @StartRead@/@StartWrite@ may occur when waiting for the call to @EndRead@/@EndWrite@.
+\begin{figure}
+\centering
+\newbox\myboxA
+\begin{lrbox}{\myboxA}
+\begin{cfa}[aboveskip=0pt,belowskip=0pt]
+enum RW { READER, WRITER };
 monitor ReadersWriter {
+        int rcnt, wcnt; // readers/writer using resource
+        int rcnt, wcnt; // readers/writer using resource
+        `condition RWers;`
 };
 void ?{}( ReadersWriter & rw ) with(rw) {
 …
 void EndRead( ReadersWriter & mutex rw ) with(rw) {
         rcnt -= 1;
+        if ( rcnt == 0 ) `signal( RWers )`;
+}
 void EndWrite( ReadersWriter & mutex rw ) with(rw) {
         wcnt = 0;
+        `signal( RWers );`
+}
 void StartRead( ReadersWriter & mutex rw ) with(rw) {
+        if ( wcnt > 0 ) `waitfor( EndWrite, rw );`
+        if ( wcnt !=0 || ! empty( RWers ) )
+                `wait( RWers, READER )`;
         rcnt += 1;
+        if ( ! empty(RWers) && `front(RWers) == READER` )
+                `signal( RWers )`;  // daisy-chain signaling
+}
 void StartWrite( ReadersWriter & mutex rw ) with(rw) {
         if ( wcnt > 0 ) `waitfor( EndWrite, rw );`
+        else while ( rcnt > 0 ) `waitfor( EndRead, rw );`
+        if ( wcnt != 0 || rcnt != 0 ) `wait( RWers, WRITER )`;
         wcnt = 1;
+}
 \end{cfa}
 \end{lrbox}
+\subfloat[Generic bounded buffer, internal scheduling]{\label{f:BBInt}\usebox\myboxA}
+\hspace{3pt}
+\newbox\myboxB
+\begin{lrbox}{\myboxB}
+\begin{cfa}[aboveskip=0pt,belowskip=0pt]
+monitor ReadersWriter {
+        int rcnt, wcnt; // readers/writer using resource
+};
+void ?{}( ReadersWriter & rw ) with(rw) {
+        rcnt = wcnt = 0;
+}
+void EndRead( ReadersWriter & mutex rw ) with(rw) {
+        rcnt -= 1;
+}
+void EndWrite( ReadersWriter & mutex rw ) with(rw) {
+        wcnt = 0;
+}
+void StartRead( ReadersWriter & mutex rw ) with(rw) {
+        if ( wcnt > 0 ) `waitfor( EndWrite : rw );`
+        rcnt += 1;
+}
+void StartWrite( ReadersWriter & mutex rw ) with(rw) {
+        if ( wcnt > 0 ) `waitfor( EndWrite : rw );`
+        else while ( rcnt > 0 ) `waitfor( EndRead : rw );`
+        wcnt = 1;
+}
+\end{cfa}
+\end{lrbox}
+\subfloat[Internal scheduling]{\label{f:RWInt}\usebox\myboxA}
+\hspace{1pt}
 \vrule
 \hspace{3pt}
 \subfloat[Readers / writer lock, external scheduling]{\label{f:RWExt}\usebox\myboxB}
 \caption{Internal / external scheduling}
 \label{f:InternalExternalScheduling}
+\subfloat[External scheduling]{\label{f:RWExt}\usebox\myboxB}
+\caption{Readers / writer lock}
+\label{f:ReadersWriterLock}
 \end{figure}
+Figure~\ref{f:BBInt} can be transformed into external scheduling by removing the condition variables and signals/waits, and adding the following lines at the locations of the current @wait@s in @insert@/@remove@, respectively.
+\begin{cfa}[aboveskip=2pt,belowskip=1pt]
+if ( count == 10 ) `waitfor( remove, buffer )`;       |      if ( count == 0 ) `waitfor( insert, buffer )`;
+\end{cfa}
+Here, a producer/consumer detects a full/\-empty buffer and prevents more producers/consumers from entering the monitor until there is a free/empty slot in the buffer.
+External scheduling is controlled by the @waitfor@ statement, which atomically blocks the calling thread, releases the monitor lock, and restricts the function calls that can next acquire mutual exclusion.
+If the buffer is full, only calls to @remove@ can acquire the buffer, and if the buffer is empty, only calls to @insert@ can acquire the buffer.
+Threads calling excluded functions block outside of (external to) the monitor on the calling queue, versus blocking on condition queues inside of (internal to) the monitor.
+Figure~\ref{f:RWExt} shows a readers/writer lock written using external scheduling, where a waiting reader detects a writer using the resource and restricts further calls until the writer exits by calling @EndWrite@.
+The writer does a similar action for each reader or writer using the resource.
+Note, no new calls to @StartRead@/@StartWrite@ may occur when waiting for the call to @EndRead@/@EndWrite@.
+External scheduling allows waiting for events from other threads while restricting unrelated events, that would otherwise have to wait on conditions in the monitor.
+The mechanism can be done in terms of control flow, \eg Ada @accept@ or \uC @_Accept@, or in terms of data, \eg Go @select@ on channels.
+While both mechanisms have strengths and weaknesses, this project uses the control-flow mechanism to be consistent with other language features.
+% Two challenges specific to \CFA for external scheduling are loose object-definitions (see Section~\ref{s:LooseObjectDefinitions}) and multiple-monitor functions (see Section~\ref{s:Multi-MonitorScheduling}).
+Figure~\ref{f:DatingService} shows a dating service demonstrating non-blocking and blocking signalling.
+The dating service matches girl and boy threads with matching compatibility codes so they can exchange phone numbers.
+A thread blocks until an appropriate partner arrives.
+The complexity is exchanging phone numbers in the monitor because of the mutual-exclusion property.
+For signal scheduling, the @exchange@ condition is necessary to block the thread finding the match, while the matcher unblocks to take the opposite number, post its phone number, and unblock the partner.
+For signal-block scheduling, the implicit urgent-queue replaces the explicit @exchange@-condition and @signal_block@ puts the finding thread on the urgent condition and unblocks the matcher.
+The dating service is an example of a monitor that cannot be written using external scheduling because it requires knowledge of calling parameters to make scheduling decisions, and parameters of waiting threads are unavailable;
+as well, an arriving thread may not find a partner and must wait, which requires a condition variable, and condition variables imply internal scheduling.
+Furthermore, barging corrupts the dating service during an exchange because a barger may also match and change the phone numbers, invalidating the previous exchange phone number.
+Putting loops around the @wait@s does not correct the problem;
+the simple solution must be restructured to account for barging.
+Finally, external scheduling requires urgent to be a stack, because the signaller expects to execute immediately after the specified monitor call has exited or waited.
+Internal scheduling performing multiple signaling results in unblocking from urgent in the reverse order from signaling.
+It is rare for the unblocking order to be important as an unblocked thread can be time-sliced immediately after leaving the monitor.
+If the unblocking order is important, multiple signaling can be restructured into daisy-chain signaling, where each thread signals the next thread.
+Hence, \CFA uses a single urgent stack to correctly handle @waitfor@ and adequately support both forms of signaling.
+(Advanced @waitfor@ features are discussed in Section~\ref{s:ExtendedWaitfor}.)
 \begin{figure}
 …
 };
 int girl( DS & mutex ds, int phNo, int ccode ) {
         if ( is_empty( Boys[ccode] ) ) {
+        if ( empty( Boys[ccode] ) ) {
                 wait( Girls[ccode] );
                 GirlPhNo = phNo;
 …
 };
 int girl( DS & mutex ds, int phNo, int ccode ) {
         if ( is_empty( Boys[ccode] ) ) { // no compatible
+        if ( empty( Boys[ccode] ) ) { // no compatible
                 wait( Girls[ccode] ); // wait for boy
                 GirlPhNo = phNo; // make phone number available
 …
 \qquad
 \subfloat[\lstinline@signal_block@]{\label{f:DatingSignalBlock}\usebox\myboxB}
 \caption{Dating service}
 \label{f:DatingService}
+\caption{Dating service Monitor}
+\label{f:DatingServiceMonitor}
 \end{figure}
+In summation, for internal scheduling, non-blocking signalling (as in the producer/consumer example) is used when the signaller is providing the cooperation for a waiting thread;
+the signaller enters the monitor and changes state, detects a waiting thread that can use the state, performs a non-blocking signal on the condition queue for the waiting thread, and exits the monitor to run concurrently.
+The waiter unblocks next from the urgent queue, uses/takes the state, and exits the monitor.
+Blocking signal is the reverse, where the waiter is providing the cooperation for the signalling thread;
+the signaller enters the monitor, detects a waiting thread providing the necessary state, performs a blocking signal to place it on the urgent queue and unblock the waiter.
+The waiter changes state and exits the monitor, and the signaller unblocks next from the urgent queue to use/take the state.
+Both internal and external scheduling extend to multiple monitors in a natural way.
+Figure~\ref{f:DatingServiceMonitor} shows a dating service demonstrating nonblocking and blocking signaling.
+The dating service matches girl and boy threads with matching compatibility codes so they can exchange phone numbers.
+A thread blocks until an appropriate partner arrives.
+The complexity is exchanging phone numbers in the monitor because of the mutual-exclusion property.
+For signal scheduling, the @exchange@ condition is necessary to block the thread finding the match, while the matcher unblocks to take the opposite number, post its phone number, and unblock the partner.
+For signal-block scheduling, the implicit urgent-queue replaces the explicit @exchange@-condition and @signal_block@ puts the finding thread on the urgent stack and unblocks the matcher.
+Note, barging corrupts the dating service during an exchange because a barger may also match and change the phone numbers, invalidating the previous exchange phone number.
+This situation shows rechecking the waiting condition and waiting again (signals-as-hints) fails, requiring significant restructuring to account for barging.
+Given external and internal scheduling, what guidelines can a programmer use to select between them?
+In general, external scheduling is easier to understand and code because only the next logical action (mutex function(s)) is stated, and the monitor implicitly handles all the details.
+Therefore, there are no condition variables, and hence, no wait and signal, which reduces coding complexity and synchronization errors.
+If external scheduling is simpler than internal, why not use it all the time?
+Unfortunately, external scheduling cannot be used if: scheduling depends on parameter value(s) or scheduling must block across an unknown series of calls on a condition variable, \ie internal scheduling.
+For example, the dating service cannot be written using external scheduling.
+First, scheduling requires knowledge of calling parameters to make matching decisions and parameters of calling threads are unavailable within the monitor.
+Specifically, a thread within the monitor cannot examine the @ccode@ of threads waiting on the calling queue to determine if there is a matching partner.
+(Similarly, if the bounded buffer or readers/writer are restructured with a single interface function with a parameter denoting producer/consumer or reader/writer, they cannot be solved with external scheduling.)
+Second, a scheduling decision may be delayed across an unknown number of calls when there is no immediate match so the thread in the monitor must block on a condition.
+Specifically, if a thread determines there is no opposite calling thread with the same @ccode@, it must wait an unknown period until a matching thread arrives.
+For complex synchronization, both external and internal scheduling can be used to take advantage of the best properties of each.
+Finally, both internal and external scheduling extend to multiple monitors in a natural way.
 \begin{cquote}
 \begin{tabular}{@{}l@{\hspace{3\parindentlnth}}l@{}}
+\begin{tabular}{@{}l@{\hspace{2\parindentlnth}}l@{}}
 \begin{cfa}
 monitor M { `condition e`; ... };
 …
+&
 \begin{cfa}
 void rtn$\(_1\)$( M & mutex m1, M & mutex m2 );
+void rtn$\(_1\)$( M & mutex m1, M & mutex m2 ); // overload rtn
 void rtn$\(_2\)$( M & mutex m1 );
 void bar( M & mutex m1, M & mutex m2 ) {
         ... waitfor( `rtn` ); ...       // $\LstCommentStyle{waitfor( rtn\(_1\), m1, m2 )}$
         ... waitfor( `rtn, m1` ); ... // $\LstCommentStyle{waitfor( rtn\(_2\), m1 )}$
+        ... waitfor( `rtn`${\color{red}\(_1\)}$ ); ...       // $\LstCommentStyle{waitfor( rtn\(_1\) : m1, m2 )}$
+        ... waitfor( `rtn${\color{red}\(_2\)}$ : m1` ); ...
+}
 \end{cfa}
 …
 \end{cquote}
 For @wait( e )@, the default semantics is to atomically block the signaller and release all acquired mutex parameters, \ie @wait( e, m1, m2 )@.
 To override the implicit multi-monitor wait, specific mutex parameter(s) can be specified, \eg @wait( e, m1 )@.
 Wait cannot statically verifies the released monitors are the acquired mutex-parameters without disallowing separately compiled helper functions calling @wait@.
 While \CC supports bulk locking, @wait@ only accepts a single lock for a condition variable, so bulk locking with condition variables is asymmetric.
+To override the implicit multimonitor wait, specific mutex parameter(s) can be specified, \eg @wait( e, m1 )@.
+Wait cannot statically verify the released monitors are the acquired mutex-parameters without disallowing separately compiled helper functions calling @wait@.
+While \CC supports bulk locking, @wait@ only accepts a single lock for a condition queue, so bulk locking with condition queues is asymmetric.
 Finally, a signaller,
 \begin{cfa}
 …
+}
 \end{cfa}
 must have acquired at least the same locks as the waiting thread signalled from a condition queue to allow the locks to be passed, and hence, prevent barging.
 Similarly, for @waitfor( rtn )@, the default semantics is to atomically block the acceptor and release all acquired mutex parameters, \ie @waitfor( rtn, m1, m2 )@.
 To override the implicit multi-monitor wait, specific mutex parameter(s) can be specified, \eg @waitfor( rtn, m1 )@.
 @waitfor@ does statically verify the monitor types passed are the same as the acquired mutex-parameters of the given function or function pointer, hence the function (pointer) prototype must be accessible.
+must have acquired at least the same locks as the waiting thread signaled from a condition queue to allow the locks to be passed, and hence, prevent barging.
+Similarly, for @waitfor( rtn )@, the default semantics is to atomically block the acceptor and release all acquired mutex parameters, \ie @waitfor( rtn : m1, m2 )@.
+To override the implicit multimonitor wait, specific mutex parameter(s) can be specified, \eg @waitfor( rtn : m1 )@.
+@waitfor@ does statically verify the monitor types passed are the same as the acquired mutex-parameters of the given function or function pointer, hence the prototype must be accessible.
 % When an overloaded function appears in an @waitfor@ statement, calls to any function with that name are accepted.
 % The rationale is that members with the same name should perform a similar function, and therefore, all should be eligible to accept a call.
+% The rationale is that functions with the same name should perform similar actions, and therefore, all should be eligible to accept a call.
 Overloaded functions can be disambiguated using a cast
 \begin{cfa}
 void rtn( M & mutex m );
 `int` rtn( M & mutex m );
 waitfor( (`int` (*)( M & mutex ))rtn, m );
 \end{cfa}
 The ability to release a subset of acquired monitors can result in a \newterm{nested monitor}~\cite{Lister77} deadlock.
+waitfor( (`int` (*)( M & mutex ))rtn : m );
+\end{cfa}
+The ability to release a subset of acquired monitors can result in a \newterm{nested monitor}~\cite{Lister77} deadlock (see Section~\ref{s:MutexAcquisition}).
 \begin{cfa}
 void foo( M & mutex m1, M & mutex m2 ) {
         ... wait( `e, m1` ); ...                                $\C{// release m1, keeping m2 acquired )}$
 void bar( M & mutex m1, M & mutex m2 ) {        $\C{// must acquire m1 and m2 )}$
+        ... wait( `e, m1` ); ...                                $\C{// release m1, keeping m2 acquired}$
+void bar( M & mutex m1, M & mutex m2 ) {        $\C{// must acquire m1 and m2}$
         ... signal( `e` ); ...
 \end{cfa}
+The @wait@ only releases @m1@ so the signalling thread cannot acquire @m1@ and @m2@ to enter @bar@ and @signal@ the condition.
+While deadlock can occur with multiple/nesting acquisition, this is a consequence of locks, and by extension monitors, not being perfectly composable.
+The @wait@ only releases @m1@ so the signaling thread cannot acquire @m1@ and @m2@ to enter @bar@ and @signal@ the condition.
+While deadlock can occur with multiple/nesting acquisition, this is a consequence of locks, and by extension monitor locking, not being perfectly composable.
 \subsection{\texorpdfstring{Extended \protect\lstinline@waitfor@}{Extended waitfor}}
+\label{s:ExtendedWaitfor}
 Figure~\ref{f:ExtendedWaitfor} shows the extended form of the @waitfor@ statement to conditionally accept one of a group of mutex functions, with an optional statement to be performed \emph{after} the mutex function finishes.
 For a @waitfor@ clause to be executed, its @when@ must be true and an outstanding call to its corresponding member(s) must exist.
+For a @waitfor@ clause to be executed, its @when@ must be true and an outstanding call to its corresponding function(s) must exist.
 The \emph{conditional-expression} of a @when@ may call a function, but the function must not block or context switch.
 If there are multiple acceptable mutex calls, selection occurs top-to-bottom (prioritized) among the @waitfor@ clauses, whereas some programming languages with similar mechanisms accept nondeterministically for this case, \eg Go \lstinline[morekeywords=select]@select@.
 If some accept guards are true and there are no outstanding calls to these members, the acceptor is blocked until a call to one of these members is made.
+If there are multiple acceptable mutex calls, selection is prioritized top-to-bottom among the @waitfor@ clauses, whereas some programming languages with similar mechanisms accept nondeterministically for this case, \eg Go \lstinline[morekeywords=select]@select@.
+If some accept guards are true and there are no outstanding calls to these functions, the acceptor is blocked until a call to one of these functions is made.
 If there is a @timeout@ clause, it provides an upper bound on waiting.
 If all the accept guards are false, the statement does nothing, unless there is a terminating @else@ clause with a true guard, which is executed instead.
 Hence, the terminating @else@ clause allows a conditional attempt to accept a call without blocking.
 If both @timeout@ and @else@ clause are present, the @else@ must be conditional, or the @timeout@ is never triggered.
+There is also a traditional future wait queue (not shown) (\eg Microsoft (@WaitForMultipleObjects@)), to wait for a specified number of future elements in the queue.
+% There is also a traditional future wait queue (not shown) (\eg Microsoft @WaitForMultipleObjects@), to wait for a specified number of future elements in the queue.
+Finally, there is a shorthand for specifying multiple functions using the same set of monitors: @waitfor( f, g, h : m1, m2, m3 )@.
 \begin{figure}
 …
 \begin{cfa}
 `when` ( $\emph{conditional-expression}$ )      $\C{// optional guard}$
         waitfor( $\emph{mutex-member-name}$ ) $\emph{statement}$ $\C{// action after call}$
+        waitfor( $\emph{mutex-function-name}$ ) $\emph{statement}$ $\C{// action after call}$
 `or` `when` ( $\emph{conditional-expression}$ ) $\C{// any number of functions}$
         waitfor( $\emph{mutex-member-name}$ ) $\emph{statement}$
+        waitfor( $\emph{mutex-function-name}$ ) $\emph{statement}$
 `or`    ...
 `when` ( $\emph{conditional-expression}$ ) $\C{// optional guard}$
 …
 The left example only accepts @mem1@ if @C1@ is true or only @mem2@ if @C2@ is true.
 The right example accepts either @mem1@ or @mem2@ if @C1@ and @C2@ are true.
+An interesting use of @waitfor@ is accepting the @mutex@ destructor to know when an object is deallocated, \eg assume the bounded buffer is restructred from a monitor to a thread with the following @main@.
+Hence, the @waitfor@ has parallel semantics, accepting any true @when@ clause.
+An interesting use of @waitfor@ is accepting the @mutex@ destructor to know when an object is deallocated, \eg assume the bounded buffer is restructured from a monitor to a thread with the following @main@.
 \begin{cfa}
 void main( Buffer(T) & buffer ) with(buffer) {
         for () {
                 `waitfor( ^?{}, buffer )` break;
                 or when ( count != 20 ) waitfor( insert, buffer ) { ... }
                 or when ( count != 0 ) waitfor( remove, buffer ) { ... }
+                `waitfor( ^?{} : buffer )` break;
+                or when ( count != 20 ) waitfor( insert : buffer ) { ... }
+                or when ( count != 0 ) waitfor( remove : buffer ) { ... }
+        }
         // clean up
 …
 \subsection{Bulk Barging Prevention}
 Figure~\ref{f:BulkBargingPrevention} shows \CFA code where bulk acquire adds complexity to the internal-signalling semantics.
+\subsection{Bulk barging prevention}
+Figure~\ref{f:BulkBargingPrevention} shows \CFA code where bulk acquire adds complexity to the internal-signaling semantics.
 The complexity begins at the end of the inner @mutex@ statement, where the semantics of internal scheduling need to be extended for multiple monitors.
 The problem is that bulk acquire is used in the inner @mutex@ statement where one of the monitors is already acquired.
 When the signalling thread reaches the end of the inner @mutex@ statement, it should transfer ownership of @m1@ and @m2@ to the waiting threads to prevent barging into the outer @mutex@ statement by another thread.
 However, both the signalling and waiting threads W1 and W2 need some subset of monitors @m1@ and @m2@.
+When the signaling thread reaches the end of the inner @mutex@ statement, it should transfer ownership of @m1@ and @m2@ to the waiting threads to prevent barging into the outer @mutex@ statement by another thread.
+However, both the signaling and waiting threads W1 and W2 need some subset of monitors @m1@ and @m2@.
 \begin{cquote}
 condition c: (order 1) W2(@m2@), W1(@m1@,@m2@)\ \ \ or\ \ \ (order 2) W1(@m1@,@m2@), W2(@m2@) \\
 …
 \end{figure}
 One scheduling solution is for the signaller S to keep ownership of all locks until the last lock is ready to be transferred, because this semantics fits most closely to the behaviour of single-monitor scheduling.
 However, this solution is inefficient if W2 waited first and can be immediate passed @m2@ when released, while S retains @m1@ until completion of the outer mutex statement.
+One scheduling solution is for the signaller S to keep ownership of all locks until the last lock is ready to be transferred, because this semantics fits most closely to the behavior of single-monitor scheduling.
+However, this solution is inefficient if W2 waited first and can be immediately passed @m2@ when released, while S retains @m1@ until completion of the outer mutex statement.
 If W1 waited first, the signaller must retain @m1@ and @m2@ until completion of the outer mutex statement and then pass both to W1.
 % Furthermore, there is an execution sequence where the signaller always finds waiter W2, and hence, waiter W1 starves.
 To support this efficient semantics (and prevent barging), the implementation maintains a list of monitors acquired for each blocked thread.
 When a signaller exits or waits in a monitor function/statement, the front waiter on urgent is unblocked if all its monitors are released.
 Implementing a fast subset check for the necessary released monitors is important.
+To support these efficient semantics and prevent barging, the implementation maintains a list of monitors acquired for each blocked thread.
+When a signaller exits or waits in a mutex function or statement, the front waiter on urgent is unblocked if all its monitors are released.
+Implementing a fast subset check for the necessarily released monitors is important and discussed in the following sections.
 % The benefit is encapsulating complexity into only two actions: passing monitors to the next owner when they should be released and conditionally waking threads if all conditions are met.
+\subsection{Loose Object Definitions}
+\label{s:LooseObjectDefinitions}
+In an object-oriented programming language, a class includes an exhaustive list of operations.
+A new class can add members via static inheritance but the subclass still has an exhaustive list of operations.
+(Dynamic member adding, \eg JavaScript~\cite{JavaScript}, is not considered.)
+In the object-oriented scenario, the type and all its operators are always present at compilation (even separate compilation), so it is possible to number the operations in a bit mask and use an $O(1)$ compare with a similar bit mask created for the operations specified in a @waitfor@.
+However, in \CFA, monitor functions can be statically added/removed in translation units, making a fast subset check difficult.
+\begin{cfa}
+        monitor M { ... }; // common type, included in .h file
+translation unit 1
+        void `f`( M & mutex m );
+        void g( M & mutex m ) { waitfor( `f`, m ); }
+translation unit 2
+        void `f`( M & mutex m ); $\C{// replacing f and g for type M in this translation unit}$
+        void `g`( M & mutex m );
+        void h( M & mutex m ) { waitfor( `f`, m ) or waitfor( `g`, m ); } $\C{// extending type M in this translation unit}$
+\end{cfa}
+The @waitfor@ statements in each translation unit cannot form a unique bit-mask because the monitor type does not carry that information.
+\subsection{\texorpdfstring{\protect\lstinline@waitfor@ Implementation}{waitfor Implementation}}
+\label{s:waitforImplementation}
+In a statically typed object-oriented programming language, a class has an exhaustive list of members, even when members are added via static inheritance (see Figure~\ref{f:uCinheritance}).
+Knowing all members at compilation, even separate compilation, allows uniquely numbering them so the accept-statement implementation can use a fast and compact bit mask with $O(1)$ compare.
+\begin{figure}
+\centering
+\begin{lrbox}{\myboxA}
+\begin{uC++}[aboveskip=0pt,belowskip=0pt]
+$\emph{translation unit 1}$
+_Monitor B { // common type in .h file
+        _Mutex virtual void `f`( ... );
+        _Mutex virtual void `g`( ... );
+        _Mutex virtual void w1( ... ) { ... _Accept(`f`, `g`); ... }
+};
+$\emph{translation unit 2}$
+// include B
+_Monitor D : public B { // inherit
+        _Mutex void `h`( ... ); // add
+        _Mutex void w2( ... ) { ... _Accept(`f`, `h`); ... }
+};
+\end{uC++}
+\end{lrbox}
+\begin{lrbox}{\myboxB}
+\begin{cfa}[aboveskip=0pt,belowskip=0pt]
+$\emph{translation unit 1}$
+monitor M { ... }; // common type in .h file
+void `f`( M & mutex m, ... );
+void `g`( M & mutex m, ... );
+void w1( M & mutex m, ... ) { ... waitfor(`f`, `g` : m); ... }
+$\emph{translation unit 2}$
+// include M
+extern void `f`( M & mutex m, ... ); // import f but not g
+void `h`( M & mutex m ); // add
+void w2( M & mutex m, ... ) { ... waitfor(`f`, `h` : m); ... }
+\end{cfa}
+\end{lrbox}
+\subfloat[\uC]{\label{f:uCinheritance}\usebox\myboxA}
+\hspace{3pt}
+\vrule
+\hspace{3pt}
+\subfloat[\CFA]{\label{f:CFinheritance}\usebox\myboxB}
+\caption{Member / function visibility}
+\label{f:MemberFunctionVisibility}
+\end{figure}
+However, the @waitfor@ statement in translation unit 2 (see Figure~\ref{f:CFinheritance}) cannot see function @g@ in translation unit 1 precluding a unique numbering for a bit-mask because the monitor type only carries the protected shared data.
+(A possible way to construct a dense mapping is at link or load-time.)
 Hence, function pointers are used to identify the functions listed in the @waitfor@ statement, stored in a variable-sized array.
+Then, the same implementation approach used for the urgent stack is used for the calling queue.
+Each caller has a list of monitors acquired, and the @waitfor@ statement performs a (usually short) linear search matching functions in the @waitfor@ list with called functions, and then verifying the associated mutex locks can be transfers.
+(A possible way to construct a dense mapping is at link or load-time.)
+\subsection{Multi-Monitor Scheduling}
+Then, the same implementation approach used for the urgent stack (see Section~\ref{s:Scheduling}) is used for the calling queue.
+Each caller has a list of monitors acquired, and the @waitfor@ statement performs a short linear search matching functions in the @waitfor@ list with called functions, and then verifying the associated mutex locks can be transferred.
+\subsection{Multimonitor scheduling}
 \label{s:Multi-MonitorScheduling}
 External scheduling, like internal scheduling, becomes significantly more complex for multi-monitor semantics.
+External scheduling, like internal scheduling, becomes significantly more complex for multimonitor semantics.
 Even in the simplest case, new semantics need to be established.
 \begin{cfa}
 …
 The solution is for the programmer to disambiguate:
 \begin{cfa}
 waitfor( f, `m2` ); $\C{// wait for call to f with argument m2}$
+waitfor( f : `m2` ); $\C{// wait for call to f with argument m2}$
 \end{cfa}
 Both locks are acquired by function @g@, so when function @f@ is called, the lock for monitor @m2@ is passed from @g@ to @f@, while @g@ still holds lock @m1@.
 This behaviour can be extended to the multi-monitor @waitfor@ statement.
+This behavior can be extended to the multimonitor @waitfor@ statement.
 \begin{cfa}
 monitor M { ... };
 void f( M & mutex m1, M & mutex m2 );
 void g( M & mutex m1, M & mutex m2 ) { waitfor( f, `m1, m2` ); $\C{// wait for call to f with arguments m1 and m2}$
+void g( M & mutex m1, M & mutex m2 ) { waitfor( f : `m1, m2` ); $\C{// wait for call to f with arguments m1 and m2}$
 \end{cfa}
 Again, the set of monitors passed to the @waitfor@ statement must be entirely contained in the set of monitors already acquired by the accepting function.
 Also, the order of the monitors in a @waitfor@ statement is unimportant.
 Figure~\ref{f:UnmatchedMutexSets} shows an example where, for internal and external scheduling with multiple monitors, a signalling or accepting thread must match exactly, \ie partial matching results in waiting.
 For both examples, the set of monitors is disjoint so unblocking is impossible.
+% Also, the order of the monitors in a @waitfor@ statement must match the order of the mutex parameters.
+Figure~\ref{f:UnmatchedMutexSets} shows internal and external scheduling with multiple monitors that must match exactly with a signaling or accepting thread, \ie partial matching results in waiting.
+In both cases, the set of monitors is disjoint so unblocking is impossible.
 \begin{figure}
 …
+}
 void g( M1 & mutex m1, M2 & mutex m2 ) {
         waitfor( f, m1, m2 );
+        waitfor( f : m1, m2 );
+}
 g( `m11`, m2 ); // block on accept
 …
 \end{figure}
-\subsection{\texorpdfstring{\protect\lstinline@mutex@ Threads}{mutex Threads}}
-Threads in \CFA can also be monitors to allow \emph{direct communication} among threads, \ie threads can have mutex functions that are called by other threads.
-Hence, all monitor features are available when using threads.
-Figure~\ref{f:DirectCommunication} shows a comparison of direct call communication in \CFA with direct channel communication in Go.
-(Ada provides a similar mechanism to the \CFA direct communication.)
-The program main in both programs communicates directly with the other thread versus indirect communication where two threads interact through a passive monitor.
-Both direct and indirection thread communication are valuable tools in structuring concurrent programs.
 \begin{figure}
 \centering
 …
 struct Msg { int i, j; };
 thread GoRtn { int i;  float f;  Msg m; };
+mutex thread GoRtn { int i;  float f;  Msg m; };
 void mem1( GoRtn & mutex gortn, int i ) { gortn.i = i; }
 void mem2( GoRtn & mutex gortn, float f ) { gortn.f = f; }
 …
 void ^?{}( GoRtn & mutex ) {}
 void main( GoRtn & gortn ) with( gortn ) {  // thread starts
+void main( GoRtn & mutex gortn ) with(gortn) { // thread starts
         for () {
                 `waitfor( mem1, gortn )` sout | i;  // wait for calls
                 or `waitfor( mem2, gortn )` sout | f;
                 or `waitfor( mem3, gortn )` sout | m.i | m.j;
                 or `waitfor( ^?{}, gortn )` break;
+                `waitfor( mem1 : gortn )` sout | i;  // wait for calls
+                or `waitfor( mem2 : gortn )` sout | f;
+                or `waitfor( mem3 : gortn )` sout | m.i | m.j;
+                or `waitfor( ^?{} : gortn )` break; // low priority
+        }
 …
 \hspace{3pt}
 \subfloat[Go]{\label{f:Gochannel}\usebox\myboxB}
+\caption{Direct communication}
+\label{f:DirectCommunication}
+\caption{Direct versus indirect communication}
+\label{f:DirectCommunicationComparison}
+\medskip
+\begin{cfa}
+mutex thread DatingService {
+        condition Girls[CompCodes], Boys[CompCodes];
+        int girlPhoneNo, boyPhoneNo, ccode;
+};
+int girl( DatingService & mutex ds, int phoneno, int code ) with( ds ) {
+        girlPhoneNo = phoneno;  ccode = code;
+        `wait( Girls[ccode] );`                                                         $\C{// wait for boy}$
+        girlPhoneNo = phoneno;  return boyPhoneNo;
+}
+int boy( DatingService & mutex ds, int phoneno, int code ) with( ds ) {
+        boyPhoneNo = phoneno;  ccode = code;
+        `wait( Boys[ccode] );`                                                          $\C{// wait for girl}$
+        boyPhoneNo = phoneno;  return girlPhoneNo;
+}
+void main( DatingService & ds ) with( ds ) {                    $\C{// thread starts, ds defaults to mutex}$
+        for () {
+                waitfor( ^?{} ) break;                                                  $\C{// high priority}$
+                or waitfor( girl )                                                              $\C{// girl called, compatible boy ? restart boy then girl}$
+                        if ( ! is_empty( Boys[ccode] ) ) { `signal_block( Boys[ccode] );  signal_block( Girls[ccode] );` }
+                or waitfor( boy )                                                               $\C{// boy called, compatible girl ? restart girl then boy}$
+                        if ( ! is_empty( Girls[ccode] ) ) { `signal_block( Girls[ccode] );  signal_block( Boys[ccode] );` }
+        }
+}
+\end{cfa}
+\caption{Direct communication dating service}
+\label{f:DirectCommunicationDatingService}
 \end{figure}
 …
 void main( Ping & pi ) {
         for ( 10 ) {
                 `waitfor( ping, pi );`
+                `waitfor( ping : pi );`
                 `pong( po );`
+        }
 …
         for ( 10 ) {
                 `ping( pi );`
                 `waitfor( pong, po );`
+                `waitfor( pong : po );`
+        }
+}
 …
 % \label{f:pingpong}
 % \end{figure}
 Note, the ping/pong threads are globally declared, @pi@/@po@, and hence, start (and possibly complete) before the program main starts.
+Note, the ping/pong threads are globally declared, @pi@/@po@, and hence, start and possibly complete before the program main starts.
 \end{comment}
+\subsection{Execution Properties}
+Table~\ref{t:ObjectPropertyComposition} shows how the \CFA high-level constructs cover 3 fundamental execution properties: thread, stateful function, and mutual exclusion.
+Case 1 is a basic object, with none of the new execution properties.
+Case 2 allows @mutex@ calls to Case 1 to protect shared data.
+Case 3 allows stateful functions to suspend/resume but restricts operations because the state is stackless.
+Case 4 allows @mutex@ calls to Case 3 to protect shared data.
+Cases 5 and 6 are the same as 3 and 4 without restriction because the state is stackful.
+Cases 7 and 8 are rejected because a thread cannot execute without a stackful state in a preemptive environment when context switching from the signal handler.
+Cases 9 and 10 have a stackful thread without and with @mutex@ calls.
+For situations where threads do not require direct communication, case 9 provides faster creation/destruction by eliminating @mutex@ setup.
+\begin{table}
+\caption{Object property composition}
+\centering
+\label{t:ObjectPropertyComposition}
+\renewcommand{\arraystretch}{1.25}
+%\setlength{\tabcolsep}{5pt}
+\begin{tabular}{c|c||l|l}
+\multicolumn{2}{c||}{object properties} & \multicolumn{2}{c}{mutual exclusion} \\
+\hline
+thread  & stateful                              & \multicolumn{1}{c|}{No} & \multicolumn{1}{c}{Yes} \\
+\hline
+\hline
+No              & No                                    & \textbf{1}\ \ \ aggregate type                & \textbf{2}\ \ \ @monitor@ aggregate type \\
+\hline
+No              & Yes (stackless)               & \textbf{3}\ \ \ @generator@                   & \textbf{4}\ \ \ @monitor@ @generator@ \\
+\hline
+No              & Yes (stackful)                & \textbf{5}\ \ \ @coroutine@                   & \textbf{6}\ \ \ @monitor@ @coroutine@ \\
+\hline
+Yes             & No / Yes (stackless)  & \textbf{7}\ \ \ {\color{red}rejected} & \textbf{8}\ \ \ {\color{red}rejected} \\
+\hline
+Yes             & Yes (stackful)                & \textbf{9}\ \ \ @thread@                              & \textbf{10}\ \ @monitor@ @thread@ \\
+\end{tabular}
+\end{table}
+\subsection{\texorpdfstring{\protect\lstinline@mutex@ Generators / coroutines / threads}{mutex Generators / coroutines / threads}}
+\CFA generators, coroutines, and threads can also be @mutex@ (Table~\ref{t:ObjectPropertyComposition} cases 4, 6, 10) allowing safe \emph{direct communication} with threads, \ie the custom types can have mutex functions that are called by other threads.
+All monitor features are available within these mutex functions.
+For example, if the formatter generator or coroutine equivalent in Figure~\ref{f:CFAFormatGen} is extended with the monitor property and this interface function is used to communicate with the formatter:
+\begin{cfa}
+void fmt( Fmt & mutex fmt, char ch ) { fmt.ch = ch; resume( fmt ); }
+\end{cfa}
+multiple threads can safely pass characters for formatting.
+Figure~\ref{f:DirectCommunicationComparison} shows a comparison of direct call-communication in \CFA versus indirect channel-communication in Go.
+(Ada has a similar mechanism to \CFA direct communication.)
+% The thread main function is by default @mutex@, so the @mutex@ qualifier for the thread parameter is optional.
+% The reason is that the thread logically starts instantaneously in the thread main acquiring its mutual exclusion, so it starts before any calls to prepare for synchronizing these calls.
+The \CFA program @main@ uses the call/return paradigm to directly communicate with the @GoRtn main@, whereas Go switches to the unbuffered channel paradigm to indirectly communicate with the goroutine.
+Communication by multiple threads is safe for the @gortn@ thread via mutex calls in \CFA or channel assignment in Go.
+The difference between call and channel send occurs for buffered channels making the send asynchronous.
+In \CFA, asynchronous call and multiple buffers are provided using an administrator and worker threads~\cite{Gentleman81} and/or futures (not discussed).
+Figure~\ref{f:DirectCommunicationDatingService} shows the dating-service problem in Figure~\ref{f:DatingServiceMonitor} extended from indirect monitor communication to direct thread communication.
+When converting a monitor to a thread (server), the coding pattern is to move as much code as possible from the accepted functions into the thread main so it does as much work as possible.
+Notice, the dating server is postponing requests for an unspecified time while continuing to accept new requests.
+For complex servers, \eg web-servers, there can be hundreds of lines of code in the thread main and safe interaction with clients can be complex.
 …
 For completeness and efficiency, \CFA provides a standard set of low-level locks: recursive mutex, condition, semaphore, barrier, \etc, and atomic instructions: @fetchAssign@, @fetchAdd@, @testSet@, @compareSet@, \etc.
 Some of these low-level mechanism are used in the \CFA runtime, but we strongly advocate using high-level mechanisms whenever possible.
+Some of these low-level mechanisms are used to build the \CFA runtime, but we always advocate using high-level mechanisms whenever possible.
 …
+%
+%
 % \subsection{User Threads}
+% \subsection{User threads}
+%
 % A direct improvement on kernel threads is user threads, \eg Erlang~\cite{Erlang} and \uC~\cite{uC++book}.
 …
 \begin{comment}
 \subsection{Thread Pools}
+\subsection{Thread pools}
 In contrast to direct threading is indirect \newterm{thread pools}, \eg Java @executor@, where small jobs (work units) are inserted into a work pool for execution.
 If the jobs are dependent, \ie interact, there is an implicit/explicit dependency graph that ties them together.
+If the jobs are dependent, \ie interact, there is an implicit dependency graph that ties them together.
 While removing direct concurrency, and hence the amount of context switching, thread pools significantly limit the interaction that can occur among jobs.
 Indeed, jobs should not block because that also blocks the underlying thread, which effectively means the CPU utilization, and therefore throughput, suffers.
 …
 \begin{cfa}
 struct Adder {
     int * row, cols;
+        int * row, cols;
 };
 int operator()() {
 …
 \label{s:RuntimeStructureCluster}
+A \newterm{cluster} is a collection of threads and virtual processors (abstract kernel-thread) that execute the (user) threads from its own ready queue (like an OS executing kernel threads).
+A \newterm{cluster} is a collection of user and kernel threads, where the kernel threads run the user threads from the cluster's ready queue, and the operating system runs the kernel threads on the processors from its ready queue~\cite{Buhr90a}.
+The term \newterm{virtual processor} is introduced as a synonym for kernel thread to disambiguate between user and kernel thread.
+From the language perspective, a virtual processor is an actual processor (core).
 The purpose of a cluster is to control the amount of parallelism that is possible among threads, plus scheduling and other execution defaults.
 The default cluster-scheduler is single-queue multi-server, which provides automatic load-balancing of threads on processors.
 However, the design allows changing the scheduler, \eg multi-queue multi-server with work-stealing/sharing across the virtual processors.
+However, the design allows changing the scheduler, \eg multi-queue multiserver with work-stealing/sharing across the virtual processors.
 If several clusters exist, both threads and virtual processors, can be explicitly migrated from one cluster to another.
 No automatic load balancing among clusters is performed by \CFA.
 …
 \subsection{Virtual Processor}
+\subsection{Virtual processor}
 \label{s:RuntimeStructureProcessor}
 A virtual processor is implemented by a kernel thread (\eg UNIX process), which are scheduled for execution on a hardware processor by the underlying operating system.
+A virtual processor is implemented by a kernel thread, \eg UNIX process, which is scheduled for execution on a hardware processor by the underlying operating system.
 Programs may use more virtual processors than hardware processors.
 On a multiprocessor, kernel threads are distributed across the hardware processors resulting in virtual processors executing in parallel.
 (It is possible to use affinity to lock a virtual processor onto a particular hardware processor~\cite{affinityLinux, affinityWindows, affinityFreebsd, affinityNetbsd, affinityMacosx}, which is used when caching issues occur or for heterogeneous hardware processors.)
+(It is possible to use affinity to lock a virtual processor onto a particular hardware processor~\cite{affinityLinux,affinityWindows}, which is used when caching issues occur or for heterogeneous hardware processors.) %, affinityFreebsd, affinityNetbsd, affinityMacosx
 The \CFA runtime attempts to block unused processors and unblock processors as the system load increases;
 balancing the workload with processors is difficult because it requires future knowledge, \ie what will the applicaton workload do next.
+balancing the workload with processors is difficult because it requires future knowledge, \ie what will the application workload do next.
 Preemption occurs on virtual processors rather than user threads, via operating-system interrupts.
 Thus virtual processors execute user threads, where preemption frequency applies to a virtual processor, so preemption occurs randomly across the executed user threads.
 …
 \label{s:Implementation}
 A primary implementation challenge is avoiding contention from dynamically allocating memory because of bulk acquire, \eg the internal-scheduling design is (almost) free of allocations.
+A primary implementation challenge is avoiding contention from dynamically allocating memory because of bulk acquire, \eg the internal-scheduling design is almost free of allocations.
 All blocking operations are made by parking threads onto queues, therefore all queues are designed with intrusive nodes, where each node has preallocated link fields for chaining.
 Furthermore, several bulk-acquire operations need a variable amount of memory.
 This storage is allocated at the base of a thread's stack before blocking, which means programmers must add a small amount of extra space for stacks.
 In \CFA, ordering of monitor acquisition relies on memory ordering to prevent deadlock~\cite{Havender68}, because all objects have distinct non-overlapping memory layouts, and mutual-exclusion for a monitor is only defined for its lifetime.
+In \CFA, ordering of monitor acquisition relies on memory ordering to prevent deadlock~\cite{Havender68}, because all objects have distinct nonoverlapping memory layouts, and mutual-exclusion for a monitor is only defined for its lifetime.
 When a mutex call is made, pointers to the concerned monitors are aggregated into a variable-length array and sorted.
 This array persists for the entire duration of the mutual exclusion and is used extensively for synchronization operations.
 …
 Nondeterministic preemption provides fairness from long-running threads, and forces concurrent programmers to write more robust programs, rather than relying on code between cooperative scheduling to be atomic.
 This atomic reliance can fail on multi-core machines, because execution across cores is nondeterministic.
 A different reason for not supporting preemption is that it significantly complicates the runtime system, \eg Microsoft runtime does not support interrupts and on Linux systems, interrupts are complex (see below).
+This atomic reliance can fail on multicore machines, because execution across cores is nondeterministic.
+A different reason for not supporting preemption is that it significantly complicates the runtime system, \eg Windows runtime does not support interrupts and on Linux systems, interrupts are complex (see below).
 Preemption is normally handled by setting a countdown timer on each virtual processor.
 When the timer expires, an interrupt is delivered, and the interrupt handler resets the countdown timer, and if the virtual processor is executing in user code, the signal handler performs a user-level context-switch, or if executing in the language runtime kernel, the preemption is ignored or rolled forward to the point where the runtime kernel context switches back to user code.
+When the timer expires, an interrupt is delivered, and its signal handler resets the countdown timer, and if the virtual processor is executing in user code, the signal handler performs a user-level context-switch, or if executing in the language runtime kernel, the preemption is ignored or rolled forward to the point where the runtime kernel context switches back to user code.
 Multiple signal handlers may be pending.
 When control eventually switches back to the signal handler, it returns normally, and execution continues in the interrupted user thread, even though the return from the signal handler may be on a different kernel thread than the one where the signal is delivered.
 The only issue with this approach is that signal masks from one kernel thread may be restored on another as part of returning from the signal handler;
 therefore, the same signal mask is required for all virtual processors in a cluster.
+Because preemption frequency is usually long (1 millisecond) performance cost is negligible.
+Linux switched a decade ago from specific to arbitrary process signal-delivery for applications with multiple kernel threads.
+\begin{cquote}
+A process-directed signal may be delivered to any one of the threads that does not currently have the signal blocked.
+If more than one of the threads has the signal unblocked, then the kernel chooses an arbitrary thread to which it will deliver the signal.
+SIGNAL(7) - Linux Programmer's Manual
+\end{cquote}
+Because the preemption interval is usually long (1 ms), the performance cost is negligible.
+Linux switched a decade ago from specific to arbitrary virtual-processor signal-delivery for applications with multiple kernel threads.
+In the new semantics, a virtual-processor directed signal may be delivered to any virtual processor created by the application that does not have the signal blocked.
 Hence, the timer-expiry signal, which is generated \emph{externally} by the Linux kernel to an application, is delivered to any of its Linux subprocesses (kernel threads).
 To ensure each virtual processor receives a preemption signal, a discrete-event simulation is run on a special virtual processor, and only it sets and receives timer events.
 …
 \subsection{Debug Kernel}
 There are two versions of the \CFA runtime kernel: debug and non-debug.
 The debugging version has many runtime checks and internal assertions, \eg stack (non-writable) guard page, and checks for stack overflow whenever context switches occur among coroutines and threads, which catches most stack overflows.
 After a program is debugged, the non-debugging version can be used to significantly decrease space and increase performance.
+\subsection{Debug kernel}
+There are two versions of the \CFA runtime kernel: debug and nondebug.
+The debugging version has many runtime checks and internal assertions, \eg stack nonwritable guard page, and checks for stack overflow whenever context switches occur among coroutines and threads, which catches most stack overflows.
+After a program is debugged, the nondebugging version can be used to significantly decrease space and increase performance.
 …
 \label{s:Performance}
+To verify the implementation of the \CFA runtime, a series of microbenchmarks are performed comparing \CFA with pthreads, Java OpenJDK-9, Go 1.12.6 and \uC 7.0.0.
+For comparison, the package must be multi-processor (M:N), which excludes libdill/libmill~\cite{libdill} (M:1), and use a shared-memory programming model, \eg not message passing.
+The benchmark computer is an AMD Opteron\texttrademark\ 6380 NUMA 64-core, 8 socket, 2.5 GHz processor, running Ubuntu 16.04.6 LTS, and \CFA/\uC are compiled with gcc 6.5.
+All benchmarks are run using the following harness. (The Java harness is augmented to circumvent JIT issues.)
+\begin{cfa}
+unsigned int N = 10_000_000;
+#define BENCH( `run` ) Time before = getTimeNsec();  `run;`  Duration result = (getTimeNsec() - before) / N;
+\end{cfa}
+The method used to get time is @clock_gettime( CLOCK_REALTIME )@.
+Each benchmark is performed @N@ times, where @N@ varies depending on the benchmark;
+the total time is divided by @N@ to obtain the average time for a benchmark.
+Each benchmark experiment is run 31 times.
+All omitted tests for other languages are functionally identical to the \CFA tests and available online~\cite{CforallBenchMarks}.
+% tar --exclude=.deps --exclude=Makefile --exclude=Makefile.in --exclude=c.c --exclude=cxx.cpp --exclude=fetch_add.c -cvhf benchmark.tar benchmark
+\paragraph{Object Creation}
+Object creation is measured by creating/deleting the specific kind of concurrent object.
+Figure~\ref{f:creation} shows the code for \CFA, with results in Table~\ref{tab:creation}.
+The only note here is that the call stacks of \CFA coroutines are lazily created, therefore without priming the coroutine to force stack creation, the creation cost is artificially low.
+To test the performance of the \CFA runtime, a series of microbenchmarks are used to compare \CFA with pthreads, Java 11.0.6, Go 1.12.6, Rust 1.37.0, Python 3.7.6, Node.js 12.14.1, and \uC 7.0.0.
+For comparison, the package must be multiprocessor (M:N), which excludes libdill and libmill~\cite{libdill} (M:1), and use a shared-memory programming model, \eg not message passing.
+The benchmark computer is an AMD Opteron\texttrademark\ 6380 NUMA 64-core, 8 socket, 2.5 GHz processor, running Ubuntu 16.04.6 LTS, and pthreads/\CFA/\uC are compiled with gcc 9.2.1.
+All benchmarks are run using the following harness.
+(The Java harness is augmented to circumvent JIT issues.)
+\begin{cfa}
+#define BENCH( `run` ) uint64_t start = cputime_ns();  `run;`  double result = (double)(cputime_ns() - start) / N;
+\end{cfa}
+where CPU time in nanoseconds is from the appropriate language clock.
+Each benchmark is performed @N@ times, where @N@ is selected so the benchmark runs in the range of 2--20 s for the specific programming language;
+each @N@ appears after the experiment name in the following tables.
+The total time is divided by @N@ to obtain the average time for a benchmark.
+Each benchmark experiment is run 13 times and the average appears in the table.
+For languages with a runtime JIT (Java, Node.js, Python), a single half-hour long experiment is run to check stability;
+all long-experiment results are statistically equivalent, \ie median/average/SD correlate with the short-experiment results, indicating the short experiments reached a steady state.
+All omitted tests for other languages are functionally identical to the \CFA tests and available online~\cite{CforallConcurrentBenchmarks}.
+\subsection{Creation}
+Creation is measured by creating and deleting a specific kind of control-flow object.
+Figure~\ref{f:creation} shows the code for \CFA with results in Table~\ref{t:creation}.
+Note, the call stacks of \CFA coroutines are lazily created on the first resume, therefore the cost of creation with and without a stack are presented.
 \begin{multicols}{2}
+\lstset{language=CFA,moredelim=**[is][\color{red}]{@}{@},deletedelim=**[is][]{`}{`}}
+\begin{cfa}
+@thread@ MyThread {};
+void @main@( MyThread & ) {}
+\begin{cfa}[xleftmargin=0pt]
+`coroutine` MyCoroutine {};
+void ?{}( MyCoroutine & this ) {
+#ifdef EAGER
+        resume( this );
+#endif
+}
+void main( MyCoroutine & ) {}
 int main() {
         BENCH( for ( N ) { @MyThread m;@ } )
         sout | result`ns;
+}
 \end{cfa}
 \captionof{figure}{\CFA object-creation benchmark}
+        BENCH( for ( N ) { `MyCoroutine c;` } )
+        sout | result;
+}
+\end{cfa}
+\captionof{figure}{\CFA creation benchmark}
 \label{f:creation}
 …
 \vspace*{-16pt}
 \captionof{table}{Object creation comparison (nanoseconds)}
 \label{tab:creation}
+\captionof{table}{Creation comparison (nanoseconds)}
+\label{t:creation}
 \begin{tabular}[t]{@{}r*{3}{D{.}{.}{5.2}}@{}}
+\multicolumn{1}{@{}c}{} & \multicolumn{1}{c}{Median} & \multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\
+\CFA Coroutine Lazy             & 13.2          & 13.1          & 0.44          \\
+\CFA Coroutine Eager    & 531.3         & 536.0         & 26.54         \\
+\CFA Thread                             & 2074.9        & 2066.5        & 170.76        \\
+\uC Coroutine                   & 89.6          & 90.5          & 1.83          \\
+\uC Thread                              & 528.2         & 528.5         & 4.94          \\
+Goroutine                               & 4068.0        & 4113.1        & 414.55        \\
+Java Thread                             & 103848.5      & 104295.4      & 2637.57       \\
+Pthreads                                & 33112.6       & 33127.1       & 165.90
+\multicolumn{1}{@{}r}{Object(N)\hspace*{10pt}} & \multicolumn{1}{c}{Median} & \multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\
+\CFA generator (1B)                     & 0.6           & 0.6           & 0.0           \\
+\CFA coroutine lazy     (100M)  & 13.4          & 13.1          & 0.5           \\
+\CFA coroutine eager (10M)      & 144.7         & 143.9         & 1.5           \\
+\CFA thread (10M)                       & 466.4         & 468.0         & 11.3          \\
+\uC coroutine (10M)                     & 155.6         & 155.7         & 1.7           \\
+\uC thread (10M)                        & 523.4         & 523.9         & 7.7           \\
+Python generator (10M)          & 123.2         & 124.3         & 4.1           \\
+Node.js generator (10M)         & 33.4          & 33.5          & 0.3           \\
+Goroutine thread (10M)          & 751.0         & 750.5         & 3.1           \\
+Rust tokio thread (10M)         & 1860.0        & 1881.1        & 37.6          \\
+Rust thread     (250K)                  & 53801.0       & 53896.8       & 274.9         \\
+Java thread (250K)                      & 119256.0      & 119679.2      & 2244.0        \\
+% Java thread (1 000 000)               & 123100.0      & 123052.5      & 751.6         \\
+Pthreads thread (250K)          & 31465.5       & 31419.5       & 140.4
 \end{tabular}
 \end{multicols}
+\paragraph{Context-Switching}
+\vspace*{-10pt}
+\subsection{Internal scheduling}
+Internal scheduling is measured using a cycle of two threads signaling and waiting.
+Figure~\ref{f:schedint} shows the code for \CFA, with results in Table~\ref{t:schedint}.
+Note, the \CFA incremental cost for bulk acquire is a fixed cost for small numbers of mutex objects.
+User-level threading has one kernel thread, eliminating contention between the threads (direct handoff of the kernel thread).
+Kernel-level threading has two kernel threads allowing some contention.
+\begin{multicols}{2}
+\setlength{\tabcolsep}{3pt}
+\begin{cfa}[xleftmargin=0pt]
+volatile int go = 0;
+`condition c;`
+`monitor` M {} m1/*, m2, m3, m4*/;
+void call( M & `mutex p1/*, p2, p3, p4*/` ) {
+        `signal( c );`
+}
+void wait( M & `mutex p1/*, p2, p3, p4*/` ) {
+        go = 1; // continue other thread
+        for ( N ) { `wait( c );` }
+}
+thread T {};
+void main( T & ) {
+        while ( go == 0 ) { yield(); } // waiter must start first
+        BENCH( for ( N ) { call( m1/*, m2, m3, m4*/ ); } )
+        sout | result;
+}
+int main() {
+        T t;
+        wait( m1/*, m2, m3, m4*/ );
+}
+\end{cfa}
+\vspace*{-8pt}
+\captionof{figure}{\CFA internal-scheduling benchmark}
+\label{f:schedint}
+\columnbreak
+\vspace*{-16pt}
+\captionof{table}{Internal-scheduling comparison (nanoseconds)}
+\label{t:schedint}
+\bigskip
+\begin{tabular}{@{}r*{3}{D{.}{.}{5.2}}@{}}
+\multicolumn{1}{@{}r}{Object(N)\hspace*{10pt}} & \multicolumn{1}{c}{Median} & \multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\
+\CFA @signal@, 1 monitor (10M)  & 364.4         & 364.2         & 4.4           \\
+\CFA @signal@, 2 monitor (10M)  & 484.4         & 483.9         & 8.8           \\
+\CFA @signal@, 4 monitor (10M)  & 709.1         & 707.7         & 15.0          \\
+\uC @signal@ monitor (10M)              & 328.3         & 327.4         & 2.4           \\
+Rust cond. variable     (1M)            & 7514.0        & 7437.4        & 397.2         \\
+Java @notify@ monitor (1M)              & 8717.0        & 8774.1        & 471.8         \\
+% Java @notify@ monitor (100 000 000)           & 8634.0        & 8683.5        & 330.5         \\
+Pthreads cond. variable (1M)    & 5553.7        & 5576.1        & 345.6
+\end{tabular}
+\end{multicols}
+\subsection{External scheduling}
+External scheduling is measured using a cycle of two threads calling and accepting the call using the @waitfor@ statement.
+Figure~\ref{f:schedext} shows the code for \CFA with results in Table~\ref{t:schedext}.
+Note, the \CFA incremental cost for bulk acquire is a fixed cost for small numbers of mutex objects.
+\begin{multicols}{2}
+\setlength{\tabcolsep}{5pt}
+\vspace*{-16pt}
+\begin{cfa}[xleftmargin=0pt]
+`monitor` M {} m1/*, m2, m3, m4*/;
+void call( M & `mutex p1/*, p2, p3, p4*/` ) {}
+void wait( M & `mutex p1/*, p2, p3, p4*/` ) {
+        for ( N ) { `waitfor( call : p1/*, p2, p3, p4*/ );` }
+}
+thread T {};
+void main( T & ) {
+        BENCH( for ( N ) { call( m1/*, m2, m3, m4*/ ); } )
+        sout | result;
+}
+int main() {
+        T t;
+        wait( m1/*, m2, m3, m4*/ );
+}
+\end{cfa}
+\captionof{figure}{\CFA external-scheduling benchmark}
+\label{f:schedext}
+\columnbreak
+\vspace*{-18pt}
+\captionof{table}{External-scheduling comparison (nanoseconds)}
+\label{t:schedext}
+\begin{tabular}{@{}r*{3}{D{.}{.}{3.2}}@{}}
+\multicolumn{1}{@{}r}{Object(N)\hspace*{10pt}} & \multicolumn{1}{c}{Median} &\multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\
+\CFA @waitfor@, 1 monitor (10M) & 367.1 & 365.3 & 5.0   \\
+\CFA @waitfor@, 2 monitor (10M) & 463.0 & 464.6 & 7.1   \\
+\CFA @waitfor@, 4 monitor (10M) & 689.6 & 696.2 & 21.5  \\
+\uC \lstinline[language=uC++]|_Accept| monitor (10M)    & 328.2 & 329.1 & 3.4   \\
+Go \lstinline[language=Golang]|select| channel (10M)    & 365.0 & 365.5 & 1.2
+\end{tabular}
+\end{multicols}
+\subsection{Mutual-Exclusion}
+Uncontended mutual exclusion, which frequently occurs, is measured by entering and leaving a critical section.
+For monitors, entering and leaving a mutex function are measured, otherwise the language-appropriate mutex-lock is measured.
+For comparison, a spinning (vs.\ blocking) test-and-test-set lock is presented.
+Figure~\ref{f:mutex} shows the code for \CFA with results in Table~\ref{t:mutex}.
+Note the incremental cost of bulk acquire for \CFA, which is largely a fixed cost for small numbers of mutex objects.
+\begin{multicols}{2}
+\setlength{\tabcolsep}{3pt}
+\begin{cfa}[xleftmargin=0pt]
+`monitor` M {} m1/*, m2, m3, m4*/;
+void call( M & `mutex p1/*, p2, p3, p4*/` ) {}
+int main() {
+        BENCH( for( N ) call( m1/*, m2, m3, m4*/ ); )
+        sout | result;
+}
+\end{cfa}
+\captionof{figure}{\CFA acquire/release mutex benchmark}
+\label{f:mutex}
+\columnbreak
+\vspace*{-16pt}
+\captionof{table}{Mutex comparison (nanoseconds)}
+\label{t:mutex}
+\begin{tabular}{@{}r*{3}{D{.}{.}{3.2}}@{}}
+\multicolumn{1}{@{}r}{Object(N)\hspace*{10pt}} & \multicolumn{1}{c}{Median} &\multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\
+test-and-test-set lock (50M)            & 19.1  & 18.9  & 0.4   \\
+\CFA @mutex@ function, 1 arg. (50M)     & 48.3  & 47.8  & 0.9   \\
+\CFA @mutex@ function, 2 arg. (50M)     & 86.7  & 87.6  & 1.9   \\
+\CFA @mutex@ function, 4 arg. (50M)     & 173.4 & 169.4 & 5.9   \\
+\uC @monitor@ member rtn. (50M)         & 54.8  & 54.8  & 0.1   \\
+Goroutine mutex lock (50M)                      & 34.0  & 34.0  & 0.0   \\
+Rust mutex lock (50M)                           & 33.0  & 33.2  & 0.8   \\
+Java synchronized method (50M)          & 31.0  & 30.9  & 0.5   \\
+% Java synchronized method (10 000 000 000)             & 31.0 & 30.2 & 0.9 \\
+Pthreads mutex Lock (50M)                       & 31.0  & 31.1  & 0.4
+\end{tabular}
+\end{multicols}
+\subsection{Context switching}
 In procedural programming, the cost of a function call is important as modularization (refactoring) increases.
 (In many cases, a compiler inlines function calls to eliminate this cost.)
 Similarly, when modularization extends to coroutines/tasks, the time for a context switch becomes a relevant factor.
+(In many cases, a compiler inlines function calls to increase the size and number of basic blocks for optimizing.)
+Similarly, when modularization extends to coroutines and threads, the time for a context switch becomes a relevant factor.
 The coroutine test is from resumer to suspender and from suspender to resumer, which is two context switches.
+%For async-await systems, the test is scheduling and fulfilling @N@ empty promises, where all promises are allocated before versus interleaved with fulfillment to avoid garbage collection.
+For async-await systems, the test measures the cost of the @await@ expression entering the event engine by awaiting @N@ promises, where each created promise is resolved by an immediate event in the engine (using Node.js @setImmediate@).
 The thread test is using yield to enter and return from the runtime kernel, which is two context switches.
 The difference in performance between coroutine and thread context-switch is the cost of scheduling for threads, whereas coroutines are self-scheduling.
+Figure~\ref{f:ctx-switch} only shows the \CFA code for coroutines/threads (other systems are similar) with all results in Table~\ref{tab:ctx-switch}.
+Figure~\ref{f:ctx-switch} shows the \CFA code for a coroutine and thread with results in Table~\ref{t:ctx-switch}.
+% From: Gregor Richards <gregor.richards@uwaterloo.ca>
+% To: "Peter A. Buhr" <pabuhr@plg2.cs.uwaterloo.ca>
+% Date: Fri, 24 Jan 2020 13:49:18 -0500
+%
+% I can also verify that the previous version, which just tied a bunch of promises together, *does not* go back to the
+% event loop at all in the current version of Node. Presumably they're taking advantage of the fact that the ordering of
+% events is intentionally undefined to just jump right to the next 'then' in the chain, bypassing event queueing
+% entirely. That's perfectly correct behavior insofar as its difference from the specified behavior isn't observable, but
+% it isn't typical or representative of much anything useful, because most programs wouldn't have whole chains of eager
+% promises. Also, it's not representative of *anything* you can do with async/await, as there's no way to encode such an
+% eager chain that way.
 \begin{multicols}{2}
+\lstset{language=CFA,moredelim=**[is][\color{red}]{@}{@},deletedelim=**[is][]{`}{`}}
+\begin{cfa}[aboveskip=0pt,belowskip=0pt]
+@coroutine@ C {} c;
+void main( C & ) { for ( ;; ) { @suspend;@ } }
+\begin{cfa}[xleftmargin=0pt]
+`coroutine` C {};
+void main( C & ) { for () { `suspend;` } }
 int main() { // coroutine test
+        BENCH( for ( N ) { @resume( c );@ } )
+        sout | result`ns;
+}
+int main() { // task test
+        BENCH( for ( N ) { @yield();@ } )
+        sout | result`ns;
+        C c;
+        BENCH( for ( N ) { `resume( c );` } )
+        sout | result;
+}
+int main() { // thread test
+        BENCH( for ( N ) { `yield();` } )
+        sout | result;
+}
 \end{cfa}
 …
 \vspace*{-16pt}
 \captionof{table}{Context switch comparison (nanoseconds)}
 \label{tab:ctx-switch}
+\label{t:ctx-switch}
 \begin{tabular}{@{}r*{3}{D{.}{.}{3.2}}@{}}
+\multicolumn{1}{@{}c}{} & \multicolumn{1}{c}{Median} &\multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\
+C function              & 1.8   & 1.8   & 0.01  \\
+\CFA generator  & 2.4   & 2.2   & 0.25  \\
+\CFA Coroutine  & 36.2  & 36.2  & 0.25  \\
+\CFA Thread             & 93.2  & 93.5  & 2.09  \\
+\uC Coroutine   & 52.0  & 52.1  & 0.51  \\
+\uC Thread              & 96.2  & 96.3  & 0.58  \\
+Goroutine               & 141.0 & 141.3 & 3.39  \\
+Java Thread             & 374.0 & 375.8 & 10.38 \\
+Pthreads Thread & 361.0 & 365.3 & 13.19
+\multicolumn{1}{@{}r}{Object(N)\hspace*{10pt}} & \multicolumn{1}{c}{Median} &\multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\
+C function (10B)                        & 1.8           & 1.8           & 0.0   \\
+\CFA generator (5B)                     & 1.8           & 2.0           & 0.3   \\
+\CFA coroutine (100M)           & 32.5          & 32.9          & 0.8   \\
+\CFA thread (100M)                      & 93.8          & 93.6          & 2.2   \\
+\uC coroutine (100M)            & 50.3          & 50.3          & 0.2   \\
+\uC thread (100M)                       & 97.3          & 97.4          & 1.0   \\
+Python generator (100M)         & 40.9          & 41.3          & 1.5   \\
+Node.js await (5M)                      & 1852.2        & 1854.7        & 16.4  \\
+Node.js generator (100M)        & 33.3          & 33.4          & 0.3   \\
+Goroutine thread (100M)         & 143.0         & 143.3         & 1.1   \\
+Rust async await (100M)         & 32.0          & 32.0          & 0.0   \\
+Rust tokio thread (100M)        & 143.0         & 143.0         & 1.7   \\
+Rust thread (25M)                       & 332.0         & 331.4         & 2.4   \\
+Java thread (100M)                      & 405.0         & 415.0         & 17.6  \\
+% Java thread (  100 000 000)                   & 413.0 & 414.2 & 6.2 \\
+% Java thread (5 000 000 000)                   & 415.0 & 415.2 & 6.1 \\
+Pthreads thread (25M)           & 334.3         & 335.2         & 3.9
 \end{tabular}
 \end{multicols}
+\paragraph{Mutual-Exclusion}
+Uncontended mutual exclusion, which frequently occurs, is measured by entering/leaving a critical section.
+For monitors, entering and leaving a monitor function is measured.
+To put the results in context, the cost of entering a non-inline function and the cost of acquiring and releasing a @pthread_mutex@ lock are also measured.
+Figure~\ref{f:mutex} shows the code for \CFA with all results in Table~\ref{tab:mutex}.
+Note, the incremental cost of bulk acquire for \CFA, which is largely a fixed cost for small numbers of mutex objects.
+\begin{multicols}{2}
+\lstset{language=CFA,moredelim=**[is][\color{red}]{@}{@},deletedelim=**[is][]{`}{`}}
+\begin{cfa}
+@monitor@ M {} m1/*, m2, m3, m4*/;
+void __attribute__((noinline))
+do_call( M & @mutex m/*, m2, m3, m4*/@ ) {}
+int main() {
+        BENCH(
+                for( N ) do_call( m1/*, m2, m3, m4*/ );
+        )
+        sout | result`ns;
+}
+\end{cfa}
+\captionof{figure}{\CFA acquire/release mutex benchmark}
+\label{f:mutex}
+\columnbreak
+\vspace*{-16pt}
+\captionof{table}{Mutex comparison (nanoseconds)}
+\label{tab:mutex}
+\begin{tabular}{@{}r*{3}{D{.}{.}{3.2}}@{}}
+\multicolumn{1}{@{}c}{} & \multicolumn{1}{c}{Median} &\multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\
+test-and-test-and-set lock              & 19.1  & 18.9  & 0.40  \\
+\CFA @mutex@ function, 1 arg.   & 45.9  & 46.6  & 1.45  \\
+\CFA @mutex@ function, 2 arg.   & 105.0 & 104.7 & 3.08  \\
+\CFA @mutex@ function, 4 arg.   & 165.0 & 167.6 & 5.65  \\
+\uC @monitor@ member rtn.               & 54.0  & 53.7  & 0.82  \\
+Java synchronized method                & 31.0  & 31.1  & 0.50  \\
+Pthreads Mutex Lock                             & 33.6  & 32.6  & 1.14
+\end{tabular}
+\end{multicols}
+\paragraph{External Scheduling}
+External scheduling is measured using a cycle of two threads calling and accepting the call using the @waitfor@ statement.
+Figure~\ref{f:ext-sched} shows the code for \CFA, with results in Table~\ref{tab:ext-sched}.
+Note, the incremental cost of bulk acquire for \CFA, which is largely a fixed cost for small numbers of mutex objects.
+\begin{multicols}{2}
+\lstset{language=CFA,moredelim=**[is][\color{red}]{@}{@},deletedelim=**[is][]{`}{`}}
+\vspace*{-16pt}
+\begin{cfa}
+volatile int go = 0;
+@monitor@ M {} m;
+thread T {};
+void __attribute__((noinline))
+do_call( M & @mutex@ ) {}
+void main( T & ) {
+        while ( go == 0 ) { yield(); }
+        while ( go == 1 ) { do_call( m ); }
+}
+int __attribute__((noinline))
+do_wait( M & @mutex@ m ) {
+        go = 1; // continue other thread
+        BENCH( for ( N ) { @waitfor( do_call, m );@ } )
+        go = 0; // stop other thread
+        sout | result`ns;
+}
+int main() {
+        T t;
+        do_wait( m );
+}
+\end{cfa}
+\captionof{figure}{\CFA external-scheduling benchmark}
+\label{f:ext-sched}
+\columnbreak
+\vspace*{-16pt}
+\captionof{table}{External-scheduling comparison (nanoseconds)}
+\label{tab:ext-sched}
+\begin{tabular}{@{}r*{3}{D{.}{.}{3.2}}@{}}
+\multicolumn{1}{@{}c}{} & \multicolumn{1}{c}{Median} &\multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\
+\CFA @waitfor@, 1 @monitor@     & 376.4 & 376.8 & 7.63  \\
+\CFA @waitfor@, 2 @monitor@     & 491.4 & 492.0 & 13.31 \\
+\CFA @waitfor@, 4 @monitor@     & 681.0 & 681.7 & 19.10 \\
+\uC @_Accept@                           & 331.1 & 331.4 & 2.66
+\end{tabular}
+\end{multicols}
+\paragraph{Internal Scheduling}
+Internal scheduling is measured using a cycle of two threads signalling and waiting.
+Figure~\ref{f:int-sched} shows the code for \CFA, with results in Table~\ref{tab:int-sched}.
+Note, the incremental cost of bulk acquire for \CFA, which is largely a fixed cost for small numbers of mutex objects.
+Java scheduling is significantly greater because the benchmark explicitly creates multiple threads in order to prevent the JIT from making the program sequential, \ie removing all locking.
+\begin{multicols}{2}
+\lstset{language=CFA,moredelim=**[is][\color{red}]{@}{@},deletedelim=**[is][]{`}{`}}
+\begin{cfa}
+volatile int go = 0;
+@monitor@ M { @condition c;@ } m;
+void __attribute__((noinline))
+do_call( M & @mutex@ a1 ) { @signal( c );@ }
+thread T {};
+void main( T & this ) {
+        while ( go == 0 ) { yield(); }
+        while ( go == 1 ) { do_call( m ); }
+}
+int  __attribute__((noinline))
+do_wait( M & mutex m ) with(m) {
+        go = 1; // continue other thread
+        BENCH( for ( N ) { @wait( c );@ } );
+        go = 0; // stop other thread
+        sout | result`ns;
+}
+int main() {
+        T t;
+        do_wait( m );
+}
+\end{cfa}
+\captionof{figure}{\CFA Internal-scheduling benchmark}
+\label{f:int-sched}
+\columnbreak
+\vspace*{-16pt}
+\captionof{table}{Internal-scheduling comparison (nanoseconds)}
+\label{tab:int-sched}
+\bigskip
+\begin{tabular}{@{}r*{3}{D{.}{.}{5.2}}@{}}
+\multicolumn{1}{@{}c}{} & \multicolumn{1}{c}{Median} & \multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\
+\CFA @signal@, 1 @monitor@      & 372.6         & 374.3         & 14.17         \\
+\CFA @signal@, 2 @monitor@      & 492.7         & 494.1         & 12.99         \\
+\CFA @signal@, 4 @monitor@      & 749.4         & 750.4         & 24.74         \\
+\uC @signal@                            & 320.5         & 321.0         & 3.36          \\
+Java @notify@                           & 10160.5       & 10169.4       & 267.71        \\
+Pthreads Cond. Variable         & 4949.6        & 5065.2        & 363
+\end{tabular}
+\end{multicols}
+\section{Conclusion}
+\subsection{Discussion}
+Languages using 1:1 threading based on pthreads can at best meet or exceed, due to language overhead, the pthread results.
+Note, pthreads has a fast zero-contention mutex lock checked in user space.
+Languages with M:N threading have better performance than 1:1 because there are no operating-system interactions (context-switching or locking).
+As well, for locking experiments, M:N threading has less contention if only one kernel thread is used.
+Languages with stackful coroutines have higher cost than stackless coroutines because of stack allocation and context switching;
+however, stackful \uC and \CFA coroutines have approximately the same performance as stackless Python and Node.js generators.
+The \CFA stackless generator is approximately 25 times faster for suspend/resume and 200 times faster for creation than stackless Python and Node.js generators.
+The Node.js context-switch is costly when asynchronous await must enter the event engine because a promise is not fulfilled.
+Finally, the benchmark results correlate across programming languages with and without JIT, indicating the JIT has completed any runtime optimizations.
+\section{Conclusions and Future Work}
 Advanced control-flow will always be difficult, especially when there is temporal ordering and nondeterminism.
 However, many systems exacerbate the difficulty through their presentation mechanisms.
+This paper shows it is possible to present a hierarchy of control-flow features, generator, coroutine, thread, and monitor, providing an integrated set of high-level, efficient, and maintainable control-flow features.
+Eliminated from \CFA are spurious wakeup and barging, which are nonintuitive and lead to errors, and having to work with a bewildering set of low-level locks and acquisition techniques.
+\CFA high-level race-free monitors and tasks provide the core mechanisms for mutual exclusion and synchronization, without having to resort to magic qualifiers like @volatile@/@atomic@.
+This paper shows it is possible to understand high-level control-flow using three properties: statefulness, thread, mutual-exclusion/synchronization.
+Combining these properties creates a number of high-level, efficient, and maintainable control-flow types: generator, coroutine, thread, each of which can be a monitor.
+Eliminated from \CFA are barging and spurious wakeup, which are nonintuitive and lead to errors, and having to work with a bewildering set of low-level locks and acquisition techniques.
+\CFA high-level race-free monitors and threads, when used with mutex access functions, provide the core mechanisms for mutual exclusion and synchronization, without having to resort to magic qualifiers like @volatile@ or @atomic@.
 Extending these mechanisms to handle high-level deadlock-free bulk acquire across both mutual exclusion and synchronization is a unique contribution.
 The \CFA runtime provides concurrency based on a preemptive M:N user-level threading-system, executing in clusters, which encapsulate scheduling of work on multiple kernel threads providing parallelism.
 The M:N model is judged to be efficient and provide greater flexibility than a 1:1 threading model.
 These concepts and the \CFA runtime-system are written in the \CFA language, extensively leveraging the \CFA type-system, which demonstrates the expressiveness of the \CFA language.
+Performance comparisons with other concurrent systems/languages show the \CFA approach is competitive across all low-level operations, which translates directly into good performance in well-written concurrent applications.
+C programmers should feel comfortable using these mechanisms for developing complex control-flow in applications, with the ability to obtain maximum available performance by selecting mechanisms at the appropriate level of need.
+\section{Future Work}
+Performance comparisons with other concurrent systems and languages show the \CFA approach is competitive across all basic operations, which translates directly into good performance in well-written applications with advanced control-flow.
+C programmers should feel comfortable using these mechanisms for developing complex control-flow in applications, with the ability to obtain maximum available performance by selecting mechanisms at the appropriate level of need using only calling communication.
 While control flow in \CFA has a strong start, development is still underway to complete a number of missing features.
+\paragraph{Flexible Scheduling}
+\label{futur:sched}
+\medskip
+\textbf{Flexible scheduling:}
 An important part of concurrency is scheduling.
 Different scheduling algorithms can affect performance (both in terms of average and variation).
+Different scheduling algorithms can affect performance, both in terms of average and variation.
 However, no single scheduler is optimal for all workloads and therefore there is value in being able to change the scheduler for given programs.
 One solution is to offer various tuning options, allowing the scheduler to be adjusted to the requirements of the workload.
 However, to be truly flexible, a pluggable scheduler is necessary.
+Currently, the \CFA pluggable scheduler is too simple to handle complex scheduling, \eg quality of service and real-time, where the scheduler must interact with mutex objects to deal with issues like priority inversion~\cite{Buhr00b}.
+\paragraph{Non-Blocking I/O}
+\label{futur:nbio}
+Many modern workloads are not bound by computation but IO operations, a common case being web servers and XaaS~\cite{XaaS} (anything as a service).
+Currently, the \CFA pluggable scheduler is too simple to handle complex scheduling, \eg quality of service and real time, where the scheduler must interact with mutex objects to deal with issues like priority inversion~\cite{Buhr00b}.
+\smallskip
+\textbf{Non-Blocking I/O:}
+Many modern workloads are not bound by computation but IO operations, common cases being web servers and XaaS~\cite{XaaS} (anything as a service).
 These types of workloads require significant engineering to amortize the costs of blocking IO-operations.
 At its core, non-blocking I/O is an operating-system level feature queuing IO operations, \eg network operations, and registering for notifications instead of waiting for requests to complete.
+At its core, nonblocking I/O is an operating-system level feature queuing IO operations, \eg network operations, and registering for notifications instead of waiting for requests to complete.
 Current trends use asynchronous programming like callbacks, futures, and/or promises, \eg Node.js~\cite{NodeJs} for JavaScript, Spring MVC~\cite{SpringMVC} for Java, and Django~\cite{Django} for Python.
+However, these solutions lead to code that is hard to create, read and maintain.
+A better approach is to tie non-blocking I/O into the concurrency system to provide ease of use with low overhead, \eg thread-per-connection web-services.
+A non-blocking I/O library is currently under development for \CFA.
+\paragraph{Other Concurrency Tools}
+\label{futur:tools}
+However, these solutions lead to code that is hard to create, read, and maintain.
+A better approach is to tie nonblocking I/O into the concurrency system to provide ease of use with low overhead, \eg thread-per-connection web-services.
+A nonblocking I/O library is currently under development for \CFA.
+\smallskip
+\textbf{Other concurrency tools:}
 While monitors offer flexible and powerful concurrency for \CFA, other concurrency tools are also necessary for a complete multi-paradigm concurrency package.
 Examples of such tools can include futures and promises~\cite{promises}, executors and actors.
 …
 As well, new \CFA extensions should make it possible to create a uniform interface for virtually all mutual exclusion, including monitors and low-level locks.
+\paragraph{Implicit Threading}
+\label{futur:implcit}
+Basic concurrent (embarrassingly parallel) applications can benefit greatly from implicit concurrency, where sequential programs are converted to concurrent, possibly with some help from pragmas to guide the conversion.
+\smallskip
+\textbf{Implicit threading:}
+Basic \emph{embarrassingly parallel} applications can benefit greatly from implicit concurrency, where sequential programs are converted to concurrent, with some help from pragmas to guide the conversion.
 This type of concurrency can be achieved both at the language level and at the library level.
 The canonical example of implicit concurrency is concurrent nested @for@ loops, which are amenable to divide and conquer algorithms~\cite{uC++book}.
 The \CFA language features should make it possible to develop a reasonable number of implicit concurrency mechanism to solve basic HPC data-concurrency problems.
+The \CFA language features should make it possible to develop a reasonable number of implicit concurrency mechanisms to solve basic HPC data-concurrency problems.
 However, implicit concurrency is a restrictive solution with significant limitations, so it can never replace explicit concurrent programming.
 …
 \section{Acknowledgements}
 The authors would like to recognize the design assistance of Aaron Moss, Rob Schluntz, Andrew Beach and Michael Brooks on the features described in this paper.
 Funding for this project has been provided by Huawei Ltd.\ (\url{http://www.huawei.com}). %, and Peter Buhr is partially funded by the Natural Sciences and Engineering Research Council of Canada.
+The authors recognize the design assistance of Aaron Moss, Rob Schluntz, Andrew Beach, and Michael Brooks; David Dice for commenting and helping with the Java benchmarks; and Gregor Richards for helping with the Node.js benchmarks.
+This research is funded by the NSERC/Waterloo-Huawei (\url{http://www.huawei.com}) Joint Innovation Lab. %, and Peter Buhr is partially funded by the Natural Sciences and Engineering Research Council of Canada.
 {%
 \fontsize{9bp}{12bp}\selectfont%
+\fontsize{9bp}{11.5bp}\selectfont%
 \bibliography{pl,local}
 }%

doc/papers/concurrency/annex/local.bib

-              r3c64c668
+              r58fe85a
     booktitle   = {Supercomputing, 2005. Proceedings of the ACM/IEEE SC 2005 Conference},
     publisher   = {IEEE},
+    location    = {Seattle, Washington, U.S.A.},
+    month       = nov,
     year        = {2005},
     pages       = {35-35},
-    month       = nov,
+}
 …
 @manual{Cpp-Transactions,
         keywords        = {C++, Transactional Memory},
         title           = {Technical Specification for C++ Extensions for Transactional Memory},
         organization= {International Standard ISO/IEC TS 19841:2015 },
         publisher   = {American National Standards Institute},
         address = {http://www.iso.org},
         year            = 2015,
+    keywords    = {C++, Transactional Memory},
+    title       = {Tech. Spec. for C++ Extensions for Transactional Memory {ISO/IEC} {TS} 19841:2015},
+    organization= {International Standard Organization},
+    address     = {Geneva, Switzerland},
+    year        = 2015,
+    note        = {\href{https://www.iso.org/standard/66343.html}{https://\-www.iso.org/\-standard/\-66343.html}},
+}
 …
 @manual{affinityLinux,
         key     = {TBB},
+        title           = "{Linux man page - sched\_setaffinity(2)}"
+        title           = "{Linux man page - sched\_setaffinity(2)}",
+        note    = {\href{https://man7.org/linux/man-pages/man2/sched_setaffinity.2.html}{https://\-man7.org/\-linux/man-pages/\-man2/sched\_setaffinity.2.html}},
+}
 @manual{affinityWindows,
+        title           = "{Windows (vs.85) - SetThreadAffinityMask function}"
+        title           = "{Windows documentation - SetThreadAffinityMask function}",
+        note    = {\href{https://docs.microsoft.com/en-us/windows/win32/api/winbase/nf-winbase-setthreadaffinitymask}{https://\-docs.microsoft.com/\-en-us/\-windows/\-win32/api/\-winbase/\-nf-winbase-setthreadaffinitymask}}
+}

doc/papers/concurrency/examples/Fib.py

-              r3c64c668
+              r58fe85a
         while True:
                 fn = fn1 + fn2; fn2 = fn1; fn1 = fn; yield fn
 f1 = Fib()
 …
 # Local Variables: #
 # tab-width: 4 #
 # compile-command: "python3.5 Fib.py" #
+# compile-command: "python3.7 Fib.py" #
 # End: #

doc/papers/concurrency/examples/Fib2.c

-              r3c64c668
+              r58fe85a
 #include <stdio.h>
-void mary() {
-        printf( "MARY\n" );
+}
 #define FIB_INIT { 0 }
 typedef struct { int next; int fn1, fn2; } Fib;
+typedef struct { int restart; int fn1, fn2; } Fib;
 int fib( Fib * f ) {
+        static void * states[] = { &&s1, &&s2, &&s3 };
+        goto *states[f->next];
+        static void * states[] = { &&s0, &&s1, &&s2 };
+        goto *states[f->restart];
+  s0:
+        f->fn1 = 0;
+        f->restart = 1;
+        return f->fn1;
   s1:
-        mary();
-        f->fn1 = 0;
-        f->next = 1;
-        return f->fn1;
-  s2:
-        mary();
         f->fn2 = f->fn1;
         f->fn1 = 1;
         f->next = 2;
+        f->restart = 2;
         return f->fn1;
+  s3:;
+        mary();
+  s2:;
         int fn = f->fn1 + f->fn2;
         f->fn2 = f->fn1;

doc/papers/concurrency/examples/Fib2.py

-              r3c64c668
+              r58fe85a
 def Fib():
     fn1, fn = 0, 1
+    fn1, fn = 1, 0
     while True:
         yield fn1
+        yield fn
         fn1, fn = fn, fn1 + fn
 …
 # Local Variables: #
 # tab-width: 4 #
 # compile-command: "python3.5 Fib2.py" #
+# compile-command: "python3.7 Fib2.py" #
 # End: #

doc/papers/concurrency/examples/Fib3.c

-              r3c64c668
+              r58fe85a
 typedef struct {
+        int fn1, fn;
+        void * next;
+        int restart, fn1, fn;
 } Fib;
 #define FibCtor { 1, 0, NULL }
+#define FibCtor { 0, 1, 0 }
 Fib * comain( Fib * f ) {
+        if ( __builtin_expect(f->next != 0, 1) ) goto *f->next;
+        f->next = &&s1;
+        static void * states[] = {&&s0, &&s1};
+        goto *states[f->restart];
+  s0: f->restart = 1;
         for ( ;; ) {
                 return f;

doc/papers/concurrency/examples/FibRefactor.py

r3c64c668	r58fe85a
22	22	# Local Variables: #
23	23	# tab-width: 4 #
24		# compile-command: "python3.5 FibRefactor.py" #
	24	# compile-command: "python3.7 FibRefactor.py" #
25	25	# End: #

doc/papers/concurrency/examples/Format.c

-              r3c64c668
+              r58fe85a
 typedef struct {
         void * next;
+        int restart, g, b;
         char ch;
-        int g, b;
 } Fmt;
 void comain( Fmt * f ) {
+        if ( __builtin_expect(f->next != 0, 1) ) goto *f->next;
+        f->next = &&s1;
+        static void * states[] = {&&s0, &&s1};
+        goto *states[f->restart];
+  s0: f->restart = 1;
         for ( ;; ) {
                 for ( f->g = 0; f->g < 5; f->g += 1 ) {                 // groups
                         for ( f->b = 0; f->b < 4; f->b += 1 ) {         // blocks
+                                return;
+                          s1:;  while ( f->ch == '\n' ) return;         // ignore
+                                do {
+                                        return;  s1: ;
+                                } while ( f->ch == '\n' );                              // ignore
                                 printf( "%c", f->ch );                                  // print character
+                        }
 …
 int main() {
         Fmt fmt = { NULL };
+        Fmt fmt = { 0 };
         comain( &fmt );                                                                         // prime
         for ( ;; ) {

doc/papers/concurrency/examples/Format.cc

-              r3c64c668
+              r58fe85a
                         for ( g = 0; g < 5; g += 1 ) { // groups of 5 blocks
                                 for ( b = 0; b < 4; b += 1 ) { // blocks of 4 characters
 //                                      for ( ;; ) { // for newline characters
+                                        for ( ;; ) { // for newline characters
                                                 suspend();
 //                                              if ( ch != '\n' ) break; // ignore newline
 //                                      }
+                                                if ( ch != '\n' ) break; // ignore newline
+                                        }
 //                                      cout << ch; // print character
+                                }
 …
 // Local Variables: //
 // tab-width: 4 //
 // compile-command: "u++-work -O2 -nodebubg Format.cc" //
+// compile-command: "u++-work -O2 -nodebug Format.cc" //
 // End: //

doc/papers/concurrency/examples/Format.cfa

-              r3c64c668
+              r58fe85a
                 for ( g = 0; g < 5; g += 1 ) {          // groups of 5 blocks
                         for ( b = 0; b < 4; b += 1 ) {  // blocks of 4 characters
 //                              do {
+                                do {
                                         suspend();
 //                              } while ( ch == '\n' || ch == '\t' );
+                                } while ( ch == '\n' || ch == '\t' );
                                 sout | ch;                                      // print character
+                        }

doc/papers/concurrency/examples/Format.data

r3c64c668	r58fe85a
1		abcdefghijklmnopqrstuvwxyzxxxxxxxxxxxxxx
	1	abcdefghijklmnop
	2	qrstuvwxyzx
	3	xxxxxxxxxxxxx

doc/papers/concurrency/examples/Format.py

-              r3c64c668
+              r58fe85a
                         for g in range( 5 ):    # groups of 5 blocks
                                 for b in range( 4 ): # blocks of 4 characters
+                                        print( (yield), end='' ) # receive from send
+                                        while True:
+                                                ch = (yield) # receive from send
+                                                if '\n' not in ch:
+                                                        break
+                                        print( ch, end='' ) # receive from send
                                 print( '  ', end='' ) # block separator
                         print()                                 # group separator
 …
                         print()
+input = "abcdefghijklmnop\nqrstuvwx\nyzxxxxxxxxxxxxxx\n"
 fmt = Format()
 next( fmt )                                                     # prime generator
 for i in range( 41 ):
         fmt.send( 'a' );                                # send to yield
+for i in input:
+        fmt.send( i );                          # send to yield
 # Local Variables: #
 # tab-width: 4 #
 # compile-command: "python3.5 Format.py" #
+# compile-command: "python3.7 Format.py" #
 # End: #

doc/papers/concurrency/examples/Format1.c

-              r3c64c668
+              r58fe85a
 typedef struct {
         void * next;
+        int restart, g, b;
         char ch;
-        int g, b;
 } Fmt;
 void format( Fmt * f ) {
+        if ( __builtin_expect(f->next != 0, 1) ) goto *f->next;
+        f->next = &&s1;
+        static void * states[] = {&&s0, &&s1};
+        goto *states[f->restart];
+  s0: f->restart = 1;
         for ( ;; ) {
                 for ( f->g = 0; f->g < 5; f->g += 1 ) {                 // groups
                         for ( f->b = 0; f->b < 4; f->b += 1 ) {         // blocks
                                 return;
+                          s1: ;
+                                if ( f->ch == '\0' ) goto fini;                 // EOF ?
+                          s1: if ( f->ch == '\0' ) goto fini;           // EOF ?
                                 while ( f->ch == '\n' ) return;                 // ignore
                                 printf( "%c", f->ch );                                  // print character
+//                              printf( "%c", f->ch );                                  // print character
+                        }
                         printf( " " );                                                          // block separator
+//                      printf( " " );                                                          // block separator
+                }
                 printf( "\n" );                                                                 // group separator
+//              printf( "\n" );                                                                 // group separator
+        }
   fini:
         if ( f->g != 0 || f->b != 0 ) printf( "\n" );
+  fini:;
+//      if ( f->g != 0 || f->b != 0 ) printf( "\n" );
+}
 int main() {
         Fmt fmt = { NULL };
+        Fmt fmt = { 0 };
         format( &fmt );                                                                         // prime
+        for ( ;; ) {
+                scanf( "%c", &fmt.ch );                                                 // direct read into communication variable
+          if ( feof( stdin ) ) break;
+        fmt.ch = 'a';
+        for ( long int i = 0; i < 1000000000; i += 1 ) {
+//              scanf( "%c", &fmt.ch );                                                 // direct read into communication variable
+//        if ( feof( stdin ) ) break;
                 format( &fmt );
+        }
         fmt.ch = '\0';
+        fmt.ch = '\0';                                                                          // sentinel (EOF)
         format( &fmt );
+}

doc/papers/concurrency/examples/PingPong.c

-              r3c64c668
+              r58fe85a
 typedef struct PingPong {
+        int restart;                                                                            // style 1
+        int N, i;
         const char * name;
-        int N, i;
         struct PingPong * partner;
         void * next;
+        void * next;                                                                            // style 2
 } PingPong;
+#define PPCtor( name, N ) { name, N, 0, NULL, NULL }
+#define PPCtor( name, N ) { 0, N, 0, name, NULL, NULL }
 void comain( PingPong * pp ) __attribute__(( noinline ));
 void comain( PingPong * pp ) {
+#if 0
         if ( __builtin_expect(pp->next != 0, 1) ) goto *pp->next;
-#if 0
-        pp->next = &&here;
-                asm( "mov  %0,%%rdi" : "=m" (pp) );
-                asm( "mov  %rdi,%rax" );
-#ifndef OPT
-#ifdef PRINT
-                asm( "add  $16, %rsp" );
-#endif // PRINT
-                asm( "popq %rbp" );
-#endif // ! OPT
-#ifdef OPT
-#ifdef PRINT
-                asm( "popq %rbx" );
-#endif // PRINT
-#endif // OPT
-                asm( "jmp  comain" );
-  here: ;
-#endif // 0
         pp->next = &&cycle;
         for ( ; pp->i < pp->N; pp->i += 1 ) {
 …
           cycle: ;
         } // for
+#endif // 0
+#if 1
+        static void * states[] = {&&s0, &&s1};
+        goto *states[pp->restart];
+  s0: pp->restart = 1;
+        for ( ; pp->i < pp->N; pp->i += 1 ) {
+#ifdef PRINT
+                printf( "%s %d\n", pp->name, pp->i );
+#endif // PRINT
+                asm( "mov  %0,%%rdi" : "=m" (pp->partner) );
+                asm( "mov  %rdi,%rax" );
+#ifndef OPT
+#ifdef PRINT
+                asm( "add  $16, %rsp" );
+#endif // PRINT
+                asm( "popq %rbp" );
+#endif // ! OPT
+#ifdef OPT
+#ifdef PRINT
+                asm( "popq %rbx" );
+#endif // PRINT
+#endif // OPT
+                asm( "jmp  comain" );
+          s1: ;
+        } // for
+#endif // 0
+}
 …
 // Local Variables: //
 // tab-width: 4 //
 // compile-command: "gcc-8 -g -DPRINT PingPong.c" //
+// compile-command: "gcc-9 -g -DPRINT PingPong.c" //
 // End: //

doc/papers/concurrency/examples/Pingpong.py

-              r3c64c668
+              r58fe85a
 def PingPong( name, N ):
         partner = (yield)           # get partner
         yield                       # resume scheduler
+        partner = yield                         # get partner
+        yield                                           # resume scheduler
         for i in range( N ):
                 print( name )
                 yield partner           # execute next
+                yield partner                   # execute next
         print( "end", name )
 def Scheduler():
+        n = (yield)                 # starting coroutine
+        while True:
+                n = next( n )           # schedule coroutine
+        n = yield                                       # starting coroutine
+        try:
+                while True:
+                        n = next( n )           # schedule coroutine
+        except StopIteration:
+                pass
 pi = PingPong( "ping", 5 )
 po = PingPong( "pong", 5 )
 next( pi )                      # prime
 pi.send( po )                   # send partner
 next( po )                      # prime
 po.send( pi )                   # send partner
+next( pi )                                              # prime
+pi.send( po )                                   # send partner
+next( po )                                              # prime
+po.send( pi )                                   # send partner
 s = Scheduler();
 next( s )                       # prime
+next( s )                                               # prime
 try:
         s.send( pi )                            # start cycle
 except StopIteration:
         print( "scheduler stop" )
+except StopIteration:                   # scheduler stopped
+        pass
 print( "stop" )
 # Local Variables: #
 # tab-width: 4 #
 # compile-command: "python3.5 Pingpong.py" #
+# compile-command: "python3.7 Pingpong.py" #
 # End: #

doc/papers/concurrency/examples/ProdCons.py

-              r3c64c668
+              r58fe85a
 def Prod( N ):
         cons = (yield)              # get cons
         yield                       # resume scheduler
+        cons = yield                            # get cons
+        yield                                           # resume scheduler
         for i in range( N ):
                 print( "prod" )
                 yield cons              # execute next
+                yield cons                              # execute next
         print( "end", "prod" )
 def Cons( N ):
         prod = (yield)              # get prod
         yield                       # resume scheduler
+        prod = yield                            # get prod
+        yield                                           # resume scheduler
         for i in range( N ):
                 print( "cons" )
                 yield prod              # execute next
+                yield prod                              # execute next
         print( "end", "cons" )
 def Scheduler():
+        n = (yield)                 # starting coroutine
+        while True:
+                n = next( n )           # schedule coroutine
+        n = yield                                       # starting coroutine
+        try:
+                while True:
+                        n = next( n )           # schedule coroutine
+        except StopIteration:
+                pass
 prod = Prod( 5 )
 cons = Cons( 5 )
 next( prod )                    # prime
 prod.send( cons )               # send cons
 next( cons )                    # prime
 cons.send( prod )               # send prod
+next( prod )                                    # prime
+prod.send( cons )                               # send cons
+next( cons )                                    # prime
+cons.send( prod )                               # send prod
 s = Scheduler();
 next( s )                       # prime
+next( s )                                               # prime
 try:
         s.send( prod )                          # start cycle
 except StopIteration:
         print( "scheduler stop" )
+except StopIteration:                   # scheduler stopped
+        pass
 print( "stop" )
 # Local Variables: #
 # tab-width: 4 #
 # compile-command: "python3.5 ProdCons.py" #
+# compile-command: "python3.7 ProdCons.py" #
 # End: #

doc/papers/concurrency/examples/Refactor.py

r3c64c668	r58fe85a
26	26	# Local Variables: #
27	27	# tab-width: 4 #
28		# compile-command: "python3.5 Refactor.py" #
	28	# compile-command: "python3.7 Refactor.py" #
29	29	# End: #

doc/papers/concurrency/figures/FullCoroutinePhases.fig

-              r3c64c668
+              r58fe85a
 -2
 2
 1 0 1 0 7 100 0 -1 0.000 0 0 1 0 4575.000 2437.500 4275 1875 4575 1800 4875 1875
+1 0 1 0 7 100 0 -1 0.000 0 0 1 0 5175.000 2437.500 4875 1875 5175 1800 5475 1875
 1 1.00 45.00 90.00
 1 0 1 0 7 100 0 -1 0.000 0 0 1 0 4575.000 1537.500 4875 2100 4575 2175 4275 2100
+1 0 1 0 7 100 0 -1 0.000 0 0 1 0 5175.000 1537.500 5475 2100 5175 2175 4875 2100
 1 1.00 45.00 90.00
 1 0 1 0 7 50 -1 -1 0.000 0 1 1 0 4207.500 1642.500 4125 1425 3975 1650 4200 1875
+1 0 1 0 7 50 -1 -1 0.000 0 1 1 0 4807.500 1642.500 4725 1425 4575 1650 4800 1875
 1 1.00 45.00 90.00
+1575 1575 2700 2025
 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
 1 1.00 45.00 90.00
 …
 1 1.00 45.00 90.00
 1575 2400 1800
+1 0 100 0 4 10 0.0000 2 165 300 1725 1950 ping\001
+1 0 100 0 4 10 0.0000 2 135 360 2475 1950 pong\001
+-6
+3075 1575 4200 2025
+3075 1575 4200 2025
 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
 1 1.00 45.00 90.00
 1575 3300 1800
+1575 3300 1800
 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
 1 1.00 45.00 90.00
+2025 3300 2250
+1 0 100 0 0 10 0.0000 2 105 555 2100 1200 creation\001
+1 0 100 0 4 10 0.0000 2 165 300 1725 1950 ping\001
+1 0 100 0 4 10 0.0000 2 135 360 2475 1950 pong\001
+1 0 100 0 4 10 0.0000 2 165 300 3300 1950 ping\001
+1 0 100 0 4 10 0.0000 2 135 360 3300 2400 pong\001
+1 0 100 0 0 10 0.0000 2 105 675 4575 1200 execution\001
+1 0 100 0 4 10 0.0000 2 165 300 4275 2025 ping\001
+1 0 100 0 4 10 0.0000 2 135 360 4875 2025 pong\001
+1 0 100 0 0 10 0.0000 2 90 420 3300 1200 starter\001
+1575 3900 1800
+1 0 100 0 4 10 0.0000 2 165 300 3225 1950 ping\001
+1 0 100 0 4 10 0.0000 2 135 360 3975 1950 pong\001
+-6
+-6
 1 0 100 0 4 10 0.0000 2 165 705 2100 1500 pgm main\001
+1 0 100 0 4 10 0.0000 2 165 705 3300 1500 pgm main\001
+1 0 100 0 4 10 0.0000 2 165 705 4500 1500 pgm main\001
+1 0 100 0 4 10 0.0000 2 165 705 3600 1500 pgm main\001
+1 0 100 0 4 10 0.0000 2 165 300 4875 2025 ping\001
+1 0 100 0 4 10 0.0000 2 135 360 5475 2025 pong\001
+1 0 100 0 4 10 0.0000 2 165 705 5100 1500 pgm main\001
+1 0 100 0 2 10 0.0000 2 105 540 2100 1275 creator\001
+1 0 100 0 2 10 0.0000 2 105 495 3600 1275 starter\001
+1 0 100 0 2 10 0.0000 2 105 690 5175 1275 execution\001

doc/papers/concurrency/figures/RunTimeStructure.fig

-              r3c64c668
+              r58fe85a
 -2
 2
 3855 2775 4155 2925
 3 0 1 0 0 0 0 0 0.000 1 0.0000 3930 2850 30 30 3930 2850 3960 2880
 3 0 1 0 0 0 0 0 0.000 1 0.0000 4035 2850 30 30 4035 2850 4065 2880
+3255 2475 3555 2625
+3 0 1 0 0 0 0 0 0.000 1 0.0000 3330 2550 30 30 3330 2550 3360 2580
+3 0 1 0 0 0 0 0 0.000 1 0.0000 3435 2550 30 30 3435 2550 3465 2580
 -6
 4755 3525 5055 3675
 3 0 1 0 0 0 0 0 0.000 1 0.0000 4830 3600 30 30 4830 3600 4860 3630
 3 0 1 0 0 0 0 0 0.000 1 0.0000 4935 3600 30 30 4935 3600 4965 3630
+4155 3225 4455 3375
+3 0 1 0 0 0 0 0 0.000 1 0.0000 4230 3300 30 30 4230 3300 4260 3330
+3 0 1 0 0 0 0 0 0.000 1 0.0000 4335 3300 30 30 4335 3300 4365 3330
 -6
 4650 2775 4950 2925
 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 4725 2850 15 15 4725 2850 4740 2865
 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 4800 2850 15 15 4800 2850 4815 2865
 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 4875 2850 15 15 4875 2850 4890 2865
+4050 2475 4350 2625
+3 0 1 -1 -1 0 0 20 0.000 1 0.0000 4125 2550 15 15 4125 2550 4140 2565
+3 0 1 -1 -1 0 0 20 0.000 1 0.0000 4200 2550 15 15 4200 2550 4215 2565
+3 0 1 -1 -1 0 0 20 0.000 1 0.0000 4275 2550 15 15 4275 2550 4290 2565
 -6
 3225 2400 3525 2550
 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 3300 2475 15 15 3300 2475 3315 2490
 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 3375 2475 15 15 3375 2475 3390 2490
 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 3450 2475 15 15 3450 2475 3465 2490
+2625 2100 2925 2250
+3 0 1 -1 -1 0 0 20 0.000 1 0.0000 2700 2175 15 15 2700 2175 2715 2190
+3 0 1 -1 -1 0 0 20 0.000 1 0.0000 2775 2175 15 15 2775 2175 2790 2190
+3 0 1 -1 -1 0 0 20 0.000 1 0.0000 2850 2175 15 15 2850 2175 2865 2190
 -6
 5475 3450 5625 3750
 3 0 1 -1 -1 0 0 20 0.000 1 4.7120 5550 3525 15 15 5550 3525 5535 3540
 3 0 1 -1 -1 0 0 20 0.000 1 4.7120 5550 3600 15 15 5550 3600 5535 3615
 3 0 1 -1 -1 0 0 20 0.000 1 4.7120 5550 3675 15 15 5550 3675 5535 3690
+4875 3150 5025 3450
+3 0 1 -1 -1 0 0 20 0.000 1 4.7120 4950 3225 15 15 4950 3225 4935 3240
+3 0 1 -1 -1 0 0 20 0.000 1 4.7120 4950 3300 15 15 4950 3300 4935 3315
+3 0 1 -1 -1 0 0 20 0.000 1 4.7120 4950 3375 15 15 4950 3375 4935 3390
 -6
 4275 3525 4575 3675
 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 4350 3600 15 15 4350 3600 4365 3615
 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 4425 3600 15 15 4425 3600 4440 3615
 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 4500 3600 15 15 4500 3600 4515 3615
+3675 3225 3975 3375
+3 0 1 -1 -1 0 0 20 0.000 1 0.0000 3750 3300 15 15 3750 3300 3765 3315
+3 0 1 -1 -1 0 0 20 0.000 1 0.0000 3825 3300 15 15 3825 3300 3840 3315
+3 0 1 -1 -1 0 0 20 0.000 1 0.0000 3900 3300 15 15 3900 3300 3915 3315
 -6
+2175 4650 7050 4950
+3 0 1 0 0 0 0 0 0.000 1 0.0000 2250 4830 30 30 2250 4830 2280 4860
+1 0 1 -1 -1 0 0 -1 0.000 1 0.0000 4200 4800 150 75 4200 4800 4350 4875
+3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 3275 4800 100 100 3275 4800 3375 4800
+2 0 1 -1 -1 0 0 -1 0.000 0 0 0 0 0 5
+4950 5400 4725 5175 4725 5175 4950 5400 4950
+2 1 1 -1 -1 0 0 -1 3.000 0 0 0 0 0 5
+4950 6300 4950 6300 4725 6525 4725 6525 4950
+0 -1 0 0 0 10 0.0000 2 105 450 6600 4875 cluster\001
+0 -1 0 0 0 10 0.0000 2 105 660 5475 4875 processor\001
+0 -1 0 0 0 10 0.0000 2 105 555 4425 4875 monitor\001
+0 -1 0 0 0 10 0.0000 2 120 270 3450 4875 task\001
+0 -1 0 0 0 10 0.0000 2 105 660 2325 4875 coroutine\001
+2625 3825 4050 4125
+3750 3900 4050 4050
+3 0 1 -1 -1 0 0 20 0.000 1 0.0000 3825 3975 15 15 3825 3975 3840 3990
+3 0 1 -1 -1 0 0 20 0.000 1 0.0000 3900 3975 15 15 3900 3975 3915 3990
+3 0 1 -1 -1 0 0 20 0.000 1 0.0000 3975 3975 15 15 3975 3975 3990 3990
 -6
+3450 1275 3750 1425
+3 0 1 -1 -1 0 0 20 0.000 1 0.0000 3525 1350 15 15 3525 1350 3540 1365
+3 0 1 -1 -1 0 0 20 0.000 1 0.0000 3600 1350 15 15 3600 1350 3615 1365
+3 0 1 -1 -1 0 0 20 0.000 1 0.0000 3675 1350 15 15 3675 1350 3690 1365
+1 0 1 -1 -1 0 0 -1 0.000 1 0.0000 2850 3975 225 150 2850 3975 3075 4125
+1 0 1 -1 -1 0 0 -1 0.000 1 0.0000 3450 3975 225 150 3450 3975 3675 4125
 -6
+5550 1275 5850 1425
+3 0 1 -1 -1 0 0 20 0.000 1 0.0000 5625 1350 15 15 5625 1350 5640 1365
+3 0 1 -1 -1 0 0 20 0.000 1 0.0000 5700 1350 15 15 5700 1350 5715 1365
+3 0 1 -1 -1 0 0 20 0.000 1 0.0000 5775 1350 15 15 5775 1350 5790 1365
+6075 3825 6900 4125
+6600 3900 6900 4050
+3 0 1 -1 -1 0 0 20 0.000 1 0.0000 6675 3975 15 15 6675 3975 6690 3990
+3 0 1 -1 -1 0 0 20 0.000 1 0.0000 6750 3975 15 15 6750 3975 6765 3990
+3 0 1 -1 -1 0 0 20 0.000 1 0.0000 6825 3975 15 15 6825 3975 6840 3990
 -6
+3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 5550 2625 150 150 5550 2625 5700 2625
+3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 5550 3225 150 150 5550 3225 5700 3225
+3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 5550 3975 150 150 5550 3975 5700 3975
+3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 3525 2850 150 150 3525 2850 3675 2850
+3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 4200 2475 150 150 4200 2475 4350 2475
+3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 4425 2850 150 150 4425 2850 4575 2850
+3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 4650 2475 150 150 4650 2475 4800 2475
+3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 3525 3600 150 150 3525 3600 3675 3600
+3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 3975 3600 150 150 3975 3600 4125 3600
+3 0 1 0 0 0 0 0 0.000 1 0.0000 3525 3600 30 30 3525 3600 3555 3630
+3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 3750 2475 150 150 3750 2475 3900 2625
+3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 4875 3600 150 150 4875 3600 5025 3750
+3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 3975 2850 150 150 3975 2850 4125 2850
+3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 7200 2775 150 150 7200 2775 7350 2775
+1 0 1 -1 -1 0 0 -1 0.000 1 0.0000 4650 1350 225 150 4650 1350 4875 1500
+1 0 1 -1 -1 0 0 -1 0.000 1 0.0000 5250 1350 225 150 5250 1350 5475 1500
+1 0 1 -1 -1 0 0 -1 0.000 1 0.0000 4050 1350 225 150 4050 1350 4275 1500
+2 0 1 -1 -1 0 0 -1 0.000 0 0 0 0 0 5
+4200 2400 3750 1950 3750 1950 4200 2400 4200
+2 1 1 -1 -1 0 0 -1 4.000 0 0 0 0 0 5
+4500 6300 1800 3000 1800 3000 4500 6300 4500
+2 0 1 -1 -1 0 0 -1 0.000 0 0 0 0 0 5
+2850 5775 2400 5325 2400 5325 2850 5775 2850
+2 0 1 -1 -1 0 0 -1 0.000 0 0 0 0 0 5
+4200 5775 3750 5325 3750 5325 4200 5775 4200
+1 0 1 -1 -1 0 0 -1 0.000 1 0.0000 6300 3975 225 150 6300 3975 6525 4125
+-6
+6075 3225 7425 3675
 1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 1 0 2
 1 1.00 45.00 90.00
 3975 5325 3975
+3450 6375 3450
 1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 1 0 2
 1 1.00 45.00 90.00
+3225 5325 3225
+3450 6750 3450
+2 0 1 -1 -1 0 0 -1 0.000 0 0 0 0 0 5
+3675 7200 3225 6750 3225 6750 3675 7200 3675
 1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 1 0 2
 1 1.00 45.00 90.00
+2625 5325 2625
+3450 7425 3450
+-6
+3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 4950 2325 150 150 4950 2325 5100 2325
+3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 4950 2925 150 150 4950 2925 5100 2925
+3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 4950 3675 150 150 4950 3675 5100 3675
+3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 2925 2550 150 150 2925 2550 3075 2550
+3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 3600 2175 150 150 3600 2175 3750 2175
+3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 3825 2550 150 150 3825 2550 3975 2550
+3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 4050 2175 150 150 4050 2175 4200 2175
+3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 3375 3300 150 150 3375 3300 3525 3300
+3 0 1 0 0 0 0 0 0.000 1 0.0000 2925 3300 30 30 2925 3300 2955 3330
+3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 3150 2175 150 150 3150 2175 3300 2325
+3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 4275 3300 150 150 4275 3300 4425 3450
+3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 3375 2550 150 150 3375 2550 3525 2550
+3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 6600 2475 150 150 6600 2475 6750 2475
+3 0 1 0 0 0 0 0 0.000 1 0.0000 1650 4530 30 30 1650 4530 1680 4560
+3 0 1 0 0 0 0 0 0.000 1 0.0000 6600 2475 30 30 6600 2475 6630 2505
+3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 2925 3300 150 150 2925 3300 3075 3300
+3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 3275 4500 100 100 3275 4500 3375 4500
+1 0 1 -1 -1 0 0 -1 0.000 1 0.0000 4050 4500 150 75 4050 4500 4200 4575
+2 0 1 -1 -1 0 0 -1 0.000 0 0 0 0 0 5
+3900 1800 3450 1350 3450 1350 3900 1800 3900
+2 1 1 -1 -1 0 0 -1 4.000 0 0 0 0 0 5
+4200 5700 1500 2400 1500 2400 4200 5700 4200
+2 0 1 -1 -1 0 0 -1 0.000 0 0 0 0 0 5
+2550 5175 2100 4725 2100 4725 2550 5175 2550
+2 0 1 -1 -1 0 0 -1 0.000 0 0 0 0 0 5
+3900 5175 3450 4725 3450 4725 3900 5175 3900
 1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 1 0 2
 1 1.00 45.00 90.00
 3975 5925 3975
+3675 4725 3675
 1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 1 0 2
 1 1.00 45.00 90.00
 3225 5925 3225
+2925 4725 2925
 1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 1 0 2
 1 1.00 45.00 90.00
+2625 5925 2625
+1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 0 0 2
+3975 5175 2625
+2325 4725 2325
 1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 1 0 2
 1 1.00 45.00 90.00
 3975 5925 2025
+3675 5325 3675
 1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 1 0 2
 1 1.00 45.00 90.00
 3750 6225 3750
+2925 5325 2925
 1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 1 0 2
 1 1.00 45.00 90.00
+2625 3225 2625
+2325 5325 2325
+1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 0 0 2
+3675 4575 2325
+1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+3675 5325 1725
+1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+3450 5625 3450
+1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+2325 2625 2325
 1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 1 0 3
 1 1.00 45.00 90.00
 2025 4200 2025 4200 2250
+1725 3600 1725 3600 1950
 1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 0 0 2
 2625 3225 3600
+2325 2625 3300
 1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 1 0 2
 1 1.00 45.00 90.00
 3600 3375 3600
+3300 2775 3300
 1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 1 0 2
 1 1.00 45.00 90.00
 3600 3825 3600
+3300 3225 3300
 1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 1 0 2
 1 1.00 45.00 90.00
 3600 4275 3600
+3300 3675 3300
 1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 1 0 2
 1 1.00 45.00 90.00
 3600 4725 3600
+3300 4125 3300
 1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 1 0 2
 1 1.00 45.00 90.00
 3600 5175 3600
+3300 4575 3300
 2 0 1 -1 -1 0 0 -1 0.000 0 0 0 0 0 5
 3450 5775 3000 5325 3000 5325 3450 5775 3450
+3150 5175 2700 4725 2700 4725 3150 5175 3150
 2 1 1 -1 -1 0 0 -1 4.000 0 0 0 0 0 5
 4500 8100 1800 6600 1800 6600 4500 8100 4500
+4200 7500 1500 6000 1500 6000 4200 7500 4200
 1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 1 0 2
 1 1.00 45.00 90.00
+3975 6975 3975
+1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+2775 6825 2775
+2475 6225 2475
 1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 0 0 2
+2775 6825 3975
+1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+3975 7350 3975
+2 0 1 -1 -1 0 0 -1 0.000 0 0 0 0 0 5
+4200 7800 3750 7350 3750 7350 4200 7800 4200
+1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+3975 8025 3975
+2475 6225 3450
 1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 1 0 4
 1 1.00 45.00 90.00
+3975 7875 2325 7200 2325 7200 2550
+1 -1 0 0 0 10 0.0000 2 105 720 5550 4425 Processors\001
+1 -1 0 0 0 10 0.0000 2 120 1005 4200 3225 Blocked Tasks\001
+1 -1 0 0 0 10 0.0000 2 150 870 4200 3975 Ready Tasks\001
+1 -1 0 0 0 10 0.0000 2 135 1095 7350 1725 Other Cluster(s)\001
+1 -1 0 0 0 10 0.0000 2 105 840 4650 1725 User Cluster\001
+1 -1 0 0 0 10 0.0000 2 150 615 2175 3675 Manager\001
+1 -1 0 0 0 10 0.0000 2 105 990 2175 3525 Discrete-event\001
+1 -1 0 0 0 10 0.0000 2 135 795 2175 4350 preemption\001
+3450 7275 2025 6600 2025 6600 2250
+2 0 1 -1 -1 0 0 -1 0.000 0 0 0 0 0 5
+4650 5250 4425 5025 4425 5025 4650 5250 4650
+2 1 1 -1 -1 0 0 -1 3.000 0 0 0 0 0 5
+4650 6150 4650 6150 4425 6375 4425 6375 4650
+1 -1 0 0 0 10 0.0000 2 105 720 4950 4125 Processors\001
+1 -1 0 0 0 10 0.0000 2 120 1005 3600 2925 Blocked Tasks\001
+1 -1 0 0 0 10 0.0000 2 150 870 3600 3675 Ready Tasks\001
+1 -1 0 0 0 10 0.0000 2 135 1095 6750 1425 Other Cluster(s)\001
+1 -1 0 0 0 10 0.0000 2 105 840 4050 1425 User Cluster\001
+1 -1 0 0 0 10 0.0000 2 150 615 1575 3375 Manager\001
+1 -1 0 0 0 10 0.0000 2 105 990 1575 3225 Discrete-event\001
+1 -1 0 0 0 10 0.0000 2 135 795 1575 4050 preemption\001
+0 -1 0 0 0 10 0.0000 2 150 1365 1725 4575 generator/coroutine\001
+0 -1 0 0 0 10 0.0000 2 120 270 3450 4575 task\001
+0 -1 0 0 0 10 0.0000 2 105 450 6450 4575 cluster\001
+0 -1 0 0 0 10 0.0000 2 105 660 5325 4575 processor\001
+0 -1 0 0 0 10 0.0000 2 105 555 4275 4575 monitor\001

doc/papers/concurrency/mail

-              r3c64c668
+              r58fe85a
 Dear Dr Buhr,
 Your manuscript entitled "Concurrency in C∀" has been received by Software:
+Your manuscript entitled "Concurrency in Cforall" has been received by Software:
 Practice and Experience. It will be given full consideration for publication in
 the journal.
 …
 Dear Dr Buhr,
 Many thanks for submitting SPE-18-0205 entitled "Concurrency in C∀" to Software: Practice and Experience.
+Many thanks for submitting SPE-18-0205 entitled "Concurrency in Cforall" to Software: Practice and Experience.
 In view of the comments of the referees found at the bottom of this letter, I cannot accept your paper for publication in Software: Practice and Experience. I hope that you find the referees' very detailed comments helpful.

doc/papers/concurrency/mail2

-              r3c64c668
+              r58fe85a
 Software: Practice and Experience Editorial Office
+Date: Tue, 12 Nov 2019 22:25:17 +0000
+From: Richard Jones <onbehalfof@manuscriptcentral.com>
+Reply-To: R.E.Jones@kent.ac.uk
+To: tdelisle@uwaterloo.ca, pabuhr@uwaterloo.ca
+Subject: Software: Practice and Experience - Decision on Manuscript ID
+ SPE-19-0219
+-Nov-2019
+Dear Dr Buhr,
+Many thanks for submitting SPE-19-0219 entitled "Advanced Control-flow and Concurrency in Cforall" to Software: Practice and Experience. The paper has now been reviewed and the comments of the referees are included at the bottom of this letter.
+The decision on this paper is that substantial further work is required. The referees have a number of substantial concerns. All the reviewers found the submission very hard to read; two of the reviewers state that it needs very substantial restructuring. These concerns must be addressed before your submission can be considered further.
+A revised version of your manuscript that takes into account the comments of the referees will be reconsidered for publication.
+Please note that submitting a revision of your manuscript does not guarantee eventual acceptance, and that your revision will be subject to re-review by the referees before a decision is rendered.
+You have 90 days from the date of this email to submit your revision. If you are unable to complete the revision within this time, please contact me to request an extension.
+You can upload your revised manuscript and submit it through your Author Center. Log into https://mc.manuscriptcentral.com/spe  and enter your Author Center, where you will find your manuscript title listed under "Manuscripts with Decisions".
+When submitting your revised manuscript, you will be able to respond to the comments made by the referee(s) in the space provided.  You can use this space to document any changes you make to the original manuscript.
+If you feel that your paper could benefit from English language polishing, you may wish to consider having your paper professionally edited for English language by a service such as Wiley's at http://wileyeditingservices.com. Please note that while this service will greatly improve the readability of your paper, it does not guarantee acceptance of your paper by the journal.
+Once again, thank you for submitting your manuscript to Software: Practice and Experience and I look forward to receiving your revision.
+Sincerely,
+Prof. Richard Jones
+Software: Practice and Experience
+R.E.Jones@kent.ac.uk
+Referee(s)' Comments to Author:
+Reviewing: 1
+Comments to the Author
+This article presents the design and rationale behind the various
+threading and synchronization mechanisms of C-forall, a new low-level
+programming language.  This paper is very similar to a companion paper
+which I have also received: as the papers are similar, so will these
+reviews be --- in particular any general comments from the other
+review apply to this paper also.
+As far as I can tell, the article contains three main ideas: an
+asynchronous execution / threading model; a model for monitors to
+provide mutual exclusion; and an implementation.  The first two ideas
+are drawn together in Table 1: unfortunately this is on page 25 of 30
+pages of text. Implementation choices and descriptions are scattered
+throughout the paper - and the sectioning of the paper seems almost
+arbitrary.
+The article is about its contributions.  Simply adding feature X to
+language Y isn't by itself a contribution, (when feature X isn't
+already a contribution).  The contribution can be in the design: the
+motivation, the space of potential design options, the particular
+design chosen and the rationale for that choice, or the resulting
+performance.  For example: why support two kinds of generators as well
+as user-level threads?  Why support both low and high level
+synchronization constructs?  Similarly I would have found the article
+easier to follow if it was written top down, presenting the design
+principles, present the space of language features, justify chosen
+language features (and rationale) and those excluded, and then present
+implementation, and performance.
+Then the writing of the article is often hard to follow, to say the
+least. Two examples: section 3 "stateful functions" - I've some idea
+what that is (a function with Algol's "own" or C's "static" variables?)
+but in fact the paper has a rather more specific idea than that. The
+top of page 3 throws a whole lot of definitions at the reader
+"generator" "coroutine" "stackful" "stackless" "symmetric"
+"asymmetric" without ever stopping to define each one --- but then in
+footnote "C" takes the time to explain what C's "main" function is?  I
+cannot imagine a reader of this paper who doesn't know what "main" is
+in C; especially if they understand the other concepts already
+presented in the paper.  The start of section 3 then does the same
+thing: putting up a whole lot of definitions, making distinctions and
+comparisons, even talking about some runtime details, but the critical
+definition of a monitor doesn't appear until three pages later, at the
+start of section 5 on p15, lines 29-34 are a good, clear, description
+of what a monitor actually is.  That needs to come first, rather than
+being buried again after two sections of comparisons, discussions,
+implementations, and options that are ungrounded because they haven't
+told the reader what they are actually talking about.  First tell the
+reader what something is, then how they might use it (as programmers:
+what are the rules and restrictions) and only then start comparison
+with other things, other approaches, other languages, or
+implementations.
+The description of the implementation is similarly lost in the trees
+without ever really seeing the wood. Figure 19 is crucial here, but
+it's pretty much at the end of the paper, and comments about
+implementations are threaded throughout the paper without the context
+(fig 19) to understand what's going on.   The protocol for performance
+testing may just about suffice for C (although is N constantly ten
+million, or does it vary for each benchmark) but such evaluation isn't
+appropriate for garbage-collected or JITTed languages like Java or Go.
+other comments working through the paper - these are mostly low level
+and are certainly not comprehensive.
+p1 only a subset of C-forall extensions?
+p1 "has features often associated with object-oriented programming
+languages, such as constructors, destructors, virtuals and simple
+inheritance."   There's no need to quibble about this. Once a language
+has inheritance, it's hard to claim it's not object-oriented.
+p2 barging? signals-as-hints?
+p3 start your discussion of generators with a simple example of a
+C-forall generator.  Fig 1(b) might do: but put it inline instead of
+the python example - and explain the key rules and restrictions on the
+construct.  Then don't even start to compare with coroutines until
+you've presented, described and explained your coroutines...
+p3 I'd probably leave out the various "C" versions unless there are
+key points to make you can't make in C-forall. All the alternatives
+are just confusing.
+p4 but what's that "with" in Fig 1(B)
+p5 start with the high level features of C-forall generators...
+p5 why is the paper explaining networking protocols?
+p7 lines 1-9 (transforming generator to coroutine - why would I do any
+of this? Why would I want one instead of the other (do not use "stack"
+in your answer!)
+p10 last para "A coroutine must retain its last resumer to suspend
+back because the resumer is on a different stack. These reverse
+pointers allow suspend to cycle backwards, "  I've no idea what is
+going on here?  why should I care?  Shouldn't I just be using threads
+instead?  why not?
+p16 for the same reasons - what reasons?
+p17 if the multiple-monitor entry procedure really is novel, write a
+paper about that, and only about that.
+p23 "Loose Object Definitions" - no idea what that means.  in that
+section: you can't leave out JS-style dynamic properties.  Even in
+OOLs that (one way or another) allow separate definitions of methods
+(like Objective-C, Swift, Ruby, C#) at any time a runtime class has a
+fixed definition.  Quite why the detail about bit mask implementation
+is here anyway, I've no idea.
+p25 this cluster isn't a CLU cluster then?
+* conclusion should conclude the paper, not the related.
+Reviewing: 2
+Comments to the Author
+This paper describes the concurrency features of an extension of C (whose name I will write as "C\/" here, for convenience), including much design-level discussion of the coroutine- and monitor-based features and some microbenchmarks exploring the current implementation's performance. The key message of the latter is that the system's concurrency abstractions are much lighter-weight than the threading found in mainstream C or Java implementations.
+There is much description of the system and its details, but nothing about (non-artificial) uses of it. Although the microbenchmark data is encouraging, arguably not enough practical experience with the system has been reported here to say much about either its usability advantages or its performance.
+As such, the main contribution of the paper seems to be to document the existence of the described system and to provide a detailed design rationale and (partial) tutorial. I believe that could be of interest to some readers, so an acceptable manuscript is lurking in here somewhere.
+Unfortunately, at present the writing style is somewhere between unclear and infuriating. It omits to define terms; it uses needlessly many terms for what are apparently (but not clearly) the same things; it interrupts itself rather than deliver the natural consequent of whatever it has just said; and so on. Section 5 is particularly bad in these regards -- see my detailed comments below. Fairly major additional efforts will be needed to turn the present text into a digestible design-and-tutorial document. I suspect that a shorter paper could do this job better than the present manuscript, which is overwrought in parts.
+p2: lines 4--9 are a little sloppy. It is not the languages but their popular implementations which "adopt" the 1:1 kernel threading model.
+line 10: "medium work" -- "medium-sized work"?
+line 18: "is all sequential to the compiler" -- not true in modern compilers, and in 2004 H-J Boehm wrote a tech report describing exactly why ("Threads cannot be implemented as a library", HP Labs).
+line 20: "knows the optimization boundaries" -- I found this vague. What's an example?
+line 31: this paragraph has made a lot of claims. Perhaps forward-reference to the parts of the paper that discuss each one.
+line 33: "so the reader can judge if" -- this reads rather passive-aggressively. Perhaps better: "... to support our argument that..."
+line 41: "a dynamic partitioning mechanism" -- I couldn't tell what this meant
+p3. Presenting the concept of a "stateful function" as a new language feature seems odd. In C, functions often have local state thanks to static local variables (or globals, indeed). Of course, that has several limitations. Can you perhaps present your contributions by enumerating these limitations? See also my suggestion below about a possible framing centred on a strawman.
+line 2: "an old idea that is new again" -- this is too oblique
+lines 2--15: I found this to be a word/concept soup. Stacks, closures, generators, stackless stackful, coroutine, symmetric, asymmetric, resume/suspend versus resume/resume... there needs to be a more gradual and structured way to introduce all this, and ideally one that minimises redundancy. Maybe present it as a series of "definitions" each with its own heading, e.g. "A closure is stackless if its local state has statically known fixed size"; "A generator simply means a stackless closure." And so on. Perhaps also strongly introduce the word "activate" as a direct contrast with resume and suspend. These are just a flavour of the sort of changes that might make this paragraph into something readable.
+Continuing the thought: I found it confusing that by these definitions, a stackful closure is not a stack, even though logically the stack *is* a kind of closure (it is a representation of the current thread's continuation).
+lines 24--27: without explaining what the boost functor types mean, I don't think the point here comes across.
+line 34: "semantically coupled" -- I wasn't sure what this meant
+p4: the point of Figure 1 (C) was not immediately clear. It seems to be showing how one might "compile down" Figure 1 (B). Or is that Figure 1 (A)?
+It's right that the incidental language features of the system are not front-and-centre, but I'd appreciate some brief glossing of non-C language features as they appear. Examples are the square bracket notation, the pipe notation and the constructor syntax. These explanations could go in the caption of the figure which first uses them, perhaps. Overall I found the figure captions to be terse, and a missed opportunity to explain clearly what was going on.
+p5 line 23: "This restriction is removed..." -- give us some up-front summary of your contributions and the elements of the language design that will be talked about, so that this isn't an aside. This will reduce the "twisty passages" feeling that characterises much of the paper.
+line 40: "a killer asymmetric generator" -- this is stylistically odd, and the sentence about failures doesn't convincingly argue that C\/ will help with them. Have you any experience writing device drivers using C\/? Or any argument that the kinds of failures can be traced to the "stack-ripping" style that one is forced to use without coroutines? Also, a typo on line 41: "device drives". And saying "Windows/Linux" is sloppy... what does the cited paper actually say?
+p6 lines 13--23: this paragraph is difficult to understand. It seems to be talking about a control-flow pattern roughly equivalent to tail recursion. What is the high-level point, other than that this is possible?
+line 34: "which they call coroutines" -- a better way to make this point is presumably that the C++20 proposal only provides a specialised kind of coroutine, namely generators, despite its use of the more general word.
+line 47: "... due to dynamic stack allocation, execution..." -- this sentence doesn't scan. I suggest adding "and for" in the relevant places where currently there are only commas.
+p8 / Figure 5 (B) -- the GNU C extension of unary "&&" needs to be explained. The whole figure needs a better explanation, in fact.
+p9, lines 1--10: I wasn't sure this stepping-through really added much value. What are the truly important points to note about this code?
+p10: similarly, lines 3--27 again are somewhere between tedious and confusing. I'm sure the motivation and details of "starter semantics" can both be stated much more pithily.
+line 32: "a self-resume does not overwrite the last resumer" -- is this a hack or a defensible principled decision?
+p11: "a common source of errors" -- among beginners or among production code? Presumably the former.
+line 23: "with builtin and library" -- not sure what this means
+lines 31--36: these can be much briefer. The only important point here seems to be that coroutines cannot be copied.
+p12: line 1: what is a "task"? Does it matter?
+line 7: calling it "heap stack" seems to be a recipe for confusion. "Stack-and-heap" might be better, and contrast with "stack-and-VLS" perhaps. When "VLS" is glossed, suggest actually expanding its initials: say "length" not "size".
+line 21: are you saying "cooperative threading" is the same as "non-preemptive scheduling", or that one is a special case (kind) of the other? Both are defensible, but be clear.
+line 27: "mutual exclusion and synchronization" -- the former is a kind of the latter, so I suggest "and other forms of synchronization".
+line 30: "can either be a stackless or stackful" -- stray "a", but also, this seems to be switching from generic/background terminology to C\/-specific terminology.
+An expositional idea occurs: start the paper with a strawman naive/limited realisation of coroutines -- say, Simon Tatham's popular "Coroutines in C" web page -- and identify point by point what the limitations are and how C\/ overcomes them. Currently the presentation is often flat (lacking motivating contrasts) and backwards (stating solutions before problems). The foregoing approach might fix both of these.
+page 13: line 23: it seems a distraction to mention the Python feature here.
+p14 line 5: it seems odd to describe these as "stateless" just because they lack shared mutable state. It means the code itself is even more stateful. Maybe the "stack ripping" argument could usefully be given here.
+line 16: "too restrictive" -- would be good to have a reference to justify this, or at least give a sense of what the state-of-the-art performance in transactional memory systems is (both software and hardware)
+line 22: "simulate monitors" -- what about just *implementing* monitors? isn't that what these systems do? or is the point more about refining them somehow into something more specialised?
+p15: sections 4.1 and 4.2 seem adrift and misplaced. Split them into basic parts (which go earlier) and more advanced parts (e.g. barging, which can be explained later).
+line 31: "acquire/release" -- misses an opportunity to contrast the monitor's "enter/exit" abstraction with the less structured acquire/release of locks.
+p16 line 12: the "implicit" versus "explicit" point is unclear. Is it perhaps about the contrast between an opt-in *discipline* and a language-enforced *guarantee*?
+line 28: no need to spend ages dithering about which one is default and which one is the explicit qualifier. Tell us what you decided, briefly justify it, and move on.
+p17: Figure 11: since the main point seems to be to highlight bulk acquire, include a comment which identifies the line where this is happening.
+line 2: "impossible to statically..." -- or dynamically. Doing it dynamically would be perfectly acceptable (locking is a dynamic operation after all)
+"guarantees acquisition order is consistent" -- assuming it's done in a single bulk acquire.
+p18: section 5.3: the text here is a mess. The explanations of "internal" versus "external" scheduling are unclear, and "signals as hints" is not explained. "... can cause thread starvation" -- means including a while loop, or not doing so? "There are three signalling mechanisms.." but the text does not follow that by telling us what they are. My own scribbled attempt at unpicking the internal/external thing: "threads already in the monitor, albeit waiting, have priority over those trying to enter".
+p19: line 3: "empty condition" -- explain that condition variables don't store anything. So being "empty" means that the queue of waiting threads (threads waiting to be signalled that the condition has become true) is empty.
+line 6: "... can be transformed into external scheduling..." -- OK, but give some motivation.
+p20: line 6: "mechnaism"
+lines 16--20: this is dense and can probably only be made clear with an example
+p21 line 21: clarify that nested monitor deadlock was described earlier (in 5.2). (Is the repetition necessary?)
+line 27: "locks, and by extension monitors" -- this is true but the "by extension" argument is faulty. It is perfectly possible to use locks as a primitive and build a compositional mechanism out of them, e.g. transactions.
+p22 line 2: should say "restructured"
+line 33: "Implementing a fast subset check..." -- make clear that the following section explains how to do this. Restructuring the sections themselves could do this, or noting in the text.
+p23: line 3: "dynamic member adding, eg, JavaScript" -- needs to say "as permitted in JavaScript", and "dynamically adding members" is stylistically better
+p23: line 18: "urgent stack" -- back-reference to where this was explained before
+p24 line 7: I did not understand what was more "direct" about "direct communication". Also, what is a "passive monitor" -- just a monitor, given that monitors are passive by design?
+line 14 / section 5.9: this table was useful and it (or something like it) could be used much earlier on to set the structure of the rest of the paper. The explanation at present is too brief, e.g. I did not really understand the point about cases 7 and 8.
+p25 line 2: instead of casually dropping in a terse explanation for the newly introduced term "virtual processor", introduce it properly. Presumably the point is to give a less ambiguous meaning to "thread" by reserving it only for C\/'s green threads.
+Table 1: what does "No / Yes" mean?
+p26 line 15: "transforms user threads into fibres" -- a reference is needed to explain what "fibres" means... guessing it's in the sense of Adya et al.
+line 20: "Microsoft runtime" -- means Windows?
+lines 21--26: don't say "interrupt" to mean "signal", especially not without clear introduction. You can use "POSIX signal" to disambiguate from condition variables' "signal".
+p27 line 3: "frequency is usually long" -- that's a "time period" or "interval", not a frequency
+line 5: the lengthy quotation is not really necessary; just paraphrase the first sentence and move on.
+line 20: "to verify the implementation" -- I don't think that means what is intended
+Tables in section 7 -- too many significant figures. How many overall runs are described? What is N in each case?
+p29 line 2: "to eliminate this cost" -- arguably confusing since nowadays on commodity CPUs most of the benefits of inlining are not to do with call overheads, but from later optimizations enabled as a consequence of the inlining
+line 41: "a hierarchy" -- are they a hierarchy? If so, this could be explained earlier. Also, to say these make up "an integrated set... of control-flow features" verges on the tautologous.
+p30 line 15: "a common case being web servers and XaaS" -- that's two cases
+Reviewing: 3
+Comments to the Author
+# Cforall review
+Overall, I quite enjoyed reading the paper. Cforall has some very interesting ideas. I did have some suggestions that I think would be helpful before final publication. I also left notes on various parts of the paper that I find confusing when reading, in hopes that it may be useful to you.
+## Summary
+* Expand on the motivations for including both generator and coroutines, vs trying to build one atop the other
+* Expand on the motivations for having both symmetric and asymmetric coroutines
+* Comparison to async-await model adopted by other languages
+    * C#, JS
+    * Rust and its async/await model
+* Consider performance comparisons against node.js and Rust frameworks
+* Discuss performance of monitors vs finer-grained memory models and atomic operations found in other languages
+* Why both internal/external scheduling for synchronization?
+## Generator/coroutines
+In general, this section was clear, but I thought it would be useful to provide a somewhat deeper look into why Cforall opted for the particular combination of features that it offers. I see three main differences from other languages:
+* Generators are not exposed as a "function" that returns a generator object, but rather as a kind of struct, with communication happening via mutable state instead of "return values". That is, the generator must be manually resumed and (if I understood) it is expected to store values that can then later be read (perhaps via methods), instead of having a `yield <Expr>` statement that yields up a value explicitly.
+* Both "symmetric" and "asymmetric" generators are supported, instead of only asymmetric.
+* Coroutines (multi-frame generators) are an explicit mechanism.
+In most other languages, coroutines are rather built by layering single-frame generators atop one another (e.g., using a mechanism like async-await), and symmetric coroutines are basically not supported. I'd like to see a bit more justification for Cforall including all the above mechanisms -- it seemed like symmetric coroutines were a useful building block for some of the user-space threading and custom scheduler mechanisms that were briefly mentioned later in the paper.
+In the discussion of coroutines, I would have expected a bit more of a comparison to the async-await mechanism offered in other languages. Certainly the semantics of async-await in JavaScript implies significantly more overhead (because each async fn is a distinct heap object). [Rust's approach avoids this overhead][zc], however, and might be worthy of a comparison (see the Performance section).
+## Locks and threading
+### Comparison to atomics overlooks performance
+There are several sections in the paper that compare against atomics -- for example, on page 15, the paper shows a simple monitor that encapsulates an integer and compares that to C++ atomics. Later, the paper compares the simplicity of monitors against the `volatile` quantifier from Java. The conclusion in section 8 also revisits this point.
+While I agree that monitors are simpler, they are obviously also significantly different from a performance perspective -- the paper doesn't seem to address this at all. It's plausible that (e.g.) the `Aint` monitor type described in the paper can be compiled and mapped to the specialized instructions offered by hardware, but I didn't see any mention of how this would be done. There is also no mention of the more nuanced memory ordering relations offered by C++11 and how one might achieve similar performance characteristics in Cforall (perhaps the answer is that one simply doesn't need to; I think that's defensible, but worth stating explicitly).
+### Justification for external scheduling feels lacking
+Cforall includes both internal and external scheduling; I found the explanation for the external scheduling mechanism to be lacking in justification. Why include both mechanisms when most languages seem to make do with only internal scheduling? It would be useful to show some scenarios where external scheduling is truly more powerful.
+I would have liked to see some more discussion of external scheduling and how it interacts with software engineering best practices. It seems somewhat similar to AOP in certain regards. It seems to add a bit of "extra semantics" to monitor methods, in that any method may now also become a kind of synchronization point. The "open-ended" nature of this feels like it could easily lead to subtle bugs, particularly when code refactoring occurs (which may e.g. split an existing method into two). This seems particularly true if external scheduling can occur across compilation units -- the paper suggested that this is true, but I wasn't entirely clear.
+I would have also appreciated a few more details on how external scheduling is implemented. It seems to me that there must be some sort of "hooks" on mutex methods so that they can detect whether some other function is waiting on them and awaken those blocked threads. I'm not sure how such hooks are inserted, particularly across compilation units. The material in Section 5.6 didn't quite clarify the matter for me. For example, it left me somewhat confused about whether the `f` and `g` functions declared were meant to be local to a translation unit, or shared with other units.
+### Presentation of monitors is somewhat confusing
+I found myself confused fairly often in the section on monitors. I'm just going to leave some notes here on places where I got confused, in the hope that they could be useful to you as feedback on writing that might need to be clarified.
+To start, I did not realize that the `mutex_opt` notation was a keyword, I thought it was a type annotation. I think this could be called out more explicitly.
+Later, in section 5.2, the paper discusses `nomutex` annotations, which initially threw me, as they had not been introduced (now I realize that this paragraph is there to justify why there is no such keyword). The paragraph might be rearranged to make that clearer, perhaps by leading with the choice that Cforall made.
+On page 17, the paper states that "acquiring multiple monitors is safe from deadlock", but this could be stated a bit more precisely: acquiring multiple monitors in a bulk-acquire is safe from deadlock (deadlock can still result from nested acquires).
+On page 18, the paper states that wait states do not have to be enclosed in loops, as there is no concern of barging. This seems true but there are also other reasons to use loops (e.g., if there are multiple reasons to notify on the same condition). Thus the statement initially surprised me, as barging is only one of many reasons that I typically employ loops around waits.
+I did not understand the diagram in Figure 12 for some time. Initially, I thought that it was generic to all monitors, and I could not understand the state space. It was only later that I realized it was specific to your example. Updating the caption from "Monitor scheduling" to "Monitor scheduling in the example from Fig 13" might have helped me quite a bit.
+I spent quite some time reading the boy/girl dating example (\*) and I admit I found it somewhat confusing. For example, I couldn't tell whether there were supposed to be many "girl" threads executing at once, or if there was only supposed to be one girl and one boy thread executing in a loop. Are the girl/boy threads supposed to invoke the girl/boy methods or vice versa? Surely there is some easier way to set this up? I believe that when reading the paper I convinced myself of how it was supposed to be working, but I'm writing this review some days later, and I find myself confused all over again and not able to easily figure it out.
+(\*) as an aside, I would consider modifying the example to some other form of matching, like customers and support personnel.
+## Related work
+The paper offered a number of comparisons to Go, C#, Scala, and so forth, but seems to have overlooked another recent language, Rust. In many ways, Rust seems to be closest in philosophy to Cforall, so it seems like an odd omission. I already mentioned above that Rust is in the process of shipping [async-await syntax][aa], which is definitely an alternative to the generator/coroutine approach in Cforall (though one with clear pros/cons).
+## Performance
+In the performance section in particular, you might consider comparing against some of the Rust web servers and threading systems. For example, actix is top of the [single query TechEmpower Framework benchmarks][sq], and tokio is near the top of the [plaintext benchmarks][pt] (hyper, the top, is more of an HTTP framework, though it is also written in Rust). It would seem worth trying to compare their "context switching" costs as well -- I believe both actix and tokio have a notion of threads that could be readily compared.
+Another addition that might be worth considering is to compare against node.js promises, although I think the comparison to process creation is not as clean.
+That said, I think that the performance comparison is not a big focus of the paper, so it may not be necessary to add anything to it.
+## Authorship of this review
+I'm going to sign this review. This review was authored by Nicholas D. Matsakis. In the interest of full disclosure, I'm heavily involved in the Rust project, although I don't think that influenced this review in particular. Feel free to reach out to me for clarifying questions.
+## Links
+[aa]: https://blog.rust-lang.org/2019/09/30/Async-await-hits-beta.html
+[zc]: https://aturon.github.io/blog/2016/08/11/futures/
+[sq]: https://www.techempower.com/benchmarks/#section=data-r18&hw=ph&test=db
+[pt]: https://www.techempower.com/benchmarks/#section=data-r18&hw=ph&test=plaintext
+Subject: Re: manuscript SPE-19-0219
+To: "Peter A. Buhr" <pabuhr@uwaterloo.ca>
+From: Richard Jones <R.E.Jones@kent.ac.uk>
+Date: Tue, 12 Nov 2019 22:43:55 +0000
+Dear Dr Buhr
+You should have received a decision letter on this today. I am sorry that this
+has taken so long. Unfortunately SP&E receives a lot of submissions and getting
+reviewers is a perennial problem.
+Regards
+Richard
+Peter A. Buhr wrote on 11/11/2019 13:10:
+>     26-Jun-2019
+>     Your manuscript entitled "Advanced Control-flow and Concurrency in Cforall"
+>     has been received by Software: Practice and Experience. It will be given
+>     full consideration for publication in the journal.
+>
+> Hi, it has been over 4 months since submission of our manuscript SPE-19-0219
+> with no response.
+>
+> Currently, I am refereeing a paper for IEEE that already cites our prior SP&E
+> paper and the Master's thesis forming the basis of the SP&E paper under
+> review. Hence our work is apropos and we want to get it disseminated as soon as
+> possible.
+>
+> [3] A. Moss, R. Schluntz, and P. A. Buhr, "Cforall: Adding modern programming
+>      language features to C," Software - Practice and Experience, vol. 48,
+>      no. 12, pp. 2111-2146, 2018.
+>
+> [4] T. Delisle, "Concurrency in C for all," Master's thesis, University of
+>      Waterloo, 2018.  [Online].  Available:
+>      https://uwspace.uwaterloo.ca/bitstream/handle/10012/12888
+Date: Mon, 13 Jan 2020 05:33:15 +0000
+From: Richard Jones <onbehalfof@manuscriptcentral.com>
+Reply-To: R.E.Jones@kent.ac.uk
+To: pabuhr@uwaterloo.ca
+Subject: Revision reminder - SPE-19-0219
+-Jan-2020
+Dear Dr Buhr
+SPE-19-0219
+This is a reminder that your opportunity to revise and re-submit your
+manuscript will expire 28 days from now. If you require more time please
+contact me directly and I may grant an extension to this deadline, otherwise
+the option to submit a revision online, will not be available.
+I look forward to receiving your revision.
+Sincerely,
+Prof. Richard Jones
+Editor, Software: Practice and Experience
+https://mc.manuscriptcentral.com/spe
+Date: Wed, 5 Feb 2020 04:22:18 +0000
+From: Aaron Thomas <onbehalfof@manuscriptcentral.com>
+Reply-To: speoffice@wiley.com
+To: tdelisle@uwaterloo.ca, pabuhr@uwaterloo.ca
+Subject: SPE-19-0219.R1 successfully submitted
+-Feb-2020
+Dear Dr Buhr,
+Your manuscript entitled "Advanced Control-flow and Concurrency in Cforall" has
+been successfully submitted online and is presently being given full
+consideration for publication in Software: Practice and Experience.
+Your manuscript number is SPE-19-0219.R1.  Please mention this number in all
+future correspondence regarding this submission.
+You can view the status of your manuscript at any time by checking your Author
+Center after logging into https://mc.manuscriptcentral.com/spe.  If you have
+difficulty using this site, please click the 'Get Help Now' link at the top
+right corner of the site.
+Thank you for submitting your manuscript to Software: Practice and Experience.
+Sincerely,
+Software: Practice and Experience Editorial Office
+Date: Sat, 18 Apr 2020 10:42:13 +0000
+From: Richard Jones <onbehalfof@manuscriptcentral.com>
+Reply-To: R.E.Jones@kent.ac.uk
+To: tdelisle@uwaterloo.ca, pabuhr@uwaterloo.ca
+Subject: Software: Practice and Experience - Decision on Manuscript ID
+ SPE-19-0219.R1
+-Apr-2020
+Dear Dr Buhr,
+Many thanks for submitting SPE-19-0219.R1 entitled "Advanced Control-flow and Concurrency in Cforall" to Software: Practice and Experience. The paper has now been reviewed and the comments of the referees are included at the bottom of this letter.
+I believe that we are making progress here towards a paper that can be published in Software: Practice and Experience.  However the referees still have significant concerns about the paper. The journal's focus is on practice and experience, and one of the reviewers' concerns remains that your submission should focus the narrative more on the perspective of the programmer than the language designer. I agree that this would strengthen your submission, and I ask you to address this as well as the referees' other comments.
+A revised version of your manuscript that takes into account the comments of the referee(s) will be reconsidered for publication.
+Please note that submitting a revision of your manuscript does not guarantee eventual acceptance, and that your revision may be subject to re-review by the referees before a decision is rendered.
+You have 90 days from the date of this email to submit your revision. If you are unable to complete the revision within this time, please contact me to request a short extension.
+You can upload your revised manuscript and submit it through your Author Center. Log into https://mc.manuscriptcentral.com/spe  and enter your Author Center, where you will find your manuscript title listed under "Manuscripts with Decisions".
+When submitting your revised manuscript, you will be able to respond to the comments made by the referee(s) in the space provided.  You can use this space to document any changes you make to the original manuscript.
+If you would like help with English language editing, or other article preparation support, Wiley Editing Services offers expert help with English Language Editing, as well as translation, manuscript formatting, and figure formatting at www.wileyauthors.com/eeo/preparation. You can also check out our resources for Preparing Your Article for general guidance about writing and preparing your manuscript at www.wileyauthors.com/eeo/prepresources.
+Once again, thank you for submitting your manuscript to Software: Practice and Experience and I look forward to receiving your revision.
+Sincerely,
+Richard
+Prof. Richard Jones
+Software: Practice and Experience
+R.E.Jones@kent.ac.uk
+Referee(s)' Comments to Author:
+Reviewing: 1
+Comments to the Author
+(A relatively short second review)
+I thank the authors for their revisions and comprehensive response to
+reviewers' comments --- many of my comments have been successfully
+addressed by the revisions.  Here I'll structure my comments around
+the main salient points in that response which I consider would
+benefit from further explanation.
+>  Table 1 is moved to the start and explained in detail.
+I consider this change makes a significant improvement to the paper,
+laying out the landscape of language features at the start, and thus
+addresses my main concerns about the paper.
+I still have a couple of issues --- perhaps the largest is that it's
+still not clear at this point in the paper what some of these options
+are, or crucially how they would be used. I don't know if it's
+possible to give high-level examples or use cases to be clear about
+these up front - or if that would duplicate too much information from
+later in the paper - either way expanding out the discussion - even if
+just to a couple of sentences for each row - would help me more.  The
+point is not just to define these categories but to ensure the
+readers' understanding of these definitions agrees with that used in
+the paper.
+in a little more detail:
+ * 1st para section 2 begs the question: why not support each
+   dimension independently, and let the programmer or library designer
+   combine features?
+ * "execution state" seems a relatively low-level description here.
+  I don't think of e.g. the lambda calculus that way. Perhaps it's as
+  good a term as any.
+ * Why must there "be language mechanisms to create, block/unblock,
+   and join with a thread"?  There aren't in Smalltalk (although there
+   are in the runtime).  Especially given in Cforall those mechanisms
+   are *implicit* on thread creation and destruction?
+ * "Case 1 is a function that borrows storage for its state (stack
+   frame/activation) and a thread from its invoker"
+   this much makes perfect sense to me, but I don't understand how a
+   non-stateful, non-threaded function can then retain
+   "this state across callees, ie, function local-variables are
+   retained on the stack across calls."
+   how can it retain function-local values *across calls* when it
+   doesn't have any function-local state?
+   I'm not sure if I see two separate cases here - roughly equivalent
+   to C functions without static storage, and then C functions *with*
+   static storage. I assumed that was the distinction between cases 1
+   & 3; but perhaps the actual distinction is that 3 has a
+   suspend/resume point, and so the "state" in figure 1 is this
+   component of execution state (viz figs 1 & 2), not the state
+   representing the cross-call variables?
+>    but such evaluation isn't appropriate for garbage-collected or JITTed
+   languages like Java or Go.
+For JITTed languages in particular, reporting peak performance needs
+to "warm up" the JIT with a number of iterations before beginning
+measurement. Actually for JITs it's even worse: see Edd Barrett et al
+OOPSLA 2017.
+minor issues:
+ * footnote A - I've looked at various other papers & the website to
+   try to understand how "object-oriented" Cforall is - I'm still not
+   sure.  This footnote says Cforall has "virtuals" - presumably
+   virtual functions, i.e. dynamic dispatch - and inheritance: that
+   really is OO as far as I (and most OO people) are concerned.  For
+   example Haskell doesn't have inheritance, so it's not OO; while
+   CLOS (the Common Lisp *Object* System) or things like Cecil and
+   Dylan are considered OO even though they have "multiple function
+   parameters as receivers", lack "lexical binding between a structure
+   and set of functions", and don't have explicit receiver invocation
+   syntax.  Python has receiver syntax, but unlike Java or Smalltalk
+   or C++, method declarations still need to have an explicit "self"
+   receiver parameter.  Seems to me that Go, for example, is
+   more-or-less OO with interfaces, methods, and dynamic dispatch (yes
+   and also an explicit receiver syntax but that's not
+   determinative); while Rust lacks dynamic dispatch built-in.  C is
+   not OO as a language, but as you say given it supports function
+   pointers with structures, it does support an OO programming style.
+   This is why I again recommend just not buying into this fight: not
+   making any claims about whether Cforall is OO or is not - because
+   as I see it, the rest of the paper doesn't depend on whether
+   Cforall is OO or not.  That said: this is just a recommendation,
+   and I won't quibble over this any further.
+ * is a "monitor function" the same as a "mutex function"?
+   if so the paper should pick one term; if not, make the distinction clear.
+ * "As stated on line 1 because state declarations from the generator
+    type can be moved out of the coroutine type into the coroutine main"
+    OK sure, but again: *why* would a programmer want to do that?
+    (Other than, I guess, to show the difference between coroutines &
+    generators?)  Perhaps another way to put this is that the first
+    para of 3.2 gives the disadvantages of coroutines vis-a-vis
+    generators, briefly describes the extended semantics, but never
+    actually says why a programmer may want those extended semantics,
+    or how they would benefit.  I don't mean to belabour the point,
+    but (generalist?) readers like me would generally benefit from
+    those kinds of discussions about each feature throughout the
+    paper: why might a programmer want to use them?
+> p17 if the multiple-monitor entry procedure really is novel, write a paper
+> about that, and only about that.
+> We do not believe this is a practical suggestion.
+ * I'm honestly not trying to be snide here: I'm not an expert on
+   monitor or concurrent implementations. Brinch Hansen's original
+   monitors were single acquire; this draft does not cite any other
+   previous work that I could see. I'm not suggesting that the brief
+   mention of this mechanism necessarily be removed from this paper,
+   but if this is novel (and a clear advance over a classical OO
+   monitor a-la Java which only acquires the distinguished receiver)
+   then that would be worth another paper in itself.
+> * conclusion should conclude the paper, not the related.
+> We do not understand this comment.
+My typo: the paper's conclusion should come at the end, after the
+future work section.
+To encourage accountability, I'm signing my reviews in 2020.
+For the record, I am James Noble, kjx@ecs.vuw.ac.nz.
+Reviewing: 2
+Comments to the Author
+I thank the authors for their detailed response. To respond to a couple of points raised  in response to my review (number 2):
+- on the Boehm paper and whether code is "all sequential to the compiler": I now understand the authors' position better and suspect we are in violent agreement, except for whether it's appropriate to use the rather breezy phrase "all sequential to the compiler". It would be straightforward to clarify that code not using the atomics features is optimized *as if* it were sequential, i.e. on the assumption of a lack of data races.
+- on the distinction between "mutual exclusion" and "synchronization": the added citation does help, in that it makes a coherent case for the definition the authors prefer. However, the text could usefully clarify that this is a matter of definition not of fact, given especially that in my assessment the authors' preferred definition is not the most common one. (Although the mention of Hoare's apparent use of this definition is one data point, countervailing ones are found in many contemporaneous or later papers, e.g. Habermann's 1972 "Synchronization of Communicating Processes" (CACM 15(3)), Reed & Kanodia's 1979 "Synchronization with eventcounts and sequencers" (CACM 22(2)) and so on.)
+I am glad to see that the authors have taken on board most of the straightforward improvements I suggested.
+However, a recurring problem of unclear writing still remains through many parts of the paper, including much of sections 2, 3 and 6. To highlight a couple of problem patches (by no means exhaustive):
+- section 2 (an expanded version of what was previously section 5.9) lacks examples and is generally obscure and allusory ("the most advanced feature" -- name it! "in triplets" -- there is only one triplet!; what are "execution locations"? "initialize" and "de-initialize" what? "borrowed from the invoker" is a concept in need of explaining or at least a fully explained example -- in what sense does a plain function "borrow" its stack frame? "computation only" as opposed to what? in 2.2, in what way is a "request" fundamental to "synchronization"? and the "implicitly" versus "explicitly" point needs stating as elsewhere, with a concrete example e.g. Java built-in mutexes versus java.util.concurrent).
+- section 6: 6.2 omits the most important facts in preference for otherwise inscrutable detail: "identify the kind of parameter" (first say *that there are* kinds of parameter, and what "kinds" means!); "mutex parameters are documentation" is misleading (they are also semantically significant!) and fails to say *what* they mean; the most important thing is surely that 'mutex' is a language feature for performing lock/unlock operations at function entry/exit. So say it! The meanings of examples f3 and f4 remain unclear. Meanwhile in 6.3, "urgent" is not introduced (we are supposed to infer its meaning from Figure 12, but that Figure is incomprehensible to me), and we are told of "external scheduling"'s long history in Ada but not clearly what it actually means; 6.4's description of "waitfor" tells us it is different from an if-else chain but tries to use two *different* inputs to tell us that the behavior is different; tell us an instance where *the same* values of C1 and C2 give different behavior (I even wrote out a truth table and still don't see the semantic difference)
+The authors frequently use bracketed phrases, and sometimes slashes "/", in ways that are confusing and/or detrimental to readability. Page 13 line 2's "forward (backward)" is one particularly egregious example. In general I would recommend that the authors try to limit their use of parentheses and slashes as a means of forcing a clearer wording to emerge. Also, the use of "eg." is often cursory and does not explain the examples given, which are frequently a one- or two-word phrase of unclear referent.
+Considering the revision more broadly, none of the more extensive or creative rewrites I suggested in my previous review have been attempted, nor any equivalent efforts to improve its readability. The hoisting of the former section 5.9 is a good idea, but the newly added material accompanying it (around Table 1) suffers fresh deficiencies in clarity. Overall the paper is longer than before, even though (as my previous review stated), I believe a shorter paper is required in order to serve the likely purpose of publication. (Indeed, the authors' letter implies that a key goal of publication is to build community and gain external users.)
+Given this trajectory, I no longer see a path to an acceptable revision of the present submission. Instead I suggest the authors consider splitting the paper in two: one half about coroutines and stack management, the other about mutexes, monitors and the runtime. (A briefer presentation of the runtime may be helpful in the first paper also, and a brief recap of the generator and coroutine support is obviously needed in the second too.) Both of these new papers would need to be written with a strong emphasis on clarity, paying great care to issues of structure, wording, choices of example, and restraint (saying what's important, not everything that could be said). I am confident the authors could benefit from getting early feedback from others at their institution. For the performance experiments, of course these do not split evenly -- most (but not all) belong in the second of these two hypothetical papers. But the first of them would still have plenty of meat to it; for me, a clear and thorough study of the design space around coroutines is the most interesting and tantalizing prospect.
+I do not buy the authors' defense of the limited practical experience or "non-micro" benchmarking presented. Yes, gaining external users is hard and I am sympathetic on that point. But building something at least *somewhat* substantial with your own system should be within reach, and without it the "practice and experience" aspects of the work have not been explored. Clearly C\/ is the product of a lot of work over an extended period, so it is a surprise that no such experience is readily available for inclusion.
+Some smaller points:
+It does not seem right to state that a stack is essential to Von Neumann architectures -- since the earliest Von Neumann machines (and indeed early Fortran) did not use one.
+To elaborate on something another reviewer commented on: it is a surprise to find a "Future work" section *after* the "Conclusion" section. A "Conclusions and future work" section often works well.
+Reviewing: 3
+Comments to the Author
+This is the second round of reviewing.
+As in the first review, I found that the paper (and Cforall) contains
+a lot of really interesting ideas, but it remains really difficult to
+have a good sense of which idea I should use and when. This applies in
+different ways to different features from the language:
+* coroutines/generators/threads: here there is
+  some discussion, but it can be improved.
+* internal/external scheduling: I didn't find any direct comparison
+  between these features, except by way of example.
+I requested similar things in my previous review and I see that
+content was added in response to those requests. Unfortunately, I'm
+not sure that I can say it improved the paper's overall read. I think
+in some sense the additions were "too much" -- I would have preferred
+something more like a table or a few paragraphs highlighting the key
+reasons one would pick one construct or the other.
+In general, I do wonder if the paper is just trying to do too much.
+The discussion of clusters and pre-emption in particular feels quite
+rushed.
+## Summary
+I make a number of suggestions below but the two most important
+I think are:
+* Recommend to shorten the comparison on coroutine/generator/threads
+  in Section 2 to a paragraph with a few examples, or possibly a table
+  explaining the trade-offs between the constructs
+* Recommend to clarify the relationship between internal/external
+  scheduling -- is one more general but more error-prone or low-level?
+## Coroutines/generators/threads
+There is obviously a lot of overlap between these features, and in
+particular between coroutines and generators. As noted in the previous
+review, many languages have chosen to offer *only* generators, and to
+build coroutines by stacks of generators invoking one another.
+I believe the newly introduced Section 2 of the paper is trying to
+motivate why each of these constructs exist, but I did not find it
+effective. It was dense and difficult to understand. I think the
+problem is that Section 2 seems to be trying to derive "from first
+principles" why each construct exists, but I think that a more "top
+down" approach would be easier to understand.
+In fact, the end of Section 2.1 (on page 5) contains a particular
+paragraph that embodies this "top down" approach. It starts,
+"programmers can now answer three basic questions", and thus gives
+some practical advice for which construct you should use and when. I
+think giving some examples of specific applications that this
+paragraph, combined with some examples of cases where each construct
+was needed, would be a better approach.
+I don't think this comparison needs to be very long. It seems clear
+enough that one would
+* prefer generators for simple computations that yield up many values,
+* prefer coroutines for more complex processes that have significant
+  internal structure,
+* prefer threads for cases where parallel execution is desired or
+  needed.
+I did appreciate the comparison in Section 2.3 between async-await in
+JS/Java and generators/coroutines. I agree with its premise that those
+mechanisms are a poor replacement for generators (and, indeed, JS has
+a distinct generator mechanism, for example, in part for this reason).
+I believe I may have asked for this in a previous review, but having
+read it, I wonder if it is really necessary, since those mechanisms
+are so different in purpose.
+## Internal vs external scheduling
+I find the motivation for supporting both internal and external
+scheduling to be fairly implicit. After several reads through the
+section, I came to the conclusion that internal scheduling is more
+expressive than external scheduling, but sometimes less convenient or
+clear. Is this correct? If not, it'd be useful to clarify where
+external scheduling is more expressive.
+The same is true, I think, of the `signal_block` function, which I
+have not encountered before; it seems like its behavior can be modeled
+with multiple condition variables, but that's clearly more complex.
+One question I had about `signal_block`: what happens if one signals
+but no other thread is waiting? Does it block until some other thread
+waits? Or is that user error?
+I would find it very interesting to try and capture some of the
+properties that make internal vs external scheduling the better
+choice.
+For example, it seems to me that external scheduling works well if
+there are only a few "key" operations, but that internal scheduling
+might be better otherwise, simply because it would be useful to have
+the ability to name a signal that can be referenced by many
+methods. Consider the bounded buffer from Figure 13: if it had
+multiple methods for removing elements, and not just `remove`, then
+the `waitfor(remove)` call in `insert` might not be sufficient.
+## Comparison of external scheduling to messaging
+I did enjoy the section comparing external scheduling to Go's
+messaging mechanism, which I believe is a new addition.
+I believe that one difference between the Go program and the Cforall
+equivalent is that the Goroutine has an associated queue, so that
+multiple messages could be enqueued, whereas the Cforall equivalent is
+effectively a "bounded buffer" of length 1. Is that correct? I think
+this should be stated explicitly. (Presumably, one could modify the
+Cforall program to include an explicit vector of queued messages if
+desired, but you would also be reimplementing the channel
+abstraction.)
+Also, in Figure 20, I believe that there is a missing `mutex` keyword.
+The figure states:
+```
+void main(GoRtn & gortn) with(gortn) {
+```
+but I think it should probably be as follows:
+```
+void main(GoRtn & mutex gortn) with(gortn) {
+```
+Unless there is some implicit `mutex` associated with being a main
+function for a `monitor thread`.
+## Atomic operations and race freedom
+I was glad to see that the paper acknowledged that Cforall still had
+low-level atomic operations, even if their use is discouraged in favor
+of higher-level alternatives.
+However, I still feel that the conclusion overstates the value of the
+contribution here when it says that "Cforall high-level race-free
+monitors and threads provide the core mechanisms for mutual exclusion
+and synchronization, without the need for volatile and atomics". I
+feel confident that Java programmers, for example, would be advised to
+stick with synchronized methods whenever possible, and it seems to me
+that they offer similar advantages -- but they sometimes wind up using
+volatiles for performance reasons.
+I was also confused by the term "race-free" in that sentence. In
+particular, I don't think that Cforall has any mechanisms for
+preventing *data races*, and it clearly doesn't prevent "race
+conditions" (which would bar all sorts of useful programs). I suppose
+that "race free" here might be referring to the improvements such as
+removing barging behavior.
+## Performance comparisons
+In my previous review, I requested comparisons against Rust and
+node.js, and I see that the new version of the paper includes both,
+which is a good addition.
+One note on the Rust results: I believe that the results are comparing
+against the threads found in Rust's standard library, which are
+essentially a shallow wrapper around pthreads, and hence the
+performance is quite close to pthread performance (as one would
+expect). It would perhaps be more interesting to see a comparison
+built using [tokio] or [async-std], two of the more prominent
+user-space threading libraries that build on Rust's async-await
+feature (which operates quite differently than Javascript's
+async-await, in that it doesn't cause every async function call to
+schedule a distinct task).
+[tokio]: https://tokio.rs/
+[async-std]: https://async.rs/
+That said, I am satisfied with the performance results as they are in
+the current revision.
+## Minor notes and typos
+Several figures used the `with` keyword. I deduced that `with(foo)`
+permits one to write `bar` instead of `foo.bar`. It seems worth
+introducing. Apologies if this is stated in the paper, if so I missed
+it.
+On page 20, section 6.3, "external scheduling and vice versus" should be
+"external scheduling and vice versa".
+On page 5, section 2.3, the paper states "we content" but it should be
+"we contend".
+Reviewing: Editor
+A few small comments in addition to those of the referees.
+Page 1. I don't believe that it is fair to imply that Scala is a "research vehicle" as it is used by major players, Twitter being the most prominent example.
+Page 15. Must Cforall threads start after construction (e.g. see your example on page 15, line 21)? I can think of examples where it is not desirable that threads start immediately after construction, e.g. a game with N players, each of whom is expensive to create, but all of whom should be started at the same time.
+Page 18, line 17: is using
+Date: Tue, 16 Jun 2020 13:45:03 +0000
+From: Aaron Thomas <onbehalfof@manuscriptcentral.com>
+Reply-To: speoffice@wiley.com
+To: tdelisle@uwaterloo.ca, pabuhr@uwaterloo.ca
+Subject: SPE-19-0219.R2 successfully submitted
+16-Jun-2020
+Dear Dr Buhr,
+Your manuscript entitled "Advanced Control-flow and Concurrency in Cforall" has been successfully submitted online and is presently being given full consideration for publication in Software: Practice and Experience.
+Your manuscript number is SPE-19-0219.R2.  Please mention this number in all future correspondence regarding this submission.
+You can view the status of your manuscript at any time by checking your Author Center after logging into https://mc.manuscriptcentral.com/spe.  If you have difficulty using this site, please click the 'Get Help Now' link at the top right corner of the site.
+Thank you for submitting your manuscript to Software: Practice and Experience.
+Sincerely,
+Software: Practice and Experience Editorial Office
+Date: Wed, 2 Sep 2020 20:55:34 +0000
+From: Richard Jones <onbehalfof@manuscriptcentral.com>
+Reply-To: R.E.Jones@kent.ac.uk
+To: tdelisle@uwaterloo.ca, pabuhr@uwaterloo.ca
+Subject: Software: Practice and Experience - Decision on Manuscript ID
+ SPE-19-0219.R2
+02-Sep-2020
+Dear Dr Buhr,
+Many thanks for submitting SPE-19-0219.R2 entitled "Advanced Control-flow and Concurrency in Cforall" to Software: Practice and Experience. The paper has now been reviewed and the comments of the referees are included at the bottom of this letter. I apologise for the length of time it has taken to get these.
+Both reviewers consider this paper to be close to acceptance. However, before I can accept this paper, I would like you to address the comments of Reviewer 2, particularly with regard to the description of the adaptation of the Java harness to deal with warmup. I would expect to see a convincing argument that the computation has reached a steady state. I would also like you to provide the values for N for each benchmark run. This should be very straightforward for you to do. There are a couple of papers on steady state that you may wish to consult (though I am certainly not pushing my own work).
+1) Barrett, Edd; Bolz-Tereick, Carl Friedrich; Killick, Rebecca; Mount, Sarah and Tratt, Laurence. Virtual Machine Warmup Blows Hot and Cold. OOPSLA 2017. https://doi.org/10.1145/3133876
+Virtual Machines (VMs) with Just-In-Time (JIT) compilers are traditionally thought to execute programs in two phases: the initial warmup phase determines which parts of a program would most benefit from dynamic compilation, before JIT compiling those parts into machine code; subsequently the program is said to be at a steady state of peak performance. Measurement methodologies almost always discard data collected during the warmup phase such that reported measurements focus entirely on peak performance. We introduce a fully automated statistical approach, based on changepoint analysis, which allows us to determine if a program has reached a steady state and, if so, whether that represents peak performance or not. Using this, we show that even when run in the most controlled of circumstances, small, deterministic, widely studied microbenchmarks often fail to reach a steady state of peak performance on a variety of common VMs. Repeating our experiment on 3 different machines, we found that at most 43.5% of pairs consistently reach a steady state of peak performance.
+2) Kalibera, Tomas and Jones, Richard. Rigorous Benchmarking in Reasonable Time. ISMM  2013. https://doi.org/10.1145/2555670.2464160
+Experimental evaluation is key to systems research. Because modern systems are complex and non-deterministic, good experimental methodology demands that researchers account for uncertainty. To obtain valid results, they are expected to run many iterations of benchmarks, invoke virtual machines (VMs) several times, or even rebuild VM or benchmark binaries more than once. All this repetition costs time to complete experiments. Currently, many evaluations give up on sufficient repetition or rigorous statistical methods, or even run benchmarks only in training sizes. The results reported often lack proper variation estimates and, when a small difference between two systems is reported, some are simply unreliable. In contrast, we provide a statistically rigorous methodology for repetition and summarising results that makes efficient use of experimentation time. Time efficiency comes from two key observations. First, a given benchmark on a given platform is typically prone to much less non-determinism than the common worst-case of published corner-case studies. Second, repetition is most needed where most uncertainty arises (whether between builds, between executions or between iterations). We capture experimentation cost with a novel mathematical model, which we use to identify the number of repetitions at each level of an experiment necessary and sufficient to obtain a given level of precision. We present our methodology as a cookbook that guides researchers on the number of repetitions they should run to obtain reliable results. We also show how to present results with an effect size confidence interval. As an example, we show how to use our methodology to conduct throughput experiments with the DaCapo and SPEC CPU benchmarks on three recent platforms.
+You have 42 days from the date of this email to submit your revision. If you are unable to complete the revision within this time, please contact me to request a short extension.
+You can upload your revised manuscript and submit it through your Author Center. Log into https://mc.manuscriptcentral.com/spe and enter your Author Center, where you will find your manuscript title listed under "Manuscripts with Decisions".
+When submitting your revised manuscript, you will be able to respond to the comments made by the referee(s) in the space provided.  You can use this space to document any changes you make to the original manuscript.
+If you would like help with English language editing, or other article preparation support, Wiley Editing Services offers expert help with English Language Editing, as well as translation, manuscript formatting, and figure formatting at www.wileyauthors.com/eeo/preparation. You can also check out our resources for Preparing Your Article for general guidance about writing and preparing your manuscript at www.wileyauthors.com/eeo/prepresources.
+Once again, thank you for submitting your manuscript to Software: Practice and Experience. I look forward to receiving your revision.
+Sincerely,
+Richard
+Prof. Richard Jones
+Editor, Software: Practice and Experience
+R.E.Jones@kent.ac.uk
+Referee(s)' Comments to Author:
+Reviewing: 1
+Comments to the Author
+Overall, I felt that this draft was an improvement on previous drafts and I don't have further changes to request.
+I appreciated the new language to clarify the relationship of external and internal scheduling, for example, as well as the new measurements of Rust tokio. Also, while I still believe that the choice between thread/generator/coroutine and so forth could be made crisper and clearer, the current draft of Section 2 did seem adequate to me in terms of specifying the considerations that users would have to take into account to make the choice.
+Reviewing: 2
+Comments to the Author
+First: let me apologise for the delay on this review. I'll blame the global pandemic combined with my institution's senior management's counterproductive decisions for taking up most of my time and all of my energy.
+At this point, reading the responses, I think we've been around the course enough times that further iteration is unlikely to really improve the paper any further, so I'm happy to recommend acceptance.    My main comments are that there were some good points in the responses to *all* the reviews and I strongly encourage the authors to incorporate those discursive responses into the final paper so they may benefit readers as well as reviewers.   I agree with the recommendations of reviewer #2 that the paper could usefully be split into two, which I think I made to a previous revision, but I'm happy to leave that decision to the Editor.
+Finally, the paper needs to describe how the Java harness was adapted to deal with warmup; why the computation has warmed up and reached a steady state - similarly for js and Python. The tables should also give the "N" chosen for each benchmark run.
+minor points
+* don't start sentences with "However"
+* most downloaded isn't an "Award"
+Date: Thu, 1 Oct 2020 05:34:29 +0000
+From: Richard Jones <onbehalfof@manuscriptcentral.com>
+Reply-To: R.E.Jones@kent.ac.uk
+To: pabuhr@uwaterloo.ca
+Subject: Revision reminder - SPE-19-0219.R2
+01-Oct-2020
+Dear Dr Buhr
+SPE-19-0219.R2
+This is a reminder that your opportunity to revise and re-submit your manuscript will expire 14 days from now. If you require more time please contact me directly and I may grant an extension to this deadline, otherwise the option to submit a revision online, will not be available.
+If your article is of potential interest to the general public, (which means it must be timely, groundbreaking, interesting and impact on everyday society) then please e-mail ejp@wiley.co.uk explaining the public interest side of the research. Wiley will then investigate the potential for undertaking a global press campaign on the article.
+I look forward to receiving your revision.
+Sincerely,
+Prof. Richard Jones
+Editor, Software: Practice and Experience
+https://mc.manuscriptcentral.com/spe
+Date: Tue, 6 Oct 2020 15:29:41 +0000
+From: Mayank Roy Chowdhury <onbehalfof@manuscriptcentral.com>
+Reply-To: speoffice@wiley.com
+To: tdelisle@uwaterloo.ca, pabuhr@uwaterloo.ca
+Subject: SPE-19-0219.R3 successfully submitted
+06-Oct-2020
+Dear Dr Buhr,
+Your manuscript entitled "Advanced Control-flow and Concurrency in Cforall" has been successfully submitted online and is presently being given full consideration for publication in Software: Practice and Experience.
+Your manuscript number is SPE-19-0219.R3.  Please mention this number in all future correspondence regarding this submission.
+You can view the status of your manuscript at any time by checking your Author Center after logging into https://mc.manuscriptcentral.com/spe.  If you have difficulty using this site, please click the 'Get Help Now' link at the top right corner of the site.
+Thank you for submitting your manuscript to Software: Practice and Experience.
+Sincerely,
+Software: Practice and Experience Editorial Office
+Date: Thu, 15 Oct 2020 13:48:52 +0000
+From: Richard Jones <onbehalfof@manuscriptcentral.com>
+Reply-To: R.E.Jones@kent.ac.uk
+To: tdelisle@uwaterloo.ca, pabuhr@uwaterloo.ca
+Subject: Software: Practice and Experience - Decision on Manuscript ID
+ SPE-19-0219.R3
+15-Oct-2020
+Dear Dr Buhr,
+It is a pleasure to accept your manuscript entitled "Advanced Control-flow and Concurrency in Cforall" in its current form for publication in Software: Practice and Experience.
+Please note although the manuscript is accepted the files will now be checked to ensure that everything is ready for publication, and you may be contacted if final versions of files for publication are required.
+Your article cannot be published until the publisher has received the appropriate signed license agreement. Within the next few days the corresponding author will receive an email from Wiley's Author Services system which will ask them to log in and will present them with the appropriate license for completion.
+Thank you for your fine contribution.
+Sincerely,
+Richard
+Prof. Richard Jones
+Editor, Software: Practice and Experience
+R.E.Jones@kent.ac.uk
+P.S. - You can help your research get the attention it deserves! Check out Wiley's free Promotion Guide for best-practice recommendations for promoting your work at www.wileyauthors.com/eeo/guide. And learn more about Wiley Editing Services which offers professional video, design, and writing services to create shareable video abstracts, infographics, conference posters, lay summaries, and research news stories for your research at www.wileyauthors.com/eeo/promotion.
+This journal accepts artwork submissions for Cover Images. This is an optional service you can use to help increase article exposure and showcase your research. For more information, including artwork guidelines, pricing, and submission details, please visit the Journal Cover Image page at www.wileyauthors.com/eeo/covers. If you want help creating an image, Wiley Editing Services offers a professional cover image design service that creates eye-catching images, ready to be showcased on the journal cover at www.wileyauthors.com/eeo/design.
+Date: Fri, 16 Oct 2020 12:44:42 +0000
+From: Mayank Roy Chowdhury <onbehalfof@manuscriptcentral.com>
+Reply-To: speoffice@wiley.com
+To: pabuhr@uwaterloo.ca
+Subject: Manuscript Accepted - Please submit final updates to SPE-19-0219.R3 [email ref: ENR-AW-1-c]
+16-Oct-2020
+Dear Dr. Buhr,
+Manuscript id: SPE-19-0219.R3
+Manuscript title: Advanced Control-flow and Concurrency in Cforall
+Although your manuscript has been accepted for publication it is now being returned to your author center for you to review and make any final adjustments or corrections prior to production and publication.
+Any special instructions will be listed below:
+1) Funding Information added in ScholarOne but missing in main document, Kindly add the Funding information in main document.
+2) Please provide the clean version of the manuscript without any highlights or tracked changes.
+3) Kindly check and make sure citations for all figures and Tables are present in the main document
+Please now log back into your Scholar One Author Center and click on the "Manuscripts Accepted for First Look" queue. In order to update the submission, click on the "submit updated manuscript" link in the "Actions" column and follow the steps as you would during a manuscript submission process.
+On the File Upload screen please upload the FINAL versions of all the files, including print quality image files. For information about image quality requirements, please refer to the guidelines at https://authorservices.wiley.com/asset/photos/electronic_artwork_guidelines.pdf.
+Instructions for uploading replacement files:
+1. On the "File Upload" step, click on the "edit" button for the file you wish to replace.
+2. In the "Upload a later version" section, browse to locate the replacement final version.
+3. Add any comments concerning the replacement (e.g. "high res image").
+4. Select whether the new file is a minor or major version (we suggest you select minor version)
+5. Click upload.
+6. Click 'Submit' when all the files have been uploaded and you will receive an automated email to say that submission is successful.
+Please submit your updates within the next 7 days to ensure there are no unnecessary delays in production.
+Sincerely,
+Software: Practice and Experience Editorial Office
+From: SPE Office <speoffice@wiley.com>
+To: "Peter A. Buhr" <pabuhr@uwaterloo.ca>
+Subject: Re: Manuscript Accepted - Please submit final updates to SPE-19-0219.R3 [email ref: ENR-AW-1-c]
+Date: Mon, 19 Oct 2020 17:04:24 +0000
+Dear Dr. Buhr,
+Thank you very much for contacting the Editorial Office.
+I would like to let you know that the files has been found in order and moved to production.
+Plesae let me know for further assistance in this regard.
+Best Regards
+Mayank Roy Chowdhury
+Editorial Assistant
+Software practice and Experience
+________________________________
+From: Peter A. Buhr <pabuhr@uwaterloo.ca>
+Sent: Sunday, October 18, 2020 2:00 PM
+To: SPE Office <speoffice@wiley.com>
+Cc: Thierry Delisle <tdelisle@uwaterloo.ca>
+Subject: Re: Manuscript Accepted - Please submit final updates to SPE-19-0219.R3 [email ref: ENR-AW-1-c]
+       This is an external email.
+    Mayank Roy Chowdhury <onbehalfof@manuscriptcentral.com> writes:
+    Instructions for uploading replacement files:
+. On the "File Upload" step, click on the "edit" button for the file you wish to replace.
+. In the "Upload a later version" section, browse to locate the replacement final version.
+. Add any comments concerning the replacement (e.g. "high res image").
+. Select whether the new file is a minor or major version (we suggest you select minor version)
+. Click upload.
+. Click 'Submit' when all the files have been uploaded and you will receive an automated email to say that submission is successful.
+There was no "edit" button on the "File Upload" page, so I just upload the
+final version of the PDF and source files using the mechanism on the "File
+Upload" page and submitted that.
+Date: Tue, 20 Oct 2020 13:28:37 +0530
+To: "Dr. Peter Buhr" <pabuhr@uwaterloo.ca>
+From: jpcms@spi-global.com
+Subject: Information: Production Editor Contact Software:Practice and Experience  | Advanced Control-flow and Concurrency in C A
+Dear Dr. Peter Buhr,
+We are in the process of preparing "Advanced Control-flow and Concurrency in C A" for publication. Your production editor, Joel Pacaanas, will support you and your article throughout the process.
+Please get in touch with your Production Editor at SPEproofs@wiley.com;EllaMae.Navor@spi-global.com if you have any questions.
+Sincerely,
+Booking-in Team,
+On behalf of Wiley
+Article ID: SPE_2925
+Article DOI: 10.1002/SPE.2925
+Date: Tue, 20 Oct 2020 10:33:04 +0000
+From: <cs-author@wiley.com>
+To: <pabuhr@uwaterloo.ca>
+Subject: In Production: Your article accepted in Software: Practice and Experience
+Dear Peter Buhr,
+Article ID: SPE2925
+Article DOI: 10.1002/spe.2925
+Internal Article ID: 16922213
+Article: Advanced Control-flow and Concurrency in C A
+Journal: Software: Practice and Experience
+Congratulations on the acceptance of your article for publication in Software: Practice and Experience.
+Your article has been received and the production process is now underway. We look forward to working with you and publishing your article. Using Wiley Author Services, you can track your article's progress.
+Please click below to login - if you are using a different email address than this one, you will need to manually assign this article to your Dashboard (see https://hub.wiley.com/docs/support/assigning-a-missing-article-to-my-dashboard-DOC-11871?utm_source=new%20user%20invitation&utm_medium=email How do I assign a missing article to My Dashboard?):
+https://authorservices.wiley.com/index.html#login?campaign=email_invitation-new
+If applicable, a list of available actions will appear below - check out your Author Services Dashboard for all actions related to your articles.
+Sign your license agreement (REQUIRED)  -- you will receive an email when this task is ready on your dashboard. Track your article's progress to publicationAccess your published articleInvite colleagues to view your published article
+If you need any assistance, please click http://www.wileyauthors.com/help?utm_source=new%20user%20invitation&utm_medium=email here to view our Help section.
+Sincerely,
+Wiley Author Services
+P.S. - Some journals accept artwork submissions for Cover Images. This is an optional service you can use to help increase article exposure and showcase your research. Pricing and placement options vary by journal. For more information, including artwork guidelines, pricing, and submission details, please visit the https://authorservices.wiley.com/author-resources/Journal-Authors/Promotion/journal-cover-image.html?utm_source=as&utm_medium=email&utm_term=invitation_msg&utm_content=covers&utm_campaign=2019feb?campaign=email_invitation-new" target=_blank">Journal Cover Image page. If you want help creating an image, Wiley Editing Services offers a professional https://wileyeditingservices.com/en/article-promotion/cover-image-design.html?utm_source=as&utm_medium=email&utm_term=ie&utm_content=cid&utm_campaign=prodops" target=_blank">Cover Image Design service that creates eye-catching images, ready to be showcased on the journal cover.
+Date: Thu, 22 Oct 2020 20:21:49 +0000
+From: <cs-author@wiley.com>
+To: <pabuhr@uwaterloo.ca>
+Subject: You have actions to complete in Author Services
+Dear Peter Buhr,
+Article ID: SPE2925
+Article DOI: 10.1002/spe.2925
+Internal Article ID: 16922213
+Article: Advanced Control-flow and Concurrency in C A
+Journal: Software: Practice and Experience
+For the above article, you have the following open tasks:
+Sign your license agreement in order to publish your article. Simply click the Sign License button on your https://authorservices.wiley.com?campaign=email_license-notice1">Wiley Author Services Dashboard.
+Need any help? Please visit our https://authorsupport.wiley.com/s/">Author Support Center.
+Sincerely,
+Wiley Author Services
+Date: Thu, 22 Oct 2020 23:13:07 +0000
+From: <cs-author@wiley.com>
+To: <pabuhr@uwaterloo.ca>
+Subject: License was successfully submitted! Thank you!
+Dear Peter Buhr,
+Article ID: SPE2925
+Article DOI: 10.1002/spe.2925
+Internal Article ID: 16922213
+Article: Advanced Control-flow and Concurrency in C A
+Journal: Software: Practice and Experience
+You've successfully completed license signing for your article - thank you! You can view your signed agreement at any time by visiting your https://authorservices.wiley.com?campaign=email_license-confirm">Wiley Author Services Dashboard.
+Sincerely,
+Wiley Author Services
+From: "Pacaanas, Joel -" <jpacaanas@wiley.com>
+To: "Peter A. Buhr" <pabuhr@uwaterloo.ca>
+CC: Thierry Delisle <tdelisle@uwaterloo.ca>
+Subject: RE: Action: Proof of SPE_EV_SPE2925 for Software: Practice And Experience ready for review
+Date: Thu, 5 Nov 2020 02:03:27 +0000
+Dear Dr Buhr,
+Thank you for letting me know. We will wait for your corrections then.
+Best regards,
+Joel
+Joel Q. Pacaanas
+Production Editor
+On behalf of Wiley
+Manila
+We partner with global experts to further innovative research.
+E-mail: jpacaanas@wiley.com
+Tel: +632 88558618
+Fax: +632 5325 0768
+-----Original Message-----
+From: Peter A. Buhr [mailto:pabuhr@uwaterloo.ca]
+Sent: Thursday, November 5, 2020 5:57 AM
+To: SPE Proofs <speproofs@wiley.com>
+Cc: Thierry Delisle <tdelisle@uwaterloo.ca>
+Subject: Re: Action: Proof of SPE_EV_SPE2925 for Software: Practice And Experience ready for review
+       This is an external email.
+    We appreciate that the COVID-19 pandemic may create conditions for you that
+    make it difficult for you to review your proof within standard time
+    frames. If you have any problems keeping to this schedule, please reach out
+    to me at (SPEproofs@wiley.com) to discuss alternatives.
+Hi,
+We are in the middle of reading the proofs but it will take a little more
+time. I can send the proofs back by Monday Nov 9, but probably earlier.
+From: "Pacaanas, Joel -" <jpacaanas@wiley.com>
+To: "Peter A. Buhr" <pabuhr@uwaterloo.ca>
+CC: "tdelisle@uwaterloo.ca" <tdelisle@uwaterloo.ca>
+Subject: RE: Action: Proof of SPE_EV_SPE2925 for Software: Practice And Experience ready for review
+Date: Fri, 20 Nov 2020 05:27:18 +0000
+Dear Peter,
+We have now reset the proof back to original stage. Please refer to the below editable link.
+https://wiley.eproofing.in/Proof.aspx?token=ab7739d5678447fbbe5036f3bcba2445081500061
+Since the proof was reset, your added corrections before has also been removed. Please add them back.
+Please return your corrections at your earliest convenience.
+Best regards,
+Joel
+Joel Q. Pacaanas
+Production Editor
+On behalf of Wiley
+Manila
+We partner with global experts to further innovative research.
+E-mail: jpacaanas@wiley.com
+Tel: +632 88558618
+Fax: +632 5325 0768
+From: "Wiley Online Proofing" <notifications@eproofing.in>
+To: pabuhr@uwaterloo.ca
+Cc: SPEproofs@wiley.com
+Reply-To: eproofing@wiley.com
+Date: 26 Nov 2020 18:57:27 +0000
+Subject: Corrections successfully submitted for SPE_EV_SPE2925, Advanced control-flow in Cforall.
+Corrections successfully submitted
+Dear Dr. Peter Buhr,
+Thank you for reviewing the proof of the Software: Practice And Experience article Advanced control-flow in Cforall.
+View Article https://wiley.eproofing.in/Proof.aspx?token=ab7739d5678447fbbe5036f3bcba2445081500061
+This is a read-only version of your article with the corrections you have marked up.
+If you encounter any problems or have questions please contact me, Joel Pacaanas at (SPEproofs@wiley.com). For the quickest response include the journal name and your article ID (found in the subject line) in all correspondence.
+Best regards,
+Joel Pacaanas

doc/proposals/vtable.md

-              r3c64c668
+              r58fe85a
 default is provided or not, the second syntax can be used to pick a
 parameter on instantiation.
+### Extension: Object Access
+This requires that the resolution scope (see below) is at the type level or
+has explicit points with names. These are the tables and table names used
+here.
+The system already knows where to find the virtual table and the object. If
+the tables have particular identities, or on the user side names, then it is
+meaningful to check if a binding virtual table is the same* as another. The
+main use of this is virtual table declarations also give the type they bind
+and if a binding table matches a known table then the underlying object in the
+trait object must be of that type.
+* By identity. Comparing by value would work and in some senses be more
+  flexible, but it would be slower and referring to further away functions
+  would be harder.
+This gives one of the main new features of the hierarchical use of virtual
+tables (see below); the ability to recover the underlying object. Or a pointer
+of the appropriate type to it, which both reflects the implementation and gives a
+convenient way to encode the boolean/conditional aspect of the operation which
+is that a different virtual table might be in use.
+There are two general ways to represent this: a cast or a field access. The
+cast is traditional and would definitely fit if a single pointer represents
+a trait object with the virtual table as part of the object. However, for a
+double pointer, field access might be more appropriate. By this system though
+it is not the type that is used as the identifier but the virtual table. If
+there is one table per type then it becomes equivalent again. Otherwise the
+table has to be used as the identifier and the type is just a result of that
+which seems important for syntax.
 Hierarchy
 …
 possibly like the one used to create the assertion.
+### Extension: Associated Types Use
+If the `associated_types.md` proposal is accepted the following trait could
+be added:
+    trait is_virtual(dtype T) {
+        dtype table;
+        // An example assertion:
+        const table & get_virtual_table(T &);
+    }
+There may be more assertions but there has to be at least one way to find
+the (possibly default) virtual table. It is required to construct instances
+of the type.
+Without the associated type it would look like this:
+    trait is_virtual(dtype T, dtype table) {
+        const table & get_virtual_table(T &);
+    }
+Which is just a little bit longer to use but becomes more problematic if the
+user has to explicitly provide the table's name as it doesn't really have its
+own type name. If it does it is probably mangled.
 ### Virtual Tables as Types
 Here we consider encoding plus the implementation of functions on it to be a
 …
 be used in only some of the declarations.
     trait combiner fee = (summation_instance, sum);
+    trait combiner fee = {summation_instance, sum};
     trait combiner foe = summation_instance;

doc/refrat/refrat.tex

-              r3c64c668
+              r58fe85a
 %% Created On       : Wed Apr  6 14:52:25 2016
 %% Last Modified By : Peter A. Buhr
 %% Last Modified On : Wed Jan 31 17:30:23 2018
 %% Update Count     : 108
+%% Last Modified On : Mon Oct  5 09:02:53 2020
+%% Update Count     : 110
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 …
 \usepackage{upquote}                                                                    % switch curled `'" to straight
 \usepackage{calc}
-\usepackage{xspace}
 \usepackage{varioref}                                                                   % extended references
-\usepackage{listings}                                                                   % format program code
 \usepackage[flushmargin]{footmisc}                                              % support label/reference in footnote
 \usepackage{latexsym}                                   % \Box glyph
 \usepackage{mathptmx}                                   % better math font with "times"
 \usepackage[usenames]{color}
+\input{common}                                          % common CFA document macros
+\usepackage[dvips,plainpages=false,pdfpagelabels,pdfpagemode=UseNone,colorlinks=true,pagebackref=true,linkcolor=blue,citecolor=blue,urlcolor=blue,pagebackref=true,breaklinks=true]{hyperref}
+\usepackage{breakurl}
+\renewcommand{\UrlFont}{\small\sf}
+\usepackage[pagewise]{lineno}
+\renewcommand{\linenumberfont}{\scriptsize\sffamily}
+\usepackage[firstpage]{draftwatermark}
+\SetWatermarkLightness{0.9}
+% Default underscore is too low and wide. Cannot use lstlisting "literate" as replacing underscore
+% removes it as a variable-name character so keywords in variables are highlighted. MUST APPEAR
+% AFTER HYPERREF.
+\renewcommand{\textunderscore}{\leavevmode\makebox[1.2ex][c]{\rule{1ex}{0.075ex}}}
+\setlength{\topmargin}{-0.45in}                                                 % move running title into header
+\setlength{\headsep}{0.25in}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\CFAStyle                                                                                               % use default CFA format-style
+\lstnewenvironment{C++}[1][]                            % use C++ style
+{\lstset{language=C++,moredelim=**[is][\protect\color{red}]{®}{®}#1}}
+{}
+\newcommand{\CFALatin}{}
 % inline code ©...© (copyright symbol) emacs: C-q M-)
 % red highlighting ®...® (registered trademark symbol) emacs: C-q M-.
 …
 % keyword escape ¶...¶ (pilcrow symbol) emacs: C-q M-^
 % math escape $...$ (dollar symbol)
+\input{common}                                          % common CFA document macros
+\usepackage[dvips,plainpages=false,pdfpagelabels,pdfpagemode=UseNone,colorlinks=true,pagebackref=true,linkcolor=blue,citecolor=blue,urlcolor=blue,pagebackref=true,breaklinks=true]{hyperref}
+\usepackage{breakurl}
+\renewcommand{\UrlFont}{\small\sf}
+\usepackage[pagewise]{lineno}
+\renewcommand{\linenumberfont}{\scriptsize\sffamily}
+\usepackage[firstpage]{draftwatermark}
+\SetWatermarkLightness{0.9}
+% Default underscore is too low and wide. Cannot use lstlisting "literate" as replacing underscore
+% removes it as a variable-name character so keywords in variables are highlighted. MUST APPEAR
+% AFTER HYPERREF.
+\renewcommand{\textunderscore}{\leavevmode\makebox[1.2ex][c]{\rule{1ex}{0.075ex}}}
+\setlength{\topmargin}{-0.45in}                                                 % move running title into header
+\setlength{\headsep}{0.25in}
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\CFAStyle                                                                                               % use default CFA format-style
+\lstnewenvironment{C++}[1][]                            % use C++ style
+{\lstset{language=C++,moredelim=**[is][\protect\color{red}]{®}{®},#1}}
+{}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 % Names used in the document.
 \newcommand{\Version}{\input{../../version}}
+\newcommand{\Version}{\input{build/version}}
 \newcommand{\Textbf}[2][red]{{\color{#1}{\textbf{#2}}}}
 \newcommand{\Emph}[2][red]{{\color{#1}\textbf{\emph{#2}}}}

doc/theses/thierry_delisle_PhD/.gitignore

-              r3c64c668
+              r58fe85a
 comp_II/build/
+comp_II/img/*.fig.bak
 comp_II/comp_II.pdf
 comp_II/comp_II.ps
+comp_II/presentation.pdf
+thesis/build/
+thesis/fig/*.fig.bak
+thesis/thesis.pdf
+thesis/thesis.ps
 !Makefile

doc/theses/thierry_delisle_PhD/comp_II/Makefile

-              r3c64c668
+              r58fe85a
 Build = build
 Figures = figures
+Figures = img
 Macros = ../../../LaTeXmacros
 TeXLIB = .:${Macros}:${Build}:../../../bibliography:
 …
 ## Define the text source files.
-SOURCES = ${addsuffix .tex, \
-comp_II \
+}
 FIGURES = ${addsuffix .tex, \
+        emptybit \
+        emptytree \
+        emptytls \
+        resize \
+}
 PICTURES = ${addsuffix .pstex, \
+        base \
+        empty \
+        system \
+}
 …
 ## Define the documents that need to be made.
+all: comp_II.pdf presentation.pdf
+comp_II.pdf: ${FIGURES} ${PICTURES}
+presentation.pdf: presentationstyle.sty base.dark.pstex empty.dark.pstex system.dark.pstex
 DOCUMENT = comp_II.pdf
+DOCUMENT = comp_II.pdf presentation.pdf
 BASE = ${basename ${DOCUMENT}}
 …
 # File Dependencies #
+${DOCUMENT} : ${BASE}.ps
+%.pdf : build/%.ps | ${Build}
         ps2pdf $<
+${BASE}.ps : ${BASE}.dvi
         dvips ${Build}/$< -o $@
+build/%.ps : build/%.dvi | ${Build}
+        dvips $< -o $@
+${BASE}.dvi : Makefile ${GRAPHS} ${PROGRAMS} ${PICTURES} ${FIGURES} ${SOURCES} \
+                ${Macros}/common.tex ${Macros}/indexstyle ../../../bibliography/pl.bib \
+                local.bib glossary.tex | ${Build}
+build/%.dvi : %.tex Makefile | ${Build}
         # Must have *.aux file containing citations for bibtex
         if [ ! -r ${basename $@}.aux ] ; then ${LaTeX} ${basename $@}.tex ; fi
         -${BibTeX} ${Build}/${basename $@}
+        if [ ! -r ${basename $@}.aux ] ; then ${LaTeX} $< ; fi
+        -${BibTeX} ${basename $@}
         # Some citations reference others so run again to resolve these citations
         ${LaTeX} ${basename $@}.tex
         -${BibTeX} ${Build}/${basename $@}
+        ${LaTeX} $<
+        -${BibTeX} ${basename $@}
         # Make index from *.aux entries and input index at end of document
         makeglossaries -q -s ${Build}/${basename $@}.ist ${Build}/${basename $@}
+        -makeglossaries -q -s ${basename $@}.ist ${basename $@}
         # Run again to finish citations
         ${LaTeX} ${basename $@}.tex
+        ${LaTeX} $<
 ## Define the default recipes.
 …
         mkdir -p ${Build}
 %.tex : %.fig ${Build}
+%.tex : img/%.fig | ${Build}
         fig2dev -L eepic $< > ${Build}/$@
 %.ps : %.fig | ${Build}
+%.ps : img/%.fig | ${Build}
         fig2dev -L ps $< > ${Build}/$@
 %.pstex : %.fig | ${Build}
+%.pstex : img/%.fig | ${Build}
         fig2dev -L pstex $< > ${Build}/$@
+        fig2dev -L pstex_t -p ${Build}/$@ $< > ${Build}/$@_t
+## pstex with inverted colors
+%.dark.pstex : img/%.fig Makefile | ${Build}
+        fig2dev -L pstex $< > ${Build}/$@
+        sed -i 's/\/col-1 {0 setgray} bind def/\/col-1 {1 setgray} bind def/g' ${Build}/$@
+        sed -i 's/\/col0 {0.000 0.000 0.000 srgb} bind def/\/col0 {1.000 1.000 1.000 srgb} bind def/g' ${Build}/$@
+        sed -i 's/\/col7 {1.000 1.000 1.000 srgb} bind def/\/col7 {0.000 0.000 0.000 srgb} bind def/g' ${Build}/$@
         fig2dev -L pstex_t -p ${Build}/$@ $< > ${Build}/$@_t

doc/theses/thierry_delisle_PhD/comp_II/comp_II.tex

-              r3c64c668
+              r58fe85a
+\documentclass[11pt,fullpage]{article}
+\documentclass[11pt]{article}
+\usepackage{fullpage}
 \usepackage[T1]{fontenc}
 \usepackage[utf8]{inputenc}
-\usepackage{listings}           % for code listings
 \usepackage{xspace}
 \usepackage{xcolor}
 \usepackage{graphicx}
+\usepackage[hidelinks]{hyperref}
+\usepackage{epic,eepic}
+\usepackage{listings}                   % for code listings
 \usepackage{glossaries}
 \usepackage{textcomp}
-\usepackage{geometry}
 % cfa macros used in the document
 \input{common}
+\setlist{topsep=6pt,parsep=0pt}         % global reduce spacing between points
+\newcommand{\uC}{$\mu$\CC}
+\usepackage[hidelinks]{hyperref}
+\setlength{\abovecaptionskip}{5pt plus 3pt minus 2pt}
+\lstMakeShortInline$%                   % single-character for \lstinline
+%\usepackage[margin=1in]{geometry}
+%\usepackage{float}
 \input{glossary}
 …
 \author{
         \huge Thierry Delisle \\
         \Large \vspace*{0.1in} \texttt{tdelisle@uwaterloo.ca} \\
+        \huge Thierry Delisle \vspace*{5pt} \\
+        \Large \texttt{tdelisle@uwaterloo.ca} \vspace*{5pt} \\
         \Large Cheriton School of Computer Science \\
         \Large University of Waterloo
 …
 \begin{document}
 \maketitle
+\thispagestyle{empty}
 \cleardoublepage
 \newcommand{\cit}{\textsuperscript{[Citation Needed]}\xspace}
 \newcommand{\TODO}{~\newline{\large\bf\color{red} TODO :}\xspace}
+\newcommand{\TODO}{{\large\bf\color{red} TODO: }\xspace}
 % ===============================================================================
 …
 \section{Introduction}
 \subsection{\CFA and the \CFA concurrency package}
+\CFA\cit is a modern, polymorphic, non-object-oriented, backwards-compatible extension of the C programming language. It aims to add high productivity features while maintaining the predictable performance of C. As such concurrency in \CFA\cit aims to offer simple and safe high-level tools while still allowing performant code. Concurrent code is written in the synchronous programming paradigm but uses \glspl{uthrd} in order to achieve the simplicity and maintainability of synchronous programming without sacrificing the efficiency of asynchronous programming. As such the \CFA scheduler is a user-level scheduler that maps \glspl{uthrd} onto \glspl{kthrd}.
+The goal of this research is to produce a scheduler that is simple to use and offers acceptable performance in all cases. Here simplicity does not refer to the API but to how much scheduling concerns programmers need to take into account when using the \CFA concurrency package. Therefore, the main goal of this proposal is as follows :
+\CFA~\cite{Moss18} is a modern, polymorphic, non-object-oriented, concurrent, backwards-compatible extension of the C programming language.
+It aims to add high-productivity features while maintaining the predictable performance of C.
+As such, concurrency in \CFA~\cite{Delisle19} aims to offer simple and safe high-level tools while still allowing performant code.
+\CFA concurrent code is written in the synchronous programming paradigm but uses \glspl{uthrd} to achieve the simplicity and maintainability of synchronous programming without sacrificing the efficiency of asynchronous programming.
+As such, the \CFA \newterm{scheduler} is a preemptive user-level scheduler that maps \glspl{uthrd} onto \glspl{kthrd}.
+\subsection{Scheduling}
+\newterm{Scheduling} occurs when execution switches from one thread to another, where the second thread is implicitly chosen by the scheduler.
+This scheduling is an indirect handoff, as opposed to generators and coroutines that explicitly switch to the next generator and coroutine respectively.
+The cost of switching between two threads for an indirect handoff has two components:
+\begin{enumerate}
+\item
+the cost of actually context-switching, \ie changing the relevant registers to move execution from one thread to the other,
+\item
+and the cost of scheduling, \ie deciding which thread to run next among all the threads ready to run.
+\end{enumerate}
+The first cost is generally constant\footnote{Affecting the constant context-switch cost is whether it is done in one step, where the first thread schedules the second, or in two steps, where the first thread context switches to a third scheduler thread.}, while the scheduling cost can vary based on the system state.
+Adding multiple \glspl{kthrd} does not fundamentally change the scheduler semantics or requirements, it simply adds new correctness requirements, \ie \newterm{linearizability}\footnote{Meaning however fast the CPU threads run, there is an equivalent sequential order that gives the same result.}, and a new dimension to performance: scalability, where scheduling cost also depends on contention.
+The more threads switch, the more the administration cost of scheduling becomes noticeable.
+It is therefore important to build a scheduler with the lowest possible cost and latency.
+Another important consideration is \newterm{fairness}.
+In principle, scheduling should give the illusion of perfect fairness, where all threads ready to run are running \emph{simultaneously}.
+In practice, there can be advantages to unfair scheduling, similar to the express cash register at a grocery store.
+While the illusion of simultaneity is easier to reason about, it can break down if the scheduler allows too much unfairness.
+Therefore, the scheduler should offer as much fairness as needed to guarantee eventual progress, but use unfairness to help performance.
+\subsection{Research Goal}
+The goal of this research is to produce a scheduler that is simple for programmers to understand and offers good general performance.
+Here understandability does not refer to the API but to how much scheduling concerns programmers need to take into account when writing a \CFA concurrent package.
+Therefore, the main consequence of this goal is :
 \begin{quote}
 The \CFA scheduler should be \emph{viable} for any workload.
+The \CFA scheduler should be \emph{viable} for \emph{any} workload.
 \end{quote}
+This objective includes producing a scheduling strategy with minimal fairness guarantees, creating an abstraction layer over the operating system to handle kernel-threads spinning unnecessarily and hide blocking I/O operations, and writing sufficient library tools to allow developers to properly use the scheduler.
+% ===============================================================================
+% ===============================================================================
+\section{Scheduling for \CFA}
+While the \CFA concurrency package doesn't have any particular scheduling needs beyond those of any concurrency package which uses \glspl{uthrd}, it is important that the default \CFA Scheduler be viable in general. Indeed, since the \CFA Scheduler does not target any specific workloads, it is unrealistic to demand that it use the best scheduling strategy in all cases. However, it should offer a viable ``out of the box'' solution for most scheduling problems so that programmers can quickly write performant concurrent code without needing to think about which scheduling strategy is more appropriate for their workload. Indeed, only programmers with exceptionally high performance requirements should need to write their own scheduler. More specifically, two broad types of scheduling strategies should be avoided in order to avoid penalizing certain types of workloads : feedback-based and priority schedulers.
+For a general-purpose scheduler, it is impossible to produce an optimal algorithm as that requires knowledge of the future behaviour of threads.
+As such, scheduling performance is generally either defined by a best-case scenario, \ie a workload to which the scheduler is tailored, or a worst-case scenario, \ie the scheduler behaves no worse than \emph{X}.
+For this proposal, the performance is evaluated using the second approach to allow \CFA programmers to rely on scheduling performance.
+Because there is no optimal scheduler, ultimately \CFA may allow programmers to write their own scheduler; but that is not the subject of this proposal, which considers only the default scheduler.
+As such, it is important that only programmers with exceptionally high performance requirements should need to write their own scheduler and replace the scheduler in this proposal.
+Achieving the \CFA scheduling goal includes:
+\begin{enumerate}
+        \item producing a scheduling strategy with sufficient fairness guarantees,
+        \item creating an abstraction layer over the operating system to handle kernel-threads spinning unnecessarily,
+        \item scheduling blocking I/O operations,
+        \item and writing sufficient library tools to allow developers to indirectly use the scheduler, either through tuning knobs in the default scheduler or replacing the default scheduler.
+\end{enumerate}
+% ===============================================================================
+% ===============================================================================
+\section{\CFA Scheduling}
+To schedule user-level threads across all workloads, the scheduler has a number of requirements:
+\paragraph{Correctness} As with any other concurrent data structure or algorithm, the correctness requirement is paramount.
+The scheduler cannot allow threads to be dropped from the ready queue, \ie scheduled but never run, or be executed multiple times when only being scheduled once.
+Since \CFA concurrency has no spurious wake up, this definition of correctness also means the scheduler should have no spurious wake up.
+The \CFA scheduler must be correct.
+\paragraph{Performance} The performance of a scheduler can generally be measured in terms of scheduling cost, scalability and latency.
+\newterm{Scheduling cost} is the cost to switch from one thread to another, as mentioned above.
+For compute-bound concurrent applications with little context switching, the scheduling cost is negligible.
+For applications with high context-switch rates, scheduling cost can begin to dominate the cost.
+\newterm{Scalability} is the cost of adding multiple kernel threads.
+It can increase the time for scheduling because of contention from the multiple threads accessing shared resources, \eg a single ready queue.
+Finally, \newterm{tail latency} is service delay and relates to thread fairness.
+Specifically, latency measures how long a thread waits to run once scheduled and is evaluated by the worst case.
+The \CFA scheduler should offer good performance for all three metrics.
+\paragraph{Fairness} Like performance, this requirement has several aspects : eventual progress, predictability and performance reliability.
+\newterm{Eventual progress} guarantees every scheduled thread is eventually run, \ie prevent starvation.
+As a hard requirement, the \CFA scheduler must guarantee eventual progress, otherwise the above-mentioned illusion of simultaneous execution is broken and the scheduler becomes much more complex to reason about.
+\newterm{Predictability} and \newterm{reliability} mean similar workloads achieve similar performance so programmer execution intuition is respected.
+For example, a thread that yields aggressively should not run more often than other threads.
+While this is intuitive, it does not hold true for many work-stealing or feedback based schedulers.
+The \CFA scheduler must guarantee eventual progress, should be predictable, and offer reliable performance.
+\paragraph{Efficiency} Finally, efficient usage of CPU resources is also an important requirement and is discussed in depth towards the end of the proposal.
+\newterm{Efficiency} means avoiding using CPU cycles when there are no threads to run (to conserve energy), and conversely, using as many available CPU cycles as possible when the workload can benefit from them.
+Balancing these two states is where the complexity lies.
+The \CFA scheduler should be efficient with respect to the underlying (shared) computer.
+\bigskip To achieve these requirements, I can reject two broad types of scheduling strategies : feedback-based and priority schedulers.
 \subsection{Feedback-Based Schedulers}
+Many operating systems use schedulers based on feedback loops in some form: they measure how much CPU a particular thread has used\footnote{Different metrics can be used here but it is not relevant to the discussion.} and schedule threads based on this metric. These strategies are sensible for operating systems but rely on two assumptions on the workload :
+\begin{enumerate}
+        \item Threads live long enough to be scheduled many times.
+        \item Cooperation among all threads is not simply infeasible, it is a security risk.
+\end{enumerate}
+While these two assumptions generally hold for operating systems, they may not for \CFA programs. In fact, \CFA uses \glspl{uthrd} which have the explicit goal of reducing the cost of threading primitives to allow many smaller threads. This can naturally lead to threads with much shorter lifetimes that are only scheduled a few times. Scheduling strategies based on feedback loops cannot be effective in these cases because they will not have the opportunity to measure the metrics that underlie the algorithm. Note that the problem of feedback loop convergence (reacting too slowly to scheduling events) is not specific to short lived threads but can also occur with threads that show drastic changes in scheduling events, e.g., threads running for long periods of time and then suddenly blocking and unblocking quickly and repeatedly.
+In the context of operating systems, these concerns can be overshadowed by a more pressing concern : security. When multiple users are involved, it is possible that some users are malevolent and try to exploit the scheduling strategy in order to achieve some nefarious objective. Security concerns mean that more precise and robust fairness metrics must be used. In the case of the \CFA scheduler, every thread runs in the same user-space and is controlled by the same user. It is then possible to safely ignore the possibility that threads are malevolent and assume that all threads will ignore or cooperate with each other. This allows for a much simpler fairness metric and in this proposal ``fairness'' will be considered as equal opportunities to run once scheduled.
+Since feedback is not necessarily feasible within the lifetime of all threads and a simple fairness metric can be used, the scheduling strategy proposed for the \CFA runtime does not use per-thread feedback. Feedback loops in general are not rejected for secondary concerns like idle sleep, but no feedback loop is used to decide which thread to run next.
+Many operating systems use schedulers based on feedback in some form, \eg measuring how much CPU a particular thread has used\footnote{Different metrics can be measured but it is not relevant to the discussion.} and schedule threads based on this metric.
+These strategies are sensible for operating systems but rely on two assumptions for the workload:
+\begin{enumerate}
+        \item Threads live long enough for useful feedback information to be gathered.
+        \item Threads belong to multiple users so fairness across users is important.
+\end{enumerate}
+While these two assumptions generally hold for operating systems, they may not for user-level threading.
+Since \CFA has the explicit goal of allowing many smaller threads, this can naturally lead to threads with much shorter lifetimes that are only scheduled a few times.
+Scheduling strategies based on feedback cannot be effective in these cases because there is no opportunity to measure the metrics that underlie the algorithm.
+Note, the problem of \newterm{feedback convergence} (reacting too slowly to scheduling events) is not specific to short-lived threads but can also occur with threads that show drastic changes in scheduling, \eg threads running for long periods of time and then suddenly blocking and unblocking quickly and repeatedly.
+In the context of operating systems, these concerns can be overshadowed by a more pressing concern : security.
+When multiple users are involved, it is possible some users are malevolent and try to exploit the scheduling strategy to achieve some nefarious objective.
+Security concerns mean more precise and robust fairness metrics must be used to guarantee fairness across processes created by users as well as threads created within a process.
+In the case of the \CFA scheduler, every thread runs in the same user space and is controlled by the same user.
+Fairness across users is therefore a given and it is then possible to safely ignore the possibility that threads are malevolent.
+This approach allows for a much simpler fairness metric, and in this proposal, \emph{fairness} is defined as:
+\begin{quote}
+When multiple threads are cycling through the system, the total ordering of threads being scheduled, \ie pushed onto the ready queue, should not differ much from the total ordering of threads being executed, \ie popped from the ready queue.
+\end{quote}
+Since feedback is not necessarily feasible within the lifetime of all threads and a simple fairness metric can be used, the scheduling strategy proposed for the \CFA runtime does not use per-thread feedback.
+Feedback in general is not rejected for secondary concerns like idle sleep for kernel threads, but no feedback is used to decide which thread to run next.
 \subsection{Priority Schedulers}
+Another broad category of schedulers are priority schedulers. In these scheduling strategies threads have priorities and the runtime schedules the threads with the highest priority before scheduling other threads. Threads with equal priority are scheduled using a secondary strategy, often something simple like round-robin or FIFO. These priorities mean that, as long as there is a thread with a higher priority that desires to run, a thread with a lower priority will not run. This possible starving of threads can dramatically increase programming complexity since starving threads and priority inversion (prioritising a lower priority thread) can both lead to serious problems, leaving programmers between a rock and a hard place.
+An important observation to make is that threads do not need to have explicit priorities for problems to be possible. Indeed, any system with multiple ready-queues that attempts to exhaust one queue before accessing the other queues could encounter starvation problems. A popular scheduling strategy that suffers from implicit priorities is work-stealing. Work-stealing is generally presented as follows :
+\begin{itemize}
+        \item Each processor has a list of threads.
+\end{itemize}
+\begin{enumerate}
+        \item Run threads from ``this'' processor's list.
+        \item If ``this'' processor's list is empty, run threads from some other processor's list.
+\end{enumerate}
+In a loaded system\footnote{A loaded system is a system where threads are being run at the same rate they are scheduled}, if a thread does not yield or block for an extended period of time, threads on the same processor list will starve if no other processors can exhaust their list.
+Since priorities can be complex to handle for programmers, the scheduling strategy proposed for the \CFA runtime does not use a strategy with either implicit or explicit thread priorities.
+\subsection{Schedulers without feedback or priorities}
+I claim that the ideal default scheduler for the \CFA runtime is a scheduler that offers good scalability and a simple fairness guarantee that is easy for programmers to reason about. The simplest fairness guarantee is to guarantee FIFO ordering, i.e., threads scheduled first will run first. However, enforcing FIFO ordering generally conflicts with scalability across multiple processors because of the additional synchronization. Thankfully, strict FIFO is not needed for scheduling. Since concurrency is inherently non-deterministic, fairness concerns in scheduling are only a problem if a thread repeatedly runs before another thread can run\footnote{This is because the non-determinism means that programmers must already handle ordering problems in order to produce correct code and already must rely on weak guarantees, for example that a specific thread will \emph{eventually} run.}. This need for unfairness to persist before problems occur means that the FIFO fairness guarantee can be significantly relaxed without causing problems. For this proposal, the target guarantee is that the \CFA scheduler guarantees \emph{probable} FIFO ordering, which is defined as follows :
+\begin{itemize}
+        \item Given two threads $X$ and $Y$, the odds that thread $X$ runs $N$ times \emph{after} thread $Y$ is scheduled but \emph{before} it is run, decreases exponentially with regards to $N$.
+\end{itemize}
+While this is not a strong guarantee, the probability that problems persist for long periods of time decreases exponentially, making persisting problems virtually impossible.
+\subsection{Real-Time}
+While the objective of this proposed scheduler is similar to the objective of real-time scheduling, this proposal is not a proposal for a real-time scheduler and as such makes no attempt to offer either soft or hard guarantees on scheduling delays.
+% ===============================================================================
+% ===============================================================================
+\section{Proposal}
+\subsection{Ready-Queue}
+Using Trevor's paper\cit as a basis, it is simple to build a relaxed FIFO list that is fast and scalable for loaded or overloaded systems. The described queue uses an array of underlying strictly FIFO queues. Pushing new data is done by selecting one of these underlying queues at random, recording a timestamp for the push and pushing to the selected queue. Popping is done by selecting two queues at random and popping from the queue for which the head has the oldest timestamp. In loaded or overloaded systems, it is highly likely that the queues are far from empty, i.e., several tasks are on each of the underlying queues. This means that selecting a queue at random to pop from is highly likely to yield a queue that is not empty.
+When the ready queue is ``more empty'', i.e., several of the inner queues are empty, selecting a random queue for popping is less likely to yield a valid selection and more attempts need to be made, resulting in a performance degradation. In cases with few elements on the ready queue and few processors running, performance can be improved by adding information to help processors find which inner queues are used. Preliminary performance tests indicate that with few processors, a bitmask can be used to identify which inner queues are currently in use. This is especially effective in the single-thread case, where the bitmask will always be up-to-date. Furthermore, modern x86 CPUs have a BMI2 extension which allows using the bitmask with very little overhead over directly accessing the readyqueue, offering decent performance even in cases with many empty inner queues. This technique does not solve the problem completely, it randomly attempts to find a block of 64 queues where at least one is used, instead of attempting to find a used queue. For systems with a large number of cores this does not completely solve the problem, but it is a fixed improvement. The size of the blocks is limited by the maximum size atomic instructions can operate on, therefore atomic instructions on larger words would increase the 64 queues per block limit.
+\TODO double check the next sentence
+Preliminary results indicate that the bitmask approach with the BMI2 extension can lead to multi-threaded performance that is contention agnostic in the worst case.
+This result suggests that the contention penalty and the increased performance for additional threads cancel each other exactly. This may indicate that a relatively small reduction in contention may tip the performance into positive scaling even for the worst case. It can be noted that in cases of high-contention, the use of the bitmask to find queues that are not empty is much less reliable. Indeed, if contention on the bitmask is high, it means it probably changes significantly between the moment it is read and the actual operation on the queues it represents. Furthermore, the objective of the bitmask is to avoid probing queues that are empty. Therefore, in cases where the bitmask is highly contended, it may be preferable to probe queues randomly, either until contention decreases or until a prior prefetch of the bitmask completes. Ideally, the scheduler would be able to observe that the bitmask is highly contended and adjust its behaviour appropriately. However, I am not aware of any mechanism to query whether a cacheline is in cache or to run other instructions until a cacheline is fetched without blocking on the cacheline. As such, an alternative that may have a similar impact would be for each thread to have their own bitmask, which would be updated both after each scheduler action and after a certain number of failed probes. If the bitmask has little contention, the local bitmask will be mostly up-to-date and several threads won't need to contend as much on the global bitmask. If the bitmask has significant contention, then fetching it becomes more expensive and threads may as well probe randomly. This solution claims that probing randomly or against an out-of-date bitmask is equivalent.
+In cases where this is insufficient, another approach is to use a hierarchical data structure. Creating a tree of nodes to reduce contention has been shown to work in similar cases\cit(SNZI: Scalable NonZero Indicators)\footnote{This particular paper seems to be patented in the US. How does that affect \CFA? Can I use it in my work?}. However, this approach may lead to poorer single-threaded performance due to the inherent pointer chasing, as such, it was not considered as the first approach but as a fallback in case the bitmask approach does not satisfy the performance goals.
+Part of this performance relies on contention being low when there are few threads on the readyqueue. However, this can be assumed reliably if the system handles putting idle processors to sleep, which is addressed in section \ref{sleep}.
+Another broad category of schedulers are priority schedulers.
+In these scheduling strategies, threads have priorities and the runtime schedules the threads with the highest priority before scheduling other threads.
+Threads with equal priority are scheduled using a secondary strategy, often something simple like round robin or FIFO.
+A consequence of priority is that, as long as there is a thread with a higher priority that desires to run, a thread with a lower priority does not run.
+The potential for thread starvation dramatically increases programming complexity since starving threads and priority inversion (prioritizing a lower priority thread) can both lead to serious problems.
+An important observation is that threads do not need to have explicit priorities for problems to occur.
+Indeed, any system with multiple ready queues that attempts to exhaust one queue before accessing the other queues, essentially provides implicit priority, which can encounter starvation problems.
+For example, a popular scheduling strategy that suffers from implicit priorities is work stealing.
+\newterm{Work stealing} is generally presented as follows:
+\begin{enumerate}
+        \item Each processor has a list of ready threads.
+        \item Each processor runs threads from its ready queue first.
+        \item If a processor's ready queue is empty, attempt to run threads from some other processor's ready queue.
+\end{enumerate}
+In a loaded system\footnote{A \newterm{loaded system} is a system where threads are being run at the same rate they are scheduled.}, if a thread does not yield, block, or preempt for an extended period of time, threads on the same processor's list starve if no other processors exhaust their list.
+Since priorities can be complex for programmers to incorporate into their execution intuition, the \CFA scheduling strategy does not provide explicit priorities and attempts to eliminate implicit priorities.
+\subsection{Schedulers without feedback or priorities}
+This proposal conjectures that it is possible to construct a default scheduler for the \CFA runtime that offers good scalability and a simple fairness guarantee that is easy for programmers to reason about.
+The simplest fairness guarantee is FIFO ordering, \ie threads scheduled first run first.
+However, enforcing FIFO ordering generally conflicts with scalability across multiple processors because of the additional synchronization.
+Thankfully, strict FIFO is not needed for sufficient fairness.
+Since concurrency is inherently non-deterministic, fairness concerns in scheduling are only a problem if a thread repeatedly runs before another thread can run.
+Some relaxation is possible because non-determinism means programmers already handle ordering problems to produce correct code and hence rely on weak guarantees, \eg that a thread \emph{eventually} runs.
+Since some reordering does not break correctness, the FIFO fairness guarantee can be significantly relaxed without causing problems.
+For this proposal, the target guarantee is that the \CFA scheduler provides \emph{probable} FIFO ordering, which allows reordering but makes it improbable that threads are reordered far from their position in total ordering.
+The \CFA scheduler fairness is defined as follows:
+\begin{quote}
+Given two threads $X$ and $Y$, the odds that thread $X$ runs $N$ times \emph{after} thread $Y$ is scheduled but \emph{before} it is run, decreases exponentially with regard to $N$.
+\end{quote}
+While this is not a bounded guarantee, the probability that unfairness persists for long periods of time decreases exponentially, making persisting unfairness virtually impossible.
+% ===============================================================================
+% ===============================================================================
+\section{Proposal Details}
+\subsection{Central Ready Queue} \label{sec:queue}
+A central ready queue can be built from a FIFO queue, where user threads are pushed onto the queue when they are ready to run, and processors (kernel-threads acting as virtual processors) pop the user threads from the queue and execute them.
+Alistarh \etal~\cite{alistarh2018relaxed} show it is straightforward to build a relaxed FIFO list that is fast and scalable for loaded or overloaded systems.
+The described queue uses an array of underlying strictly FIFO queues as shown in Figure~\ref{fig:base}\footnote{For this section, the number of underlying queues is assumed to be constant.
+Section~\ref{sec:resize} discusses resizing the array.}.
+Pushing new data is done by selecting one of the underlying queues at random, recording a timestamp for the operation, and pushing to the selected queue.
+Popping is done by selecting two queues at random and popping from the queue with the oldest timestamp.
+A higher number of underlying queues leads to less contention on each queue and therefore better performance.
+In a loaded system, it is highly likely the queues are non-empty, \ie several threads are on each of the underlying queues.
+For this case, selecting a queue at random to pop from is highly likely to yield a queue with available items.
+In Figure~\ref{fig:base}, ignoring the ellipsis, the chance of getting an empty queue is 2/7 per pick, meaning two random picks yield an item approximately 9 times out of 10.
+\begin{figure}
+        \begin{center}
+                \input{base.pstex_t}
+        \end{center}
+        \caption{Loaded relaxed FIFO list based on an array of strictly FIFO lists.
+        A timestamp appears in each node and array cell.}
+        \label{fig:base}
+\end{figure}
+\begin{figure}
+        \begin{center}
+                \input{empty.pstex_t}
+        \end{center}
+        \caption{Underloaded relaxed FIFO list where the array contains many empty cells.}
+        \label{fig:empty}
+\end{figure}
+In an underloaded system, several of the queues are empty, so selecting a random queue for popping is less likely to yield a successful selection and more attempts are needed, resulting in a performance degradation.
+Figure~\ref{fig:empty} shows an example with fewer elements, where the chance of getting an empty queue is 5/7 per pick, meaning two random picks yield an item only half the time.
+Since the ready queue is not empty, the pop operation \emph{must} find an element before returning and therefore must retry.
+Note, the popping kernel thread has no work to do, but CPU cycles are wasted both for available user and kernel threads during the pop operation as the popping thread is using a CPU.
+Overall performance is therefore influenced by the contention on the underlying queues and pop performance is influenced by the item density.
+This leads to four performance cases for the centralized ready queue, as depicted in Table~\ref{tab:perfcases}.
+The number of processors (many or few) refers to the number of kernel threads \emph{actively} attempting to pop user threads from the queues, not the total number of kernel threads.
+The number of threads (many or few) refers to the number of user threads ready to be run.
+Many threads means they outnumber processors significantly and most underlying queues have items; few threads means there are barely more threads than processors and most underlying queues are empty.
+Cases with fewer threads than processors are discussed in Section~\ref{sec:sleep}.
+\begin{table}
+        \begin{center}
+                \begin{tabular}{|r|l|l|}
+                        \cline{2-3}
+                        \multicolumn{1}{r|}{} & \multicolumn{1}{c|}{Many Processors} & \multicolumn{1}{c|}{Few Processors} \\
+                        \hline
+                        Many Threads & A: good performance & B: good performance \\
+                        \hline
+                        Few Threads  & C: worst performance & D: poor performance \\
+                        \hline
+                \end{tabular}
+        \end{center}
+        \caption{Expected performance of the relaxed FIFO list in different cases.}
+        \label{tab:perfcases}
+\end{table}
+Performance can be improved in Table~\ref{tab:perfcases} case~D by adding information to help processors find which inner queues are used.
+This addition aims to avoid the cost of retrying the pop operation but does not affect contention on the underlying queues and can incur some management cost for both push and pop operations.
+The approach used to encode this information can vary in density and be either global or local.
+\newterm{Density} means the information is either packed in a few cachelines or spread across several cachelines, and \newterm{local information} means each thread uses an independent copy instead of a single global, \ie common, source of information.
+For example, Figure~\ref{fig:emptybit} shows a dense bitmask to identify which inner queues are currently in use.
+This approach means processors can often find user threads in constant time, regardless of how many underlying queues are empty.
+Furthermore, modern x86 CPUs have extended bit manipulation instructions (BMI2) that allow using the bitmask with very little overhead compared to the randomized selection approach for a filled ready queue, offering good performance even in cases with many empty inner queues.
+However, this technique has its limits: with a single word\footnote{Word refers here to however many bits can be written atomically.} bitmask, the total number of underlying queues in the ready queue is limited to the number of bits in the word.
+With a multi-word bitmask, this maximum limit can be increased arbitrarily, but it is not possible to check if the queue is empty by reading the bitmask atomically.
+Finally, a dense bitmap, either single or multi-word, causes additional problems in Table~\ref{tab:perfcases} case C, because many processors are continuously scanning the bitmask to find the few available threads.
+This increased contention on the bitmask(s) reduces performance because of cache misses after updates and the bitmask is updated more frequently by the scanning processors racing to read and/or update that information.
+This increased update frequency means the information in the bitmask is more often stale before a processor can use it to find an item, \ie mask read says there are available user threads but none on queue.
+\begin{figure}
+        \begin{center}
+                {\resizebox{0.73\textwidth}{!}{\input{emptybit}}}
+        \end{center}
+        \vspace*{-5pt}
+        \caption{Underloaded queue with added bitmask to indicate which array cells have items.}
+        \label{fig:emptybit}
+        \begin{center}
+                {\resizebox{0.73\textwidth}{!}{\input{emptytree}}}
+        \end{center}
+        \vspace*{-5pt}
+        \caption{Underloaded queue with added binary search tree to indicate which array cells have items.}
+        \label{fig:emptytree}
+        \begin{center}
+                {\resizebox{0.9\textwidth}{!}{\input{emptytls}}}
+        \end{center}
+        \vspace*{-5pt}
+        \caption{Underloaded queue with added per processor bitmask to indicate which array cells have items.}
+        \label{fig:emptytls}
+\end{figure}
+Figure~\ref{fig:emptytree} shows an approach using a hierarchical tree data-structure to reduce contention and has been shown to work in similar cases~\cite{ellen2007snzi}.
+However, this approach may lead to poorer performance in Table~\ref{tab:perfcases} case~B due to the inherent pointer chasing cost and already low contention cost in that case.
+Figure~\ref{fig:emptytls} shows an approach using dense information, similar to the bitmap, but has each thread keep its own independent copy of it.
+While this approach can offer good scalability \emph{and} low latency, the liveliness of the information can become a problem.
+In the simple cases, local copies can become stale and end-up not being useful for the pop operation.
+A more serious problem is that reliable information is necessary for some parts of this algorithm to be correct.
+As mentioned in this section, processors must know \emph{reliably} whether the list is empty or not to decide if they can return \texttt{NULL} or if they must keep looking during a pop operation.
+Section~\ref{sec:sleep} discusses another case where reliable information is required for the algorithm to be correct.
+There is a fundamental tradeoff among these approaches.
+Dense global information about empty underlying queues helps zero-contention cases at the cost of the high-contention case.
+Sparse global information helps high-contention cases but increases latency in zero-contention cases to read and ``aggregate'' the information\footnote{Hierarchical structures, \eg binary search tree, effectively aggregate information but follow pointer chains, learning information at each node.
+Similarly, other sparse schemes need to read multiple cachelines to acquire all the information needed.}.
+Finally, dense local information has both the advantages of low latency in zero-contention cases and scalability in high-contention cases.
+However, the information can become stale making it difficult to use to ensure correctness.
+The fact that these solutions have these fundamental limits suggests to me a better solution that attempts to combine these properties in an interesting way.
+Also, the lock discussed in Section~\ref{sec:resize} allows for solutions that adapt to the number of processors, which could also prove useful.
 \paragraph{Objectives and Existing Work}
+How much scalability is actually needed is highly debatable; libfibre\cit has compared favorably to other schedulers in webserver tests\cit and uses a single atomic counter in its scheduling algorithm similarly to the proposed bitmask. As such the single atomic instruction on a shared cacheline may be sufficiently performant.
+I have built a prototype of this ready-queue (including the bitmask and BMI2 usage, but not the sharded bitmask) and ran performance experiments on it but it is difficult to compare this prototype to a thread scheduler as the prototype is used as a data-queue. I have also integrated this prototype into the \CFA runtime, but have not yet created performance experiments to compare results. I believe that the bitmask approach is currently one of the larger risks of the proposal, early tests lead me to believe it may work but it is not clear that the contention problem can be overcome. The worst-case scenario is a case where the number of processors and the number of ready threads are similar, yet scheduling events are very frequent. Fewer threads should lead to the Idle Sleep mechanism reducing contention while having many threads ready leads to optimal performance. It is difficult to evaluate the likeliness of this worst-case scenario in real workloads. I believe, frequent scheduling events suggest a more ``bursty'' workload where new work is finely divided among many threads which race to completion. This type of workload would only see a peak of contention close to the end of the work, but no sustained contention. Very fine-grained pipelines are less ``bursty'', these may lead to more sustained contention. However, they could also easily benefit from a direct hand-off strategy which would circumvent the problem entirely.
+\subsection{Dynamic Resizing}
+The \CFA runtime system currently handles dynamically adding and removing processors from clusters at any time. Since this is part of the existing design, the proposed scheduler must also support this behaviour. However, dynamically resizing the clusters is considered a rare event associated with setup, teardown and major configuration changes. This assumption is made both in the design of the proposed scheduler as well as in the original design of the \CFA runtime system. As such, the proposed scheduler must honor the correctness of this behaviour but does not have any performance objectives with regards to resizing a cluster. How long adding or removing processors takes and how much this disrupts the performance of other threads is considered a secondary concern since it should be amortized over long periods of time. This description effectively matches with the description of a Reader-Writer lock, infrequent but invasive updates among frequent (mostly) read operations. In the case of the Ready-Queue described above, read operations are operations that push or pop from the ready-queue but do not invalidate any references to the ready queue data structures. Writes on the other-hand would add or remove inner queues, invalidating references to the array of inner queues in the process. Therefore, the current proposed approach to this problem is to add a per-cluster Reader Writer lock around the ready queue to prevent restructuring of the ready-queue data structure while threads are being pushed or popped.
+There are possible alternatives to the Reader Writer lock solution. This problem is effectively a memory reclamation problem and as such there is a large body of research on the subject. However, the RWlock solution is simple and can be leveraged to solve other problems (e.g. processor ordering and memory reclamation of threads) which makes it an attractive solution.
+How much scalability is actually needed is highly debatable.
+\emph{libfibre}~\cite{libfibre} has compared favourably to other schedulers in webserver tests~\cite{Karsten20} and uses a single atomic counter in its scheduling algorithm similarly to the proposed bitmask.
+As such, the single atomic instruction on a shared cacheline may be sufficiently performant.
+I have built a prototype of this ready queue in the shape of a data queue, \ie nodes on the queue are structures with a single $int$ representing a thread and intrusive data fields.
+Using this prototype, preliminary performance experiments confirm the expected performance in Table~\ref{tab:perfcases}.
+However, these experiments only offer a hint at the actual performance of the scheduler since threads are involved in more complex operations, \eg threads are not independent of each other: when a thread blocks some other thread must intervene to wake it.
+I have also integrated this prototype into the \CFA runtime, but have not yet created performance experiments to compare results, as creating one-to-one comparisons between the prototype and the \CFA runtime will be complex.
+\subsection{Dynamic Resizing} \label{sec:resize}
+\begin{figure}
+        \begin{center}
+                \input{system.pstex_t}
+        \end{center}
+        \caption{Global structure of the \CFA runtime system.}
+        \label{fig:system}
+\end{figure}
+The \CFA runtime system groups processors together as \newterm{clusters}, as shown in Figure~\ref{fig:system}.
+Threads on a cluster are always scheduled on one of the processors of the cluster.
+Currently, the runtime handles dynamically adding and removing processors from clusters at any time.
+Since this feature is part of the existing design, the proposed scheduler must also support this behaviour.
+However, dynamically resizing a cluster is considered a rare event associated with setup, tear down and major configuration changes.
+This assumption is made both in the design of the proposed scheduler as well as in the original design of the \CFA runtime system.
+As such, the proposed scheduler must honour the correctness of this behaviour but does not have any performance objectives with regard to resizing a cluster.
+That is, the time to add or remove processors and how much this disrupts the performance of other threads is considered a secondary concern since it should be amortized over long periods of time.
+However, as mentioned in Section~\ref{sec:queue}, contention on the underlying queues can have a direct impact on performance.
+The number of underlying queues must therefore be adjusted as the number of processors grows or shrinks.
+Since the underlying queues are stored in a dense array, changing the number of queues requires resizing the array and expanding the array requires moving it, which can introduce memory reclamation problems if not done correctly.
+\begin{figure}
+        \begin{center}
+                \input{resize}
+        \end{center}
+        \caption{Copy of data structure shown in Figure~\ref{fig:base}.}
+        \label{fig:base2}
+\end{figure}
+It is important to note how the array is used in this case.
+While the array cells are modified by every push and pop operation, the array itself, \ie the pointer that would change when resized, is only read during these operations.
+Therefore the use of this pointer can be described as frequent reads and infrequent writes.
+This description effectively matches that of a reader-writer lock: infrequent but invasive updates among frequent read operations.
+In the case of the ready queue described above, read operations are operations that push or pop from the ready queue but do not invalidate any references to the ready queue data structures.
+Writes, on the other hand, would add or remove inner queues, invalidating references to the array of inner queues in the process.
+Therefore, the current proposed approach to this problem is to add a per-cluster reader-writer lock around the ready queue to prevent restructuring of the ready-queue data-structure while threads are being pushed or popped.
+There are possible alternatives to the reader-writer lock solution.
+This problem is effectively a memory reclamation problem and as such there is a large body of research on the subject~\cite{brown2015reclaiming, michael2004hazard}.
+However, the reader-writer lock solution is simple and can be leveraged to solve other problems (\eg processor ordering and memory reclamation of threads), which makes it an attractive solution.
 \paragraph{Objectives and Existing Work}
+The lock must offer scalability and performance on par with the actual ready-queue in order not to introduce a new bottleneck. I have already built a lock that fits the desired requirements and preliminary testing shows scalability and performance that exceed the target. As such, I do not consider this lock to be a risk for this project.
+\subsection{Idle Sleep} \label{sleep}
+As mentionned above, idle sleep is the process of putting processors to sleep while they do not have threads to execute. In this context processors are kernel-threads and sleeping refers to asking the kernel to block a thread. This can be achieved with either thread synchronization operations like pthread\_cond\_wait or using signal operations like sigsuspend.
+Support for idle sleep broadly involves calling the operating system to block the kernel thread but also handling the race between the sleeping and the waking up, and handling which kernel thread should sleep or wake-up.
+When a processor decides to sleep, there is a race that occurs between it signalling that it will go to sleep (so other processors can find sleeping processors) and actually blocking the kernel thread. This is equivalent to the classic problem of missing signals when using condition variables, the ``sleepy'' processor indicates that it will sleep but has not yet gone to sleep, if another processor attempts to wake it up, the waking-up operation may claim nothing needs to be done and the signal will have been missed. In cases where threads are scheduled from processors on the current cluster, loosing signals is not necessarily critical, because at least some processors on the cluster are awake. Individual processors always finish shceduling threads before looking for new work, which means that the last processor to go to sleep cannot miss threads scheduled from inside the cluster (if they do, that demonstrates the ready-queue is not linearizable). However, this guarantee does not hold if threads are shceduled from outside the cluster, either due to an external event like timers and I/O, or due to a thread migrating from a different cluster. In this case, missed signals can lead to the cluster deadlocking where it should not\footnote{Clusters ``should'' never deadlock, but for this proposal, cases where \CFA users \emph{actually} wrote \CFA code that leads to a deadlock it is considered as a deadlock that ``should'' happen. }. Therefore, it is important that the scheduling of threads include a mechanism where signals \emph{cannot} be missed. For performance reasons, it can be advantageous to have a secondary mechanism that allows signals to be missed in cases where it cannot lead to a deadlock. 
To be safe, this process must include a ``handshake'' where it is guaranteed that either~: the sleepy processor notices that a thread was scheduled after it signalled its intent to block or code scheduling threads well see the intent to sleep before scheduling and be able to wake-up the processor. This matter is complicated by the fact that pthread offers few tools to implement this solution and offers no guarantee of ordering of threads waking up for most of these tools.
+Another issues is trying to avoid kernel sleeping and waking frequently. A possible partial solution is to order the processors so that the one which most recently went to sleep is woken up. This allows other sleeping processors to reach deeper sleep state (when these are available) while keeping ``hot'' processors warmer. Note that while this generally means organising the processors in a stack, I believe that the unique index provided by the ReaderWriter lock can be reused to strictly order the waking order of processors, causing a LIFO like waking order. While a strict LIFO stack is probably better, using the processor index could proove useful and offer a sufficiently LIFO ordering.
+Finally, another important aspect of Idle Sleep is when should processors make the decision to sleep and when it is appropriate for sleeping processors to be woken up. Processors that are unnecessarily awake lead to unnecessary contention and power consumption, while too many sleeping processors can lead to sub-optimal throughput. Furthermore, transitions from sleeping to awake and vice-versa also add unnecessary latency. There is already a wealth of research on the subject and I do not plan to implement a novel idea for the Idle Sleep heuristic in this project.
+The lock must offer scalability and performance on par with the actual ready queue in order not to introduce a new bottleneck.
+I have already built a lock that fits the desired requirements and preliminary testing shows scalability and performance that exceed the target.
+As such, I do not consider this lock to be a risk for this project.
+\subsection{Idle Sleep} \label{sec:sleep}
+\newterm{Idle sleep} is the process of putting processors to sleep when they have no threads to execute.
+In this context, processors are kernel threads and sleeping refers to asking the kernel to block a thread.
+This operation can be achieved with either thread synchronization operations like $pthread_cond_wait$ or using signal operations like $sigsuspend$.
+The goals of putting idle processors to sleep are to:
+\begin{enumerate}
+\item
+reduce contention on the ready queue, since the otherwise idle processors generally contend trying to pop items from the queue,
+\item
+give back unneeded CPU time associated with a process to other user processes executing on the computer,
+\item
+and reduce energy consumption in cases where more idle kernel-threads translate into idle CPUs, which can cycle down.
+\end{enumerate}
+Support for idle sleep broadly involves calling the operating system to block the kernel thread and handling the race between a blocking thread and the waking thread, and handling which kernel thread should sleep or wake up.
+When a processor decides to sleep, there is a race that occurs between it signalling that it is going to sleep (so other processors can find sleeping processors) and actually blocking the kernel thread.
+This operation is equivalent to the classic problem of missing signals when using condition variables: the ``sleepy'' processor indicates its intention to block but has not yet gone to sleep when another processor attempts to wake it up.
+The waking-up operation sees the blocked processor and signals it, but the blocking processor is racing to sleep, so the signal is missed.
+In cases where kernel threads are managed as processors on the current cluster, losing signals is not necessarily critical, because at least some processors on the cluster are awake and may check for more processors eventually.
+Individual processors always finish scheduling user threads before looking for new work, which means that the last processor to go to sleep cannot miss threads scheduled from inside the cluster (if they do, that demonstrates the ready queue is not linearizable).
+However, this guarantee does not hold if threads are scheduled from outside the cluster, either due to an external event like timers and I/O, or due to a user (or kernel) thread migrating from a different cluster.
+In this case, missed signals can lead to the cluster deadlocking\footnote{Clusters should only deadlock in cases where a \CFA programmer \emph{actually} writes \CFA code that leads to a deadlock.}.
+Therefore, it is important that the scheduling of threads include a mechanism where signals \emph{cannot} be missed.
+For performance reasons, it can be advantageous to have a secondary mechanism that allows signals to be missed in cases where it cannot lead to a deadlock.
+To be safe, this process must include a ``handshake'' where it is guaranteed that either:
+\begin{enumerate}
+\item
+the sleeping processor notices that a user thread is scheduled after the sleeping processor signalled its intent to block or
+\item
+code scheduling threads sees the intent to sleep before scheduling and is able to wake up the processor.
+\end{enumerate}
+This matter is complicated by the fact that pthreads and Linux offer few tools to implement this solution and no guarantee of ordering of threads waking up for most of these tools.
+Another important issue is avoiding kernel threads sleeping and waking frequently because there is a significant operating-system cost.
+This scenario happens when a program oscillates between high and low activity, needing most and then few processors.
+A possible partial solution is to order the processors so that the one which most recently went to sleep is woken up.
+This allows other sleeping processors to reach deeper sleep states (when these are available) while keeping ``hot'' processors warmer.
+Note that while this generally means organizing the processors in a stack, I believe that the unique index provided in my reader-writer lock can be reused to strictly order the waking processors, causing a mostly LIFO order.
+While a strict LIFO stack is probably better, the processor index could prove useful for other reasons, while still offering a sufficiently LIFO ordering.
+A final important aspect of idle sleep is when processors should make the decision to sleep and when it is appropriate for sleeping processors to be woken up.
+Processors that are unnecessarily unblocked lead to unnecessary contention, CPU usage, and power consumption, while too many sleeping processors can lead to suboptimal throughput.
+Furthermore, transitions from sleeping to awake and vice versa also add unnecessary latency.
+There is already a wealth of research on the subject~\cite{schillings1996engineering, wiki:thunderherd} and I may use an existing approach for the idle-sleep heuristic in this project, \eg~\cite{Karsten20}.
 \subsection{Asynchronous I/O}
+The final aspect of this proposal is asynchronous I/O. Without it, user threads that execute I/O operations will block the underlying kernel thread. This leads to poor throughput, it would be preferrable to block the user-thread and reuse the underlying kernel-thread to run other ready threads. This requires intercepting the user-threads' calls to I/O operations, redirecting them to an asynchronous I/O interface and handling the multiplexing between the synchronous and asynchronous API. As such, these are the three components needed to implemented to support asynchronous I/O : an OS abstraction layer over the asynchronous interface, an event-engine to (de)multiplex the operations and a synchronous interface for users to use. None of these components currently exist in \CFA and I will need to build all three for this project.
+\paragraph{OS Abstraction}
+One of the fundamental part of this converting blocking I/O operations into non-blocking ones. This relies on having an underlying asynchronous I/O interface to which to direct the I/O operations. While there exists many different APIs for asynchronous I/O, it is not part of this proposal to create a novel API, simply to use an existing one that is sufficient. uC++ uses the \texttt{select} as its interface, which handles pipes and sockets. It entails significant complexity and has performances problems which make it a less interesting alternative. Another interface which is becoming popular recently\cit is \texttt{epoll}. However, epoll also does not handle file system and seems to have problem to linux pipes and \texttt{TTY}s\cit. A very recent alternative that must still be investigated is \texttt{io\_uring}. It claims to address some of the issues with \texttt{epoll} but is too recent to be confident that it does. Finally, a popular cross-platform alternative is \texttt{libuv}, which offers asynchronous sockets and asynchronous file system operations (among other features). However, as a full-featured library it includes much more than what is needed and could conflict with other features of \CFA unless significant efforts are made to merge them together.
+\paragraph{Event-Engine}
+Laying on top of the asynchronous interface layer is the event-engine. This engine is responsible for multiplexing (batching) the synchronous I/O requests into an asynchronous I/O request and demultiplexing the results onto appropriate blocked threads. This can be straightforward for the simple cases, but can become quite complex. Decisions that will need to be made include : whether to poll from a seperate kernel thread or a regularly scheduled user thread, what should be the ordering used when results satisfy many requests, how to handle threads waiting for multiple operations, etc.
+The final aspect of this proposal is asynchronous I/O.
+Without it, user threads that execute I/O operations block the underlying kernel thread, which leads to poor throughput.
+It is preferable to block the user thread performing the I/O and reuse the underlying kernel-thread to run other ready user threads.
+This approach requires intercepting user-thread calls to I/O operations, redirecting them to an asynchronous I/O interface, and handling the multiplexing/demultiplexing between the synchronous and asynchronous API.
+As such, there are three components needed to implement support for asynchronous I/O:
+\begin{enumerate}
+\item
+an OS abstraction layer over the asynchronous interface,
+\item
+an event-engine to (de)multiplex the operations,
+\item
+and a synchronous interface for users.
+\end{enumerate}
+None of these components currently exist in \CFA and I will need to build all three for this project.
+\paragraph{OS Asynchronous Abstraction}
+One fundamental part of converting blocking I/O operations into non-blocking ones is having an underlying asynchronous I/O interface to which the I/O operations can be directed.
+While there exist many different APIs for asynchronous I/O, it is not part of this proposal to create a novel API.
+It is sufficient to make one work in the complex context of the \CFA runtime.
+\uC uses $select$~\cite{select} as its interface, which handles ttys, pipes and sockets, but not disk.
+$select$ entails significant complexity and is being replaced in UNIX operating systems, which makes it a less interesting alternative.
+Another popular interface is $epoll$~\cite{epoll}, which is supposed to be cheaper than $select$.
+However, $epoll$ also does not handle the file system and anecdotal evidence suggests it has problems with Linux pipes and ttys.
+A popular cross-platform alternative is $libuv$~\cite{libuv}, which offers asynchronous sockets and asynchronous file system operations (among other features).
+However, as a full-featured library it includes much more than I need and could conflict with other features of \CFA unless significant effort is made to merge them together.
+A very recent alternative that I am investigating is $io_uring$~\cite{io_uring}.
+It claims to address some of the issues with $epoll$ and my early investigation suggests that the claim is accurate.
+$io_uring$ uses a much more general approach where system calls are registered to a queue and later executed by the kernel, rather than relying on system calls to support returning an error instead of blocking.
+I believe this approach allows for fewer problems, \eg the manpage for $open$~\cite{open} states:
+\begin{quote}
+Note that [the $O_NONBLOCK$ flag] has no effect for regular files and block devices;
+that is, I/O operations will (briefly) block when device activity is required, regardless of whether $O_NONBLOCK$ is set.
+Since $O_NONBLOCK$ semantics might eventually be implemented, applications should not depend upon blocking behaviour when specifying this flag for regular files and block devices.
+\end{quote}
+This makes approaches based on $select$/$epoll$ less reliable since they may not work for every file descriptor.
+For this reason, I plan to use $io_uring$ as the OS abstraction for the \CFA runtime unless further work encounters a fatal problem.
+However, only a small subset of the features are available in Ubuntu as of April 2020~\cite{wiki:ubuntu-linux}, which will limit performance comparisons.
+I do not believe this will affect the comparison result.
+\paragraph{Event Engine}
+Above the OS asynchronous abstraction is the event engine.
+This engine is responsible for multiplexing (batching) the synchronous I/O requests into asynchronous I/O requests and demultiplexing the results to appropriate blocked user threads.
+This step can be straightforward for simple cases, but becomes quite complex when there are thousands of user threads performing both reads and writes, possibly on overlapping file descriptors.
+Decisions that need to be made include:
+\begin{enumerate}
+\item
+whether to poll from a separate kernel thread or a regularly scheduled user thread,
+\item
+what should be the ordering used when results satisfy many requests,
+\item
+how to handle threads waiting for multiple operations, etc.
+\end{enumerate}
 \paragraph{Interface}
+Finally, for these components to be available, it is necessary to expose them through a synchronous interface. This can be a novel interface but it is preferable to attempt to intercept the existing POSIX interface in order to be compatible with existing code. This will allow C programs written using this interface to be transparently converted to \CFA with minimal effort. Where this is not applicable, a novel interface will be created to fill the gaps.
+Finally, for these non-blocking I/O components to be available, it is necessary to expose them through a synchronous interface because that is the \CFA concurrent programming style.
+The interface can be novel but it is preferable to match the existing POSIX interface when possible to be compatible with existing code.
+Matching allows C programs written using this interface to be transparently converted to \CFA with minimal effort.
+Where new functionality is needed, I will add novel interface extensions to fill gaps and provide advanced features.
 …
 % ===============================================================================
 \section{Discussion}
+I believe that runtime systems and scheduling are still open topics.
+Many ``state of the art'' production frameworks still use single-threaded event loops because of performance considerations, \eg~\cite{nginx-design}, and, to my knowledge, no widely available system language offers modern threading facilities.
+I believe the proposed work offers a novel runtime and scheduling package, where existing work only offers fragments that users must assemble themselves when possible.
 % ===============================================================================
 % ===============================================================================
 \section{Timeline}
+\cleardoublepage
+\begin{center}
+\begin{tabular}{ | r @{--} l | p{4in} | }
+\hline May 2020 & October 2020   & Creation of the performance benchmark. \\
+\hline November 2020 & March 2021   & Completion of the implementation. \\
+\hline March 2021 & April 2021  & Final Performance experiments. \\
+\hline May 2021 & August 2021 & Thesis writing and defence. \\
+\hline
+\end{tabular}
+\end{center}
 % B I B L I O G R A P H Y
 % -----------------------------
+\addcontentsline{toc}{chapter}{Bibliography}
+\cleardoublepage
+\phantomsection         % allows hyperref to link to the correct page
+\addcontentsline{toc}{section}{\refname}
 \bibliographystyle{plain}
 \bibliography{pl,local}
+% G L O S S A R Y
+% -----------------------------
 \cleardoublepage
 \phantomsection         % allows hyperref to link to the correct page
+% G L O S S A R Y
+% -----------------------------
+\addcontentsline{toc}{chapter}{Glossary}
+\addcontentsline{toc}{section}{Glossary}
 \printglossary
-\cleardoublepage
-\phantomsection         % allows hyperref to link to the correct page
 \end{document}

doc/theses/thierry_delisle_PhD/comp_II/local.bib

-              r3c64c668
+              r58fe85a
 @article{finkel1987dib,
   title={DIB—a distributed implementation of backtracking},
+  title={DIB-a distributed implementation of backtracking},
   author={Finkel, Raphael and Manber, Udi},
   journal={ACM Transactions on Programming Languages and Systems (TOPLAS)},
 …
   organization={ACM}
+}
+% ===============================================================================
+% Algorithms
+% ===============================================================================
+@article{michael2004hazard,
+  title={Hazard pointers: Safe memory reclamation for lock-free objects},
+  author={Michael, Maged M},
+  journal={IEEE Transactions on Parallel and Distributed Systems},
+  volume={15},
+  number={6},
+  pages={491--504},
+  year={2004},
+  publisher={IEEE}
+}
+@inproceedings{brown2015reclaiming,
+  title={Reclaiming memory for lock-free data structures: There has to be a better way},
+  author={Brown, Trevor Alexander},
+  booktitle={Proceedings of the 2015 ACM Symposium on Principles of Distributed Computing},
+  pages={261--270},
+  year={2015}
+}
+% Trevor's relaxed FIFO list
+@inproceedings{alistarh2018relaxed,
+  title={Relaxed schedulers can efficiently parallelize iterative algorithms},
+  author={Alistarh, Dan and Brown, Trevor and Kopinsky, Justin and Nadiradze, Giorgi},
+  booktitle={Proceedings of the 2018 ACM Symposium on Principles of Distributed Computing},
+  pages={377--386},
+  year={2018}
+}
+% Scalable counters which only support is !0
+@inproceedings{ellen2007snzi,
+  title={SNZI: Scalable nonzero indicators},
+  author={Ellen, Faith and Lev, Yossi and Luchangco, Victor and Moir, Mark},
+  booktitle={Proceedings of the twenty-sixth annual ACM symposium on Principles of distributed computing},
+  pages={13--22},
+  year={2007}
+}
+% ===============================================================================
+% Linux Man Pages
+% ===============================================================================
+@manual{open,
+  key        = "open",
+  title      = "open(2) Linux User's Manual",
+  year       = "2020",
+  month      = "February",
+}
+@manual{epoll,
+  key        = "epoll",
+  title      = "epoll(7) Linux User's Manual",
+  year       = "2019",
+  month      = "March",
+}
+@manual{select,
+  key        = "select",
+  title      = "select(2) Linux User's Manual",
+  year       = "2019",
+  month      = "March",
+}
+@misc{io_uring,
+  title   = {Efficient IO with io\_uring},
+  author  = {Axboe, Jens},
+  year    = "2019",
+  month   = "March",
+  version = {0.4},
+  howpublished = {\url{https://kernel.dk/io_uring.pdf}}
+}
+@misc{libuv,
+  key   = "libuv",
+  title = {libuv},
+  howpublished = {\url{https://github.com/libuv/libuv}}
+}
+% ===============================================================================
+% MISC
+% ===============================================================================
+@misc{nginx-design,
+  key   = "nginx",
+  title={Inside {NGINX}: How We Designed for Performance \& Scale},
+  howpublished= {\href{https://www.nginx.com/blog/inside-nginx-how-we-designed-for-performance-scale}
+                {https://\-www.nginx.com/\-blog/\-inside\--nginx\--how\--we\--designed\--for\--performance\--scale}},
+}
+@article{schillings1996engineering,
+  title={Be engineering insights: Benaphores},
+  author={Schillings, Benoit},
+  journal={Be Newsletters},
+  volume={1},
+  number={26},
+  year={1996}
+}
+@misc{wiki:thunderherd,
+   author = "{Wikipedia contributors}",
+   title = "Thundering herd problem --- {W}ikipedia{,} The Free Encyclopedia",
+   year = "2020",
+   howpublished = {\href{https://en.wikipedia.org/wiki/Thundering_herd_problem}
+                  {https://\-en.wikipedia.org/\-wiki/\-Thundering\_herd\_problem}},
+   note = "[Online; accessed 14-April-2020]"
+}
+@misc{wiki:ubuntu-linux,
+   author = "{Wikipedia contributors}",
+   title = "Ubuntu version history : Table of versions --- {W}ikipedia{,} The Free Encyclopedia",
+   year = "2020",
+   howpublished = {\href{https://en.wikipedia.org/wiki/Ubuntu_version_history\#Table_of_versions}
+                  {https://\-en.wikipedia.org/\-wiki/\-Ubuntu\_version\_history\#Table\_of\_versions}},
+   note = "[Online; accessed 15-April-2020]"
+}

doc/user/Makefile

r3c64c668	r58fe85a
55	55
56	56	${DOCUMENT} : ${BASE}.ps
57		ps2pdf $<
	57	ps2pdf -dPDFSETTINGS=/prepress $<
58	58
59	59	${BASE}.ps : ${BASE}.dvi

doc/user/user.tex

-              r3c64c668
+              r58fe85a
 %% Created On       : Wed Apr  6 14:53:29 2016
 %% Last Modified By : Peter A. Buhr
 %% Last Modified On : Sat Jul 13 18:36:18 2019
 %% Update Count     : 3876
+%% Last Modified On : Mon Oct  5 08:57:29 2020
+%% Update Count     : 3998
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 …
 \usepackage{upquote}                                                                    % switch curled `'" to straight
 \usepackage{calc}
-\usepackage{xspace}
 \usepackage{varioref}                                                                   % extended references
+\usepackage{listings}                                                                   % format program code
+\usepackage[labelformat=simple,aboveskip=0pt,farskip=0pt]{subfig}
+\renewcommand{\thesubfigure}{\alph{subfigure})}
 \usepackage[flushmargin]{footmisc}                                              % support label/reference in footnote
 \usepackage{latexsym}                                   % \Box glyph
 \usepackage{mathptmx}                                   % better math font with "times"
 \usepackage[usenames]{color}
+\input{common}                                          % common CFA document macros
+\usepackage[dvips,plainpages=false,pdfpagelabels,pdfpagemode=UseNone,colorlinks=true,pagebackref=true,linkcolor=blue,citecolor=blue,urlcolor=blue,pagebackref=true,breaklinks=true]{hyperref}
+\usepackage{breakurl}
+\usepackage[pagewise]{lineno}
+\renewcommand{\linenumberfont}{\scriptsize\sffamily}
+\usepackage[firstpage]{draftwatermark}
+\SetWatermarkLightness{0.9}
+% Default underscore is too low and wide. Cannot use lstlisting "literate" as replacing underscore
+% removes it as a variable-name character so keywords in variables are highlighted. MUST APPEAR
+% AFTER HYPERREF.
+\renewcommand{\textunderscore}{\leavevmode\makebox[1.2ex][c]{\rule{1ex}{0.075ex}}}
+\setlength{\topmargin}{-0.45in}                                                 % move running title into header
+\setlength{\headsep}{0.25in}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\CFAStyle                                                                                               % use default CFA format-style
+\lstnewenvironment{C++}[1][]                            % use C++ style
+{\lstset{language=C++,moredelim=**[is][\protect\color{red}]{®}{®},#1}}
+{}
+\newcommand{\CFALatin}{}
 % inline code ©...© (copyright symbol) emacs: C-q M-)
 % red highlighting ®...® (registered trademark symbol) emacs: C-q M-.
 …
 % keyword escape ¶...¶ (pilcrow symbol) emacs: C-q M-^
 % math escape $...$ (dollar symbol)
+\input{common}                                          % common CFA document macros
+\usepackage[dvips,plainpages=false,pdfpagelabels,pdfpagemode=UseNone,colorlinks=true,pagebackref=true,linkcolor=blue,citecolor=blue,urlcolor=blue,pagebackref=true,breaklinks=true]{hyperref}
+\usepackage{breakurl}
+\renewcommand\footnoterule{\kern -3pt\rule{0.3\linewidth}{0.15pt}\kern 2pt}
+\usepackage[pagewise]{lineno}
+\renewcommand{\linenumberfont}{\scriptsize\sffamily}
+\usepackage[firstpage]{draftwatermark}
+\SetWatermarkLightness{0.9}
+% Default underscore is too low and wide. Cannot use lstlisting "literate" as replacing underscore
+% removes it as a variable-name character so keywords in variables are highlighted. MUST APPEAR
+% AFTER HYPERREF.
+\renewcommand{\textunderscore}{\leavevmode\makebox[1.2ex][c]{\rule{1ex}{0.075ex}}}
+\setlength{\topmargin}{-0.45in}                                                 % move running title into header
+\setlength{\headsep}{0.25in}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\CFAStyle                                                                                               % use default CFA format-style
+\lstnewenvironment{C++}[1][]                            % use C++ style
+{\lstset{language=C++,moredelim=**[is][\protect\color{red}]{®}{®},#1}}
+{}
+\newsavebox{\myboxA}
+\newsavebox{\myboxB}
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 …
 \newcommand{\G}[1]{{\Textbf[OliveGreen]{#1}}}
 \newcommand{\KWC}{K-W C\xspace}
-\newsavebox{\LstBox}
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 …
 Even with all its problems, C continues to be popular because it allows writing software at virtually any level in a computer system without restriction.
 For system programming, where direct access to hardware, storage management, and real-time issues are a requirement, C is usually the only language of choice.
 The TIOBE index~\cite{TIOBE} for July 2018 ranks the top five most \emph{popular} programming languages as \Index*{Java} 16\%, C 14\%, \Index*[C++]{\CC{}} 7.5\%, Python 6\%, Visual Basic 4\% = 47.5\%, where the next 50 languages are less than 4\% each, with a long tail.
 The top 3 rankings over the past 30 years are:
+The TIOBE index~\cite{TIOBE} for February 2020 ranks the top six most \emph{popular} programming languages as \Index*{Java} 17.4\%, C 16.8\%, Python 9.3\%, \Index*[C++]{\CC{}} 6.2\%, \Csharp 5.9\%, Visual Basic 5.9\% = 61.5\%, where the next 50 languages are less than 2\% each, with a long tail.
+The top 4 rankings over the past 35 years are:
 \begin{center}
 \setlength{\tabcolsep}{10pt}
+\begin{tabular}{@{}rccccccc@{}}
+                & 2018  & 2013  & 2008  & 2003  & 1998  & 1993  & 1988  \\ \hline
+Java    & 1             & 2             & 1             & 1             & 16    & -             & -             \\
+\R{C}   & \R{2} & \R{1} & \R{2} & \R{2} & \R{1} & \R{1} & \R{1} \\
+\CC             & 3             & 4             & 3             & 3             & 2             & 2             & 5             \\
+\begin{tabular}{@{}rcccccccc@{}}
+                & 2020  & 2015  & 2010  & 2005  & 2000  & 1995  & 1990  & 1985  \\ \hline
+Java    & 1             & 2             & 1             & 2             & 3             & -             & -             & -             \\
+\R{C}   & \R{2} & \R{1} & \R{2} & \R{1} & \R{1} & \R{2} & \R{1} & \R{1} \\
+Python  & 3             & 7             & 6             & 6             & 22    & 21    & -             & -             \\
+\CC             & 4             & 4             & 4             & 3             & 2             & 1             & 2             & 12    \\
 \end{tabular}
 \end{center}
 …
 The signature feature of \CFA is \emph{\Index{overload}able} \Index{parametric-polymorphic} functions~\cite{forceone:impl,Cormack90,Duggan96} with functions generalized using a ©forall© clause (giving the language its name):
 \begin{lstlisting}
+\begin{cfa}
 ®forall( otype T )® T identity( T val ) { return val; }
 int forty_two = identity( 42 ); §\C{// T is bound to int, forty\_two == 42}§
 \end{lstlisting}
+\end{cfa}
 % extending the C type system with parametric polymorphism and overloading, as opposed to the \Index*[C++]{\CC{}} approach of object-oriented extensions.
 \CFA{}\hspace{1pt}'s polymorphism was originally formalized by \Index*{Glen Ditchfield}\index{Ditchfield, Glen}~\cite{Ditchfield92}, and first implemented by \Index*{Richard Bilson}\index{Bilson, Richard}~\cite{Bilson03}.
 …
 \begin{comment}
 A simple example is leveraging the existing type-unsafe (©void *©) C ©bsearch© to binary search a sorted floating array:
 \begin{lstlisting}
+\begin{cfa}
 void * bsearch( const void * key, const void * base, size_t dim, size_t size,
                                 int (* compar)( const void *, const void * ));
 …
 double key = 5.0, vals[10] = { /* 10 sorted floating values */ };
 double * val = (double *)bsearch( &key, vals, 10, sizeof(vals[0]), comp ); §\C{// search sorted array}§
 \end{lstlisting}
+\end{cfa}
 which can be augmented simply with a polymorphic, type-safe, \CFA-overloaded wrappers:
 \begin{lstlisting}
+\begin{cfa}
 forall( otype T | { int ?<?( T, T ); } ) T * bsearch( T key, const T * arr, size_t size ) {
         int comp( const void * t1, const void * t2 ) { /* as above with double changed to T */ }
 …
 double * val = bsearch( 5.0, vals, 10 ); §\C{// selection based on return type}§
 int posn = bsearch( 5.0, vals, 10 );
 \end{lstlisting}
+\end{cfa}
 The nested function ©comp© provides the hidden interface from typed \CFA to untyped (©void *©) C, plus the cast of the result.
 Providing a hidden ©comp© function in \CC is awkward as lambdas do not use C calling-conventions and template declarations cannot appear at block scope.
 …
 \CFA has replacement libraries condensing hundreds of existing C functions into tens of \CFA overloaded functions, all without rewriting the actual computations.
 For example, it is possible to write a type-safe \CFA wrapper ©malloc© based on the C ©malloc©:
 \begin{lstlisting}
+\begin{cfa}
 forall( dtype T | sized(T) ) T * malloc( void ) { return (T *)malloc( sizeof(T) ); }
 int * ip = malloc(); §\C{// select type and size from left-hand side}§
 double * dp = malloc();
 struct S {...} * sp = malloc();
 \end{lstlisting}
+\end{cfa}
 where the return type supplies the type/size of the allocation, which is impossible in most type systems.
 \end{comment}
 …
 Keyword clashes are accommodated by syntactic transformations using the \CFA backquote escape-mechanism:
 \begin{cfa}
 int ®`®otype®`® = 3; §\C{// make keyword an identifier}§
 double ®`®forall®`® = 3.5;
+int ®``®otype = 3; §\C{// make keyword an identifier}§
+double ®``®forall = 3.5;
 \end{cfa}
 …
 // include file uses the CFA keyword "with".
 #if ! defined( with ) §\C{// nesting ?}§
 #define with ®`®with®`® §\C{// make keyword an identifier}§
+#define with ®``®with §\C{// make keyword an identifier}§
 #define __CFA_BFD_H__
 #endif
+®#include_next <bfdlink.h> §\C{// must have internal check for multiple expansion}§
+®
+§{\color{red}\#\textbf{include\_next} <bfdlink.h>}§ §\C{// must have internal check for multiple expansion}§
 #if defined( with ) && defined( __CFA_BFD_H__ ) §\C{// reset only if set}§
 #undef with
 …
 \section{Exponentiation Operator}
 C, \CC, and Java (and many other programming languages) have no exponentiation operator\index{exponentiation!operator}\index{operator!exponentiation}, \ie $x^y$, and instead use a routine, like \Indexc{pow}, to perform the exponentiation operation.
 \CFA extends the basic operators with the exponentiation operator ©?\?©\index{?\\?@©?\?©} and ©?\=?©\index{?\\=?@©\=?©}, as in, ©x \ y© and ©x \= y©, which means $x^y$ and $x \leftarrow x^y$.
+C, \CC, and Java (and many other programming languages) have no exponentiation operator\index{exponentiation!operator}\index{operator!exponentiation}, \ie $x^y$, and instead use a routine, like \Indexc{pow(x,y)}, to perform the exponentiation operation.
+\CFA extends the basic operators with the exponentiation operator ©?®\®?©\index{?\\?@©?®\®?©} and ©?®\®=?©\index{?\\=?@©?®\®=?©}, as in, ©x ®\® y© and ©x ®\®= y©, which means $x^y$ and $x \leftarrow x^y$.
 The priority of the exponentiation operator is between the cast and multiplicative operators, so that ©w * (int)x \ (int)y * z© is parenthesized as ©((w * (((int)x) \ ((int)y))) * z)©.
 As for \Index{division}, there are exponentiation operators for integral and floating types, including the builtin \Index{complex} types.
+There are exponentiation operators for integral and floating types, including the builtin \Index{complex} types.
 Integral exponentiation\index{exponentiation!unsigned integral} is performed with repeated multiplication\footnote{The multiplication computation is $O(\log y)$.} (or shifting if the base is 2).
 Overflow from large exponents or negative exponents return zero.
+Overflow for a large exponent or negative exponent returns zero.
 Floating exponentiation\index{exponentiation!floating} is performed using \Index{logarithm}s\index{exponentiation!logarithm}, so the exponent cannot be negative.
 \begin{cfa}
 …
 1 256 -64 125 ®0® 3273344365508751233 ®0® ®0® -0.015625 18.3791736799526 0.264715-1.1922i
 \end{cfa}
 Note, ©5 ®\® 32© and ©5L ®\® 64© overflow, and ©-4 ®\® -3© is a fraction but stored in an integer so all three computations generate an integral zero.
+Note, ©5 \ 32© and ©5L \ 64© overflow, and ©-4 \ -3© is a fraction but stored in an integer so all three computations generate an integral zero.
 Parentheses are necessary for complex constants or the expression is parsed as ©1.0f+®(®2.0fi \ 3.0f®)®+2.0fi©.
 The exponentiation operator is available for all the basic types, but for user-defined types, only the integral-computation version is available.
 …
 OT ?®\®?( OT ep, unsigned long int y );
 \end{cfa}
 The user type ©T© must define multiplication, one, ©1©, and, ©*©.
+The user type ©T© must define one (©1©) and multiplication (©*©).
 …
+\subsection{Loop Control}
+The ©for©/©while©/©do-while© loop-control allows empty or simplified ranges (see Figure~\ref{f:LoopControlExamples}).
+\begin{itemize}
+\item
+An empty conditional implies ©1©.
+\item
+The up-to range ©~©\index{~@©~©} means exclusive range [M,N).
+\item
+The up-to range ©~=©\index{~=@©~=©} means inclusive range [M,N].
+\item
+The down-to range ©-~©\index{-~@©-~©} means exclusive range [N,M).
+\item
+The down-to range ©-~=©\index{-~=@©-~=©} means inclusive range [N,M].
+\item
+©@© means put nothing in this field.
+\item
+©0© is the implicit start value;
+\item
+©1© is the implicit increment value.
+\item
+The up-to range uses ©+=© for increment;
+\item
+The down-to range uses ©-=© for decrement.
+\item
+The loop index is polymorphic in the type of the start value or comparison value when start is implicitly ©0©.
+\end{itemize}
+\begin{figure}
+%\section{\texorpdfstring{\protect\lstinline@case@ Clause}{case Clause}}
+\subsection{\texorpdfstring{\LstKeywordStyle{case} Clause}{case Clause}}
+C restricts the ©case© clause of a ©switch© statement to a single value.
+For multiple ©case© clauses associated with the same statement, it is necessary to have multiple ©case© clauses rather than multiple values.
+Requiring a ©case© clause for each value does not seem to be in the spirit of brevity normally associated with C.
+Therefore, the ©case© clause is extended with a list of values, as in:
 \begin{cquote}
+\begin{tabular}{@{}l|l@{}}
+\multicolumn{1}{c|}{loop control} & \multicolumn{1}{c}{output} \\
+\hline
+\begin{cfa}
+sout | nlOff;
+while ®()® { sout | "empty"; break; } sout | nl;
+do { sout | "empty"; break; } while ®()®; sout | nl;
+for ®()® { sout | "empty"; break; } sout | nl;
+for ( ®0® ) { sout | "A"; } sout | "zero" | nl;
+for ( ®1® ) { sout | "A"; } sout | nl;
+for ( ®10® ) { sout | "A"; } sout | nl;
+for ( ®1 ~= 10 ~ 2® ) { sout | "B"; } sout | nl;
+for ( ®10 -~= 1 ~ 2® ) { sout | "C"; } sout | nl;
+for ( ®0.5 ~ 5.5® ) { sout | "D"; } sout | nl;
+for ( ®5.5 -~ 0.5® ) { sout | "E"; } sout | nl;
+for ( ®i; 10® ) { sout | i; } sout | nl;
+for ( ®i; 1 ~= 10 ~ 2® ) { sout | i; } sout | nl;
+for ( ®i; 10 -~= 1 ~ 2® ) { sout | i; } sout | nl;
+for ( ®i; 0.5 ~ 5.5® ) { sout | i; } sout | nl;
+for ( ®i; 5.5 -~ 0.5® ) { sout | i; } sout | nl;
+for ( ®ui; 2u ~= 10u ~ 2u® ) { sout | ui; } sout | nl;
+for ( ®ui; 10u -~= 2u ~ 2u® ) { sout | ui; } sout | nl;
+enum { N = 10 };
+for ( ®N® ) { sout | "N"; } sout | nl;
+for ( ®i; N® ) { sout | i; } sout | nl;
+for ( ®i; N -~ 0® ) { sout | i; } sout | nl;
+const int start = 3, comp = 10, inc = 2;
+for ( ®i; start ~ comp ~ inc + 1® ) { sout | i; } sout | nl;
+for ( ®i; 1 ~ @® ) { if ( i > 10 ) break;
+        sout | i; } sout | nl;
+for ( ®i; 10 -~ @® ) { if ( i < 0 ) break;
+        sout | i; } sout | nl;
+for ( ®i; 2 ~ @ ~ 2® ) { if ( i > 10 ) break;
+        sout | i; } sout | nl;
+for ( ®i; 2.1 ~ @ ~ @® ) { if ( i > 10.5 ) break;
+        sout | i; i += 1.7; } sout | nl;
+for ( ®i; 10 -~ @ ~ 2® ) { if ( i < 0 ) break;
+        sout | i; } sout | nl;
+for ( ®i; 12.1 ~ @ ~ @® ) { if ( i < 2.5 ) break;
+        sout | i; i -= 1.7; } sout | nl;
+for ( ®i; 5 : j; -5 ~ @® ) { sout | i | j; } sout | nl;
+for ( ®i; 5 : j; -5 -~ @® ) { sout | i | j; } sout | nl;
+for ( ®i; 5 : j; -5 ~ @ ~ 2® ) { sout | i | j; } sout | nl;
+for ( ®i; 5 : j; -5 -~ @ ~ 2® ) { sout | i | j; } sout | nl;
+for ( ®j; -5 ~ @ : i; 5® ) { sout | i | j; } sout | nl;
+for ( ®j; -5 -~ @ : i; 5® ) { sout | i | j; } sout | nl;
+for ( ®j; -5 ~ @ ~ 2 : i; 5® ) { sout | i | j; } sout | nl;
+for ( ®j; -5 -~ @ ~ 2 : i; 5® ) { sout | i | j; } sout | nl;
+for ( ®j; -5 -~ @ ~ 2 : i; 5 : k; 1.5 ~ @® ) {
+        sout | i | j | k; } sout | nl;
+for ( ®j; -5 -~ @ ~ 2 : k; 1.5 ~ @ : i; 5® ) {
+        sout | i | j | k; } sout | nl;
+for ( ®k; 1.5 ~ @ : j; -5 -~ @ ~ 2 : i; 5® ) {
+        sout | i | j | k; } sout | nl;
+\begin{tabular}{@{}l@{\hspace{3em}}l@{\hspace{2em}}l@{}}
+\multicolumn{1}{c@{\hspace{3em}}}{\textbf{\CFA}}        & \multicolumn{1}{c@{\hspace{2em}}}{\textbf{C}} \\
+\begin{cfa}
+switch ( i ) {
+  case ®1, 3, 5®:
+        ...
+  case ®2, 4, 6®:
+        ...
+}
 \end{cfa}
+&
 \begin{cfa}
+empty
+empty
+empty
+zero
+A
+A A A A A A A A A A
+B B B B B
+C C C C C
+D D D D D
+E E E E E
+0 1 2 3 4 5 6 7 8 9
+1 3 5 7 9
+10 8 6 4 2
+0.5 1.5 2.5 3.5 4.5
+5.5 4.5 3.5 2.5 1.5
+2 4 6 8 10
+10 8 6 4 2
+N N N N N N N N N N
+0 1 2 3 4 5 6 7 8 9
+10 9 8 7 6 5 4 3 2 1
+3 6 9
+1 2 3 4 5 6 7 8 9 10
+10 9 8 7 6 5 4 3 2 1 0
+2 4 6 8 10
+2.1 3.8 5.5 7.2 8.9
+10 8 6 4 2 0
+12.1 10.4 8.7 7 5.3 3.6
+0 -5 1 -4 2 -3 3 -2 4 -1
+0 -5 1 -6 2 -7 3 -8 4 -9
+0 -5 1 -3 2 -1 3 1 4 3
+0 -5 1 -7 2 -9 3 -11 4 -13
+0 -5 1 -4 2 -3 3 -2 4 -1
+0 -5 1 -6 2 -7 3 -8 4 -9
+0 -5 1 -3 2 -1 3 1 4 3
+0 -5 1 -7 2 -9 3 -11 4 -13
+0 -5 1.5 1 -7 2.5 2 -9 3.5 3 -11 4.5 4 -13 5.5
+0 -5 1.5 1 -7 2.5 2 -9 3.5 3 -11 4.5 4 -13 5.5
+0 -5 1.5 1 -7 2.5 2 -9 3.5 3 -11 4.5 4 -13 5.5
+switch ( i ) {
+  case 1: case 3 : case 5:
+        ...
+  case 2: case 4 : case 6:
+        ...
+}
+\end{cfa}
+&
+\begin{cfa}
+// odd values
+// even values
 \end{cfa}
 \end{tabular}
 \end{cquote}
+\caption{Loop Control Examples}
+\label{f:LoopControlExamples}
+\end{figure}
+In addition, subranges are allowed to specify case values.\footnote{
+gcc has the same mechanism but awkward syntax, \lstinline@2 ...42@, because a space is required after a number, otherwise the period is a decimal point.}
+\begin{cfa}
+switch ( i ) {
+  case ®1~5:® §\C{// 1, 2, 3, 4, 5}§
+        ...
+  case ®10~15:® §\C{// 10, 11, 12, 13, 14, 15}§
+        ...
+}
+\end{cfa}
+Lists of subranges are also allowed.
+\begin{cfa}
+case ®1~5, 12~21, 35~42®:
+\end{cfa}
 …
+%\section{\texorpdfstring{\protect\lstinline@case@ Clause}{case Clause}}
+\subsection{\texorpdfstring{\LstKeywordStyle{case} Statement}{case Statement}}
+C restricts the ©case© clause of a ©switch© statement to a single value.
+For multiple ©case© clauses associated with the same statement, it is necessary to have multiple ©case© clauses rather than multiple values.
+Requiring a ©case© clause for each value does not seem to be in the spirit of brevity normally associated with C.
+Therefore, the ©case© clause is extended with a list of values, as in:
+\begin{cquote}
+\begin{tabular}{@{}l@{\hspace{3em}}l@{\hspace{2em}}l@{}}
+\multicolumn{1}{c@{\hspace{3em}}}{\textbf{\CFA}}        & \multicolumn{1}{c@{\hspace{2em}}}{\textbf{C}} \\
+\begin{cfa}
+switch ( i ) {
+  case ®1, 3, 5®:
+\subsection{Non-terminating and Labelled \texorpdfstring{\LstKeywordStyle{fallthrough}}{Non-terminating and Labelled fallthrough}}
+The ©fallthrough© clause may be non-terminating within a ©case© clause or have a target label to common code from multiple case clauses.
+\begin{center}
+\begin{tabular}{@{}lll@{}}
+\begin{cfa}
+choose ( ... ) {
+  case 3:
+        if ( ... ) {
+                ... ®fallthru;® // goto case 4
+        } else {
+                ...
+        }
+        // implicit break
+  case 4:
+\end{cfa}
+&
+\begin{cfa}
+choose ( ... ) {
+  case 3:
+        ... ®fallthrough common;®
+  case 4:
+        ... ®fallthrough common;®
+  ®common:® // below fallthrough
+                          // at case-clause level
+        ...     // common code for cases 3/4
+        // implicit break
+  case 4:
+\end{cfa}
+&
+\begin{cfa}
+choose ( ... ) {
+  case 3:
+        choose ( ... ) {
+          case 4:
+                for ( ... ) {
+                        // multi-level transfer
+                        ... ®fallthru common;®
+                }
+                ...
+        }
         ...
+  case ®2, 4, 6®:
+        ...
+}
+  ®common:® // below fallthrough
+                          // at case-clause level
+\end{cfa}
+\end{tabular}
+\end{center}
+The target label must be below the ©fallthrough© and may not be nested in a control structure, and
+the target label must be at the same or higher level as the containing ©case© clause and located at
+the same level as a ©case© clause; the target label may be case ©default©, but only associated
+with the current ©switch©/©choose© statement.
+\begin{figure}
+\begin{tabular}{@{}l|l@{}}
+\multicolumn{1}{c|}{loop control} & \multicolumn{1}{c}{output} \\
+\hline
+\begin{cfa}[xleftmargin=0pt]
+while ®()® { sout | "empty"; break; }
+do { sout | "empty"; break; } while ®()®;
+for ®()® { sout | "empty"; break; }
+for ( ®0® ) { sout | "A"; } sout | "zero";
+for ( ®1® ) { sout | "A"; }
+for ( ®10® ) { sout | "A"; }
+for ( ®= 10® ) { sout | "A"; }
+for ( ®1 ~= 10 ~ 2® ) { sout | "B"; }
+for ( ®10 -~= 1 ~ 2® ) { sout | "C"; }
+for ( ®0.5 ~ 5.5® ) { sout | "D"; }
+for ( ®5.5 -~ 0.5® ) { sout | "E"; }
+for ( ®i; 10® ) { sout | i; }
+for ( ®i; = 10® ) { sout | i; }
+for ( ®i; 1 ~= 10 ~ 2® ) { sout | i; }
+for ( ®i; 10 -~= 1 ~ 2® ) { sout | i; }
+for ( ®i; 0.5 ~ 5.5® ) { sout | i; }
+for ( ®i; 5.5 -~ 0.5® ) { sout | i; }
+for ( ®ui; 2u ~= 10u ~ 2u® ) { sout | ui; }
+for ( ®ui; 10u -~= 2u ~ 2u® ) { sout | ui; }
+enum { N = 10 };
+for ( ®N® ) { sout | "N"; }
+for ( ®i; N® ) { sout | i; }
+for ( ®i; N -~ 0® ) { sout | i; }
+const int start = 3, comp = 10, inc = 2;
+for ( ®i; start ~ comp ~ inc + 1® ) { sout | i; }
+for ( i; 1 ~ ®@® ) { if ( i > 10 ) break; sout | i; }
+for ( i; 10 -~ ®@® ) { if ( i < 0 ) break; sout | i; }
+for ( i; 2 ~ ®@® ~ 2 ) { if ( i > 10 ) break; sout | i; }
+for ( i; 2.1 ~ ®@® ~ ®@® ) { if ( i > 10.5 ) break; sout | i; i += 1.7; }
+for ( i; 10 -~ ®@® ~ 2 ) { if ( i < 0 ) break; sout | i; }
+for ( i; 12.1 ~ ®@® ~ ®@® ) { if ( i < 2.5 ) break; sout | i; i -= 1.7; }
+for ( i; 5 ®:® j; -5 ~ @ ) { sout | i | j; }
+for ( i; 5 ®:® j; -5 -~ @ ) { sout | i | j; }
+for ( i; 5 ®:® j; -5 ~ @ ~ 2 ) { sout | i | j; }
+for ( i; 5 ®:® j; -5 -~ @ ~ 2 ) { sout | i | j; }
+for ( i; 5 ®:® j; -5 ~ @ ) { sout | i | j; }
+for ( i; 5 ®:® j; -5 -~ @ ) { sout | i | j; }
+for ( i; 5 ®:® j; -5 ~ @ ~ 2 ) { sout | i | j; }
+for ( i; 5 ®:® j; -5 -~ @ ~ 2 ) { sout | i | j; }
+for ( i; 5 ®:® j; -5 -~ @ ~ 2 ®:® k; 1.5 ~ @ ) { sout | i | j | k; }
+for ( i; 5 ®:® j; -5 -~ @ ~ 2 ®:® k; 1.5 ~ @ ) { sout | i | j | k; }
+for ( i; 5 ®:® k; 1.5 ~ @ ®:® j; -5 -~ @ ~ 2 ) { sout | i | j | k; }
 \end{cfa}
+&
 \begin{cfa}
+switch ( i ) {
+  case 1: case 3 : case 5:
+        ...
+  case 2: case 4 : case 6:
+        ...
+}
+\end{cfa}
+&
+\begin{cfa}
+// odd values
+// even values
+empty
+empty
+empty
+zero
+A
+A A A A A A A A A A
+A A A A A A A A A A A
+B B B B B
+C C C C C
+D D D D D
+E E E E E
+0 1 2 3 4 5 6 7 8 9
+0 1 2 3 4 5 6 7 8 9 10
+1 3 5 7 9
+10 8 6 4 2
+0.5 1.5 2.5 3.5 4.5
+5.5 4.5 3.5 2.5 1.5
+2 4 6 8 10
+10 8 6 4 2
+N N N N N N N N N N
+0 1 2 3 4 5 6 7 8 9
+10 9 8 7 6 5 4 3 2 1
+3 6 9
+1 2 3 4 5 6 7 8 9 10
+10 9 8 7 6 5 4 3 2 1 0
+2 4 6 8 10
+2.1 3.8 5.5 7.2 8.9
+10 8 6 4 2 0
+12.1 10.4 8.7 7. 5.3 3.6
+0 -5 1 -4 2 -3 3 -2 4 -1
+0 -5 1 -6 2 -7 3 -8 4 -9
+0 -5 1 -3 2 -1 3 1 4 3
+0 -5 1 -7 2 -9 3 -11 4 -13
+0 -5 1 -4 2 -3 3 -2 4 -1
+0 -5 1 -6 2 -7 3 -8 4 -9
+0 -5 1 -3 2 -1 3 1 4 3
+0 -5 1 -7 2 -9 3 -11 4 -13
+0 -5 1.5 1 -7 2.5 2 -9 3.5 3 -11 4.5 4 -13 5.5
+0 -5 1.5 1 -7 2.5 2 -9 3.5 3 -11 4.5 4 -13 5.5
+0 -5 1.5 1 -7 2.5 2 -9 3.5 3 -11 4.5 4 -13 5.5
 \end{cfa}
 \end{tabular}
+\end{cquote}
+In addition, subranges are allowed to specify case values.\footnote{
+gcc has the same mechanism but awkward syntax, \lstinline@2 ...42@, because a space is required after a number, otherwise the period is a decimal point.}
+\begin{cfa}
+switch ( i ) {
+  case ®1~5:® §\C{// 1, 2, 3, 4, 5}§
+        ...
+  case ®10~15:® §\C{// 10, 11, 12, 13, 14, 15}§
+        ...
+}
+\end{cfa}
+Lists of subranges are also allowed.
+\begin{cfa}
+case ®1~5, 12~21, 35~42®:
+\end{cfa}
+\caption{Loop Control Examples}
+\label{f:LoopControlExamples}
+\end{figure}
 % for ()  => for ( ;; )
 …
+\subsection{Loop Control}
+The ©for©/©while©/©do-while© loop-control allows empty or simplified ranges (see Figure~\ref{f:LoopControlExamples}).
+\begin{itemize}
+\item
+The loop index is polymorphic in the type of the comparison value N (when the start value is implicit) or the start value M.
+\item
+An empty conditional implies a comparison value of ©1© (true).
+\item
+A comparison N is implicit up-to exclusive range [0,N©®)®©.
+\item
+A comparison ©=© N is implicit up-to inclusive range [0,N©®]®©.
+\item
+The up-to range M ©~©\index{~@©~©} N means exclusive range [M,N©®)®©.
+\item
+The up-to range M ©~=©\index{~=@©~=©} N means inclusive range [M,N©®]®©.
+\item
+The down-to range M ©-~©\index{-~@©-~©} N means exclusive range [N,M©®)®©.
+\item
+The down-to range M ©-~=©\index{-~=@©-~=©} N means inclusive range [N,M©®]®©.
+\item
+©0© is the implicit start value;
+\item
+©1© is the implicit increment value.
+\item
+The up-to range uses operator ©+=© for increment;
+\item
+The down-to range uses operator ©-=© for decrement.
+\item
+©@© means put nothing in this field.
+\item
+©:© means start another index.
+\end{itemize}
 %\subsection{\texorpdfstring{Labelled \protect\lstinline@continue@ / \protect\lstinline@break@}{Labelled continue / break}}
 \subsection{\texorpdfstring{Labelled \LstKeywordStyle{continue} / \LstKeywordStyle{break} Statement}{Labelled continue / break Statement}}
 …
 for ©break©, the target label can also be associated with a ©switch©, ©if© or compound (©{}©) statement.
 \VRef[Figure]{f:MultiLevelExit} shows ©continue© and ©break© indicating the specific control structure, and the corresponding C program using only ©goto© and labels.
 The innermost loop has 7 exit points, which cause continuation or termination of one or more of the 7 \Index{nested control-structure}s.
+The innermost loop has 8 exit points, which cause continuation or termination of one or more of the 7 \Index{nested control-structure}s.
 \begin{figure}
+\begin{tabular}{@{\hspace{\parindentlnth}}l@{\hspace{\parindentlnth}}l@{\hspace{\parindentlnth}}l@{}}
+\multicolumn{1}{@{\hspace{\parindentlnth}}c@{\hspace{\parindentlnth}}}{\textbf{\CFA}}   & \multicolumn{1}{@{\hspace{\parindentlnth}}c}{\textbf{C}}      \\
+\begin{cfa}
+®LC:® {
+        ... §declarations§ ...
+        ®LS:® switch ( ... ) {
+          case 3:
+                ®LIF:® if ( ... ) {
+                        ®LF:® for ( ... ) {
+                                ®LW:® while ( ... ) {
+                                        ... break ®LC®; ...
+                                        ... break ®LS®; ...
+                                        ... break ®LIF®; ...
+                                        ... continue ®LF;® ...
+                                        ... break ®LF®; ...
+                                        ... continue ®LW®; ...
+                                        ... break ®LW®; ...
+                                } // while
+                        } // for
+                } else {
+                        ... break ®LIF®; ...
+                } // if
+        } // switch
+\centering
+\begin{lrbox}{\myboxA}
+\begin{cfa}[tabsize=3]
+®Compound:® {
+        ®Try:® try {
+                ®For:® for ( ... ) {
+                        ®While:® while ( ... ) {
+                                ®Do:® do {
+                                        ®If:® if ( ... ) {
+                                                ®Switch:® switch ( ... ) {
+                                                        case 3:
+                                                                ®break Compound®;
+                                                                ®break Try®;
+                                                                ®break For®;      /* or */  ®continue For®;
+                                                                ®break While®;  /* or */  ®continue While®;
+                                                                ®break Do®;      /* or */  ®continue Do®;
+                                                                ®break If®;
+                                                                ®break Switch®;
+                                                        } // switch
+                                                } else {
+                                                        ... ®break If®; ...     // terminate if
+                                                } // if
+                                } while ( ... ); // do
+                        } // while
+                } // for
+        } ®finally® { // always executed
+        } // try
 } // compound
 \end{cfa}
+&
+\begin{cfa}
+\end{lrbox}
+\begin{lrbox}{\myboxB}
+\begin{cfa}[tabsize=3]
+{
+        ... §declarations§ ...
+        switch ( ... ) {
+          case 3:
+                if ( ... ) {
+                        for ( ... ) {
+                                while ( ... ) {
+                                        ... goto ®LC®; ...
+                                        ... goto ®LS®; ...
+                                        ... goto ®LIF®; ...
+                                        ... goto ®LFC®; ...
+                                        ... goto ®LFB®; ...
+                                        ... goto ®LWC®; ...
+                                        ... goto ®LWB®; ...
+                                  ®LWC®: ; } ®LWB:® ;
+                          ®LFC:® ; } ®LFB:® ;
+                } else {
+                        ... goto ®LIF®; ...
+                } ®L3:® ;
+        } ®LS:® ;
+} ®LC:® ;
+\end{cfa}
+&
+\begin{cfa}
+// terminate compound
+// terminate switch
+// terminate if
+// continue loop
+// terminate loop
+// continue loop
+// terminate loop
+// terminate if
+\end{cfa}
+\end{tabular}
+                ®ForC:® for ( ... ) {
+                        ®WhileC:® while ( ... ) {
+                                ®DoC:® do {
+                                        if ( ... ) {
+                                                switch ( ... ) {
+                                                        case 3:
+                                                                ®goto Compound®;
+                                                                ®goto Try®;
+                                                                ®goto ForB®;      /* or */  ®goto ForC®;
+                                                                ®goto WhileB®;  /* or */  ®goto WhileC®;
+                                                                ®goto DoB®;      /* or */  ®goto DoC®;
+                                                                ®goto If®;
+                                                                ®goto Switch®;
+                                                        } ®Switch:® ;
+                                                } else {
+                                                        ... ®goto If®; ...      // terminate if
+                                                } ®If:®;
+                                } while ( ... ); ®DoB:® ;
+                        } ®WhileB:® ;
+                } ®ForB:® ;
+} ®Compound:® ;
+\end{cfa}
+\end{lrbox}
+\subfloat[\CFA]{\label{f:CFibonacci}\usebox\myboxA}
+\hspace{2pt}
+\vrule
+\hspace{2pt}
+\subfloat[C]{\label{f:CFAFibonacciGen}\usebox\myboxB}
 \caption{Multi-level Exit}
 \label{f:MultiLevelExit}
 …
 try {
         f(...);
 } catch( E e ; §boolean-predicate§ ) {          §\C[8cm]{// termination handler}§
+} catch( E e ; §boolean-predicate§ ) {          §\C{// termination handler}§
         // recover and continue
 } catchResume( E e ; §boolean-predicate§ ) { §\C{// resumption handler}\CRT§
+} catchResume( E e ; §boolean-predicate§ ) { §\C{// resumption handler}§
         // repair and return
 } finally {
 …
 For implicit formatted input, the common case is reading a sequence of values separated by whitespace, where the type of an input constant must match with the type of the input variable.
 \begin{cquote}
 \begin{lrbox}{\LstBox}
+\begin{lrbox}{\myboxA}
 \begin{cfa}[aboveskip=0pt,belowskip=0pt]
 int x;   double y   char z;
 …
 \end{lrbox}
 \begin{tabular}{@{}l@{\hspace{3em}}l@{\hspace{3em}}l@{}}
 \multicolumn{1}{@{}l@{}}{\usebox\LstBox} \\
+\multicolumn{1}{@{}l@{}}{\usebox\myboxA} \\
 \multicolumn{1}{c@{\hspace{2em}}}{\textbf{\CFA}}        & \multicolumn{1}{c@{\hspace{2em}}}{\textbf{\CC}}       & \multicolumn{1}{c}{\textbf{Python}}   \\
 \begin{cfa}[aboveskip=0pt,belowskip=0pt]
 …
 hence, names in these include files are not mangled\index{mangling!name} (see~\VRef{s:Interoperability}).
 All other C header files must be explicitly wrapped in ©extern "C"© to prevent name mangling.
 For \Index*[C++]{\CC{}}, the name-mangling issue is often handled internally in many C header-files through checks for preprocessor variable ©__cplusplus©, which adds appropriate ©extern "C"© qualifiers.
+This approach is different from \Index*[C++]{\CC{}} where the name-mangling issue is handled internally in C header-files through checks for preprocessor variable ©__cplusplus©, which adds appropriate ©extern "C"© qualifiers.
 …
 The storage-management routines extend their C equivalents by overloading, alternate names, providing shallow type-safety, and removing the need to specify the allocation size for non-array types.
 Storage management provides the following capabilities:
+C storage management provides the following capabilities:
 \begin{description}
 \item[fill]
 after allocation the storage is filled with a specified character.
+\item[filled]
+after allocation the storage is filled with a specified character or value.
 \item[resize]
 an existing allocation is decreased or increased in size.
 In either case, new storage may or may not be allocated and, if there is a new allocation, as much data from the existing allocation is copied.
+an existing allocation is decreased or increased in size.
+In either case, new storage may or may not be allocated and, if there is a new allocation, as much data from the existing allocation is copied into the new allocation.
 For an increase in storage size, new storage after the copied data may be filled.
 \item[alignment]
 an allocation starts on a specified memory boundary, \eg, an address multiple of 64 or 128 for cache-line purposes.
+\item[align]
+an allocation starts on a specified memory boundary, \eg, an address multiple of 64 or 128 for cache-line purposes.
 \item[array]
 the allocation size is scaled to the specified number of array elements.
 An array may be filled, resized, or aligned.
 \end{description}
+The table shows allocation routines supporting different combinations of storage-management capabilities:
+\begin{center}
+\begin{tabular}{@{}r|r|l|l|l|l@{}}
+\VRef[Table]{t:AllocationVersusCapabilities} shows allocation routines supporting different combinations of storage-management capabilities.
+\begin{table}
+\centering
+\begin{minipage}{0.75\textwidth}
+\begin{tabular}{@{}r|l|l|l|l|l@{}}
 \multicolumn{1}{c}{}&           & \multicolumn{1}{c|}{fill}     & resize        & alignment     & array \\
 \hline
 C               & ©malloc©                      & no                    & no            & no            & no    \\
                 & ©calloc©                      & yes (0 only)  & no            & no            & yes   \\
                 & ©realloc©                     & no/copy               & yes           & no            & no    \\
+                & ©realloc©                     & copy                  & yes           & no            & no    \\
                 & ©memalign©            & no                    & no            & yes           & no    \\
+                & ©aligned_alloc©\footnote{Same as ©memalign© but size is an integral multiple of alignment, which is universally ignored.}
+                                                        & no                    & no            & yes           & no    \\
                 & ©posix_memalign©      & no                    & no            & yes           & no    \\
+                & ©valloc©                      & no                    & no            & yes (page size)& no   \\
+                & ©pvalloc©\footnote{Same as ©valloc© but rounds size to multiple of page size.}
+                                                        & no                    & no            & yes (page size)& no   \\
 \hline
+C11             & ©aligned_alloc©       & no                    & no            & yes           & no    \\
+\hline
+\CFA    & ©alloc©                       & no/copy/yes   & no/yes        & no            & yes   \\
+                & ©align_alloc©         & no/yes                & no            & yes           & yes   \\
+\CFA    & ©cmemalign©           & yes (0 only)  & no            & yes           & yes   \\
+                & ©realloc©                     & copy                  & yes           & yes           & no    \\
+                & ©alloc©                       & no                    & yes           & no            & yes   \\
+                & ©alloc_set©           & yes                   & yes           & no            & yes   \\
+                & ©alloc_align©         & no                    & yes           & yes           & yes   \\
+                & ©alloc_align_set©     & yes                   & yes           & yes           & yes   \\
 \end{tabular}
+\end{center}
+It is impossible to resize with alignment because the underlying ©realloc© allocates storage if more space is needed, and it does not honour alignment from the original allocation.
+\end{minipage}
+\caption{Allocation Routines versus Storage-Management Capabilities}
+\label{t:AllocationVersusCapabilities}
+\end{table}
+\CFA memory management extends the type safety of all allocations by using the left-hand-side type to determine the allocation size and return a matching type for the new storage.
+Type-safe allocation is provided for all C allocation routines and new \CFA allocation routines, \eg in
+\begin{cfa}
+int * ip = (int *)malloc( sizeof(int) );                §\C{// C}§
+int * ip = malloc();                                                    §\C{// \CFA type-safe version of C malloc}§
+int * ip = alloc();                                                             §\C{// \CFA type-safe uniform alloc}§
+\end{cfa}
+the latter two allocations determine the allocation size from the type pointed to by ©ip© (©int©) and cast the pointer to the allocated storage to ©int *©.
+\CFA memory management extends allocation safety by implicitly honouring all alignment requirements, \eg in
+\begin{cfa}
+struct S { int i; } __attribute__(( aligned( 128 ) )); // cache-line alignment
+S * sp = malloc();                                                              §\C{// honour type alignment}§
+\end{cfa}
+the storage allocation is implicitly aligned to 128 rather than the default 16.
+The alignment check is performed at compile time so there is no runtime cost.
+\CFA memory management extends the resize capability with the notion of \newterm{sticky properties}.
+Hence, initial allocation capabilities are remembered and maintained when resize requires copying.
+For example, an initial alignment and fill capability are preserved during a resize copy so the copy has the same alignment and extended storage is filled.
+Without sticky properties it is dangerous to use ©realloc©, resulting in an idiom of manually performing the reallocation to maintain correctness.
+\begin{cfa}
+\end{cfa}
+\CFA memory management extends allocation to support constructors for initialization of allocated storage, \eg in
+\begin{cfa}
+struct S { int i; };                                                    §\C{// cache-line alignment}§
+void ?{}( S & s, int i ) { s.i = i; }
+// assume ?|? operator for printing an S
+S & sp = *®new®( 3 );                                                   §\C{// call constructor after allocation}§
+sout | sp.i;
+®delete®( &sp );
+S * spa = ®anew®( 10, 5 );                                              §\C{// allocate array and initialize each array element}§
+for ( i; 10 ) sout | spa[i] | nonl;
+sout | nl;
+®adelete®( 10, spa );
+\end{cfa}
+Allocation routines ©new©/©anew© allocate a variable/array and initialize storage using the allocated type's constructor.
+Note the matching deallocation routines ©delete©/©adelete©.
 \leavevmode
 \begin{cfa}[aboveskip=0pt,belowskip=0pt]
-// C unsafe allocation
 extern "C" {
+void * malloc( size_t size );§\indexc{malloc}§
+void * calloc( size_t dim, size_t size );§\indexc{calloc}§
+void * realloc( void * ptr, size_t size );§\indexc{realloc}§
+void * memalign( size_t align, size_t size );§\indexc{memalign}§
+int posix_memalign( void ** ptr, size_t align, size_t size );§\indexc{posix_memalign}§
+// C unsafe initialization/copy
+void * memset( void * dest, int c, size_t size );
+void * memcpy( void * dest, const void * src, size_t size );
+}
+        // C unsafe allocation
+        void * malloc( size_t size );§\indexc{malloc}§
+        void * calloc( size_t dim, size_t size );§\indexc{calloc}§
+        void * realloc( void * ptr, size_t size );§\indexc{realloc}§
+        void * memalign( size_t align, size_t size );§\indexc{memalign}§
+        void * aligned_alloc( size_t align, size_t size );§\indexc{aligned_alloc}§
+        int posix_memalign( void ** ptr, size_t align, size_t size );§\indexc{posix_memalign}§
+        void * cmemalign( size_t alignment, size_t noOfElems, size_t elemSize );§\indexc{cmemalign}§ // CFA
+        // C unsafe initialization/copy
+        void * memset( void * dest, int c, size_t size );§\indexc{memset}§
+        void * memcpy( void * dest, const void * src, size_t size );§\indexc{memcpy}§
+}
+void * realloc( void * oaddr, size_t nalign, size_t size ); // CFA heap
 forall( dtype T | sized(T) ) {
 // §\CFA§ safe equivalents, i.e., implicit size specification
+        // §\CFA§ safe equivalents, i.e., implicit size specification
         T * malloc( void );
         T * calloc( size_t dim );
         T * realloc( T * ptr, size_t size );
         T * memalign( size_t align );
+        T * cmemalign( size_t align, size_t dim  );
         T * aligned_alloc( size_t align );
         int posix_memalign( T ** ptr, size_t align );
+// §\CFA§ safe general allocation, fill, resize, array
+        T * alloc( void );§\indexc{alloc}§
+        T * alloc( char fill );
+        T * alloc( size_t dim );
+        T * alloc( size_t dim, char fill );
+        T * alloc( T ptr[], size_t dim );
+        T * alloc( T ptr[], size_t dim, char fill );
+// §\CFA§ safe general allocation, align, fill, array
+        T * align_alloc( size_t align );
+        T * align_alloc( size_t align, char fill );
+        T * align_alloc( size_t align, size_t dim );
+        T * align_alloc( size_t align, size_t dim, char fill );
+// §\CFA§ safe initialization/copy, i.e., implicit size specification
+        T * memset( T * dest, char c );§\indexc{memset}§
+        // §\CFA§ safe general allocation, fill, resize, alignment, array
+        T * alloc( void );§\indexc{alloc}§                                      §\C[3.5in]{// variable, T size}§
+        T * alloc( size_t dim );                                                        §\C{// array[dim], T size elements}§
+        T * alloc( T ptr[], size_t dim );                                       §\C{// realloc array[dim], T size elements}§
+        T * alloc_set( char fill );§\indexc{alloc_set}§         §\C{// variable, T size, fill bytes with value}§
+        T * alloc_set( T fill );                                                        §\C{// variable, T size, fill with value}§
+        T * alloc_set( size_t dim, char fill );                         §\C{// array[dim], T size elements, fill bytes with value}§
+        T * alloc_set( size_t dim, T fill );                            §\C{// array[dim], T size elements, fill elements with value}§
+        T * alloc_set( size_t dim, const T fill[] );            §\C{// array[dim], T size elements, fill elements with array}§
+        T * alloc_set( T ptr[], size_t dim, char fill );        §\C{// realloc array[dim], T size elements, fill bytes with value}§
+        T * alloc_align( size_t align );                                        §\C{// aligned variable, T size}§
+        T * alloc_align( size_t align, size_t dim );            §\C{// aligned array[dim], T size elements}§
+        T * alloc_align( T ptr[], size_t align );                       §\C{// realloc new aligned array}§
+        T * alloc_align( T ptr[], size_t align, size_t dim ); §\C{// realloc new aligned array[dim]}§
+        T * alloc_align_set( size_t align, char fill );         §\C{// aligned variable, T size, fill bytes with value}§
+        T * alloc_align_set( size_t align, T fill );            §\C{// aligned variable, T size, fill with value}§
+        T * alloc_align_set( size_t align, size_t dim, char fill ); §\C{// aligned array[dim], T size elements, fill bytes with value}§
+        T * alloc_align_set( size_t align, size_t dim, T fill ); §\C{// aligned array[dim], T size elements, fill elements with value}§
+        T * alloc_align_set( size_t align, size_t dim, const T fill[] ); §\C{// aligned array[dim], T size elements, fill elements with array}§
+        T * alloc_align_set( T ptr[], size_t align, size_t dim, char fill ); §\C{// realloc new aligned array[dim], fill new bytes with value}§
+        // §\CFA§ safe initialization/copy, i.e., implicit size specification
+        T * memset( T * dest, char fill );§\indexc{memset}§
         T * memcpy( T * dest, const T * src );§\indexc{memcpy}§
+// §\CFA§ safe initialization/copy array
         T * amemset( T dest[], char c, size_t dim );
+        // §\CFA§ safe initialization/copy, i.e., implicit size specification, array types
+        T * amemset( T dest[], char fill, size_t dim );
         T * amemcpy( T dest[], const T src[], size_t dim );
+}
 // §\CFA§ allocation/deallocation and constructor/destructor
 forall( dtype T | sized(T), ttype Params | { void ?{}( T *, Params ); } ) T * new( Params p );§\indexc{new}§
 forall( dtype T | { void ^?{}( T * ); } ) void delete( T * ptr );§\indexc{delete}§
 forall( dtype T, ttype Params | { void ^?{}( T * ); void delete( Params ); } )
+// §\CFA§ allocation/deallocation and constructor/destructor, non-array types
+forall( dtype T | sized(T), ttype Params | { void ?{}( T &, Params ); } ) T * new( Params p );§\indexc{new}§
+forall( dtype T | sized(T) | { void ^?{}( T & ); } ) void delete( T * ptr );§\indexc{delete}§
+forall( dtype T, ttype Params | sized(T) | { void ^?{}( T & ); void delete( Params ); } )
   void delete( T * ptr, Params rest );
 // §\CFA§ allocation/deallocation and constructor/destructor, array
 forall( dtype T | sized(T), ttype Params | { void ?{}( T *, Params ); } ) T * anew( size_t dim, Params p );§\indexc{anew}§
 forall( dtype T | sized(T) | { void ^?{}( T * ); } ) void adelete( size_t dim, T arr[] );§\indexc{adelete}§
 forall( dtype T | sized(T) | { void ^?{}( T * ); }, ttype Params | { void adelete( Params ); } )
+// §\CFA§ allocation/deallocation and constructor/destructor, array types
+forall( dtype T | sized(T), ttype Params | { void ?{}( T &, Params ); } ) T * anew( size_t dim, Params p );§\indexc{anew}§
+forall( dtype T | sized(T) | { void ^?{}( T & ); } ) void adelete( size_t dim, T arr[] );§\indexc{adelete}§
+forall( dtype T | sized(T) | { void ^?{}( T & ); }, ttype Params | { void adelete( Params ); } )
   void adelete( size_t dim, T arr[], Params rest );
 \end{cfa}

driver/Makefile.am

r3c64c668	r58fe85a
28	28	@test -z "$(CFA_BINDIR)" \|\| $(MKDIR_P) "$(CFA_BINDIR)"
29	29	@echo " $(INSTALL_PROGRAM_ENV) $(INSTALL_PROGRAM) cfa '$(CFA_BINDIR)/$(CFA_NAME)'"; \
	30	chmod u+w $(CFA_BINDIR);\
30	31	$(INSTALL_PROGRAM_ENV) $(INSTALL_PROGRAM) cfa $(CFA_BINDIR)/$(CFA_NAME) \|\| exit $$?
31	32

driver/cc1.cc

-              r3c64c668
+              r58fe85a
 // Created On       : Fri Aug 26 14:23:51 2005
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Sun Oct 20 08:14:33 2019
 // Update Count     : 385
+// Last Modified On : Tue Nov 17 14:27:08 2020
+// Update Count     : 414
 //
 …
 #include <unistd.h>                                                                             // execvp, fork, unlink
 #include <sys/wait.h>                                                                   // wait
 #include <fcntl.h>
+#include <fcntl.h>                                                                              // creat
 …
 static string o_file;
 static string bprefix;
+static string lang;                                                                             // -x flag
 …
 static string __CFA_FLAGPREFIX__( "__CFA_FLAG" );               // "N__=" suffix
+static string __CFA_FLAGPREFIX__( "__CFA_FLAG" );               // "__CFA_FLAG__=" suffix
 static void checkEnv1( const char * args[], int & nargs ) { // stage 1
 …
                         if ( prefix( val, "-compiler=" ) ) {
                                 compiler_path = val.substr( 10 );
+                        } else if ( prefix( val, "-x=" ) ) {
+                                lang = val.substr( 3 );
                         } // if
                 } // if
 …
                         } else if ( val == "-CFA" ) {
                                 CFA_flag = true;
                         } else if ( val == "-save-temps" ) {
+                        } else if ( val == "-save-temps" || val == "--save-temps" ) {
                                 save_temps = true;
                         } else if ( prefix( val, "-o=" ) ) {            // output file for -CFA
 …
                         } else if ( prefix( val, "-B=" ) ) {            // location of cfa-cpp
                                 bprefix = val.substr( 3 );
+                        } else if ( prefix( val, "-x=" ) ) {            // ignore
                         } else {                                                                        // normal flag for cfa-cpp
                                 args[nargs++] = ( *new string( arg.substr( arg.find_first_of( "=" ) + 1 ) ) ).c_str();
 …
 } // checkEnv2
+static char tmpname[] = P_tmpdir "/CFAXXXXXX.ifa";
+#define CFA_SUFFIX ".ifa"
+static char tmpname[] = P_tmpdir "/CFAXXXXXX" CFA_SUFFIX;
 static int tmpfilefd = -1;
 static bool startrm = false;
 …
                         if ( arg == "-quiet" ) {
                         } else if ( arg == "-imultilib" || arg == "-imultiarch" ) {
                                 i += 1;                                                                 // and the argument
+                                i += 1;                                                                 // and argument
                         } else if ( prefix( arg, "-A" ) ) {
                         } else if ( prefix( arg, "-D__GNU" ) ) {
 …
                                 //********
                         } else if ( arg == "-D" && prefix( argv[i + 1], "__GNU" ) ) {
                                 i += 1;                                                                 // and the argument
+                                i += 1;                                                                 // and argument
                                 // strip flags controlling cpp step
 …
                                 cpp_flag = true;
                         } else if ( arg == "-D" && string( argv[i + 1] ) == "__CPP__" ) {
                                 i += 1;                                                                 // and the argument
+                                i += 1;                                                                 // and argument
                                 cpp_flag = true;
 …
                                 cpp_out = argv[i];
                         } else {
                                 args[nargs++] = argv[i];                                // pass the flag along
+                                args[nargs++] = argv[i];                                // pass flag along
                                 // CPP flags with an argument
                                 if ( arg == "-D" || arg == "-U" || arg == "-I" || arg == "-MF" || arg == "-MT" || arg == "-MQ" ||
 …
                                          arg == "-iwithprefix" || arg == "-iwithprefixbefore" || arg == "-isystem" || arg == "-isysroot" ) {
                                         i += 1;
                                         args[nargs++] = argv[i];                        // pass the argument along
+                                        args[nargs++] = argv[i];                        // pass argument along
                                         #ifdef __DEBUG_H__
                                         cerr << "argv[" << i << "]:\"" << argv[i] << "\"" << endl;
                                         #endif // __DEBUG_H__
                                 } else if ( arg == "-MD" || arg == "-MMD" ) {
+                                        // gcc frontend generates the dependency file-name after the -MD/-MMD flag, but it is necessary to
+                                        // prefix that file name with -MF.
                                         args[nargs++] = "-MF";                          // insert before file
                                         i += 1;
                                         args[nargs++] = argv[i];                        // pass the argument along
+                                        args[nargs++] = argv[i];                        // pass argument along
                                         #ifdef __DEBUG_H__
                                         cerr << "argv[" << i << "]:\"" << argv[i] << "\"" << endl;
 …
                 args[0] = compiler_path.c_str();
+                suffix( cpp_in, args, nargs );                                  // check suffix
+                if ( lang.size() == 0 ) {
+                        suffix( cpp_in, args, nargs );                          // check suffix
+                } else {
+                        args[nargs++] = "-x";
+                        args[nargs++] = ( *new string( lang.c_str() ) ).c_str();
+                } // if
                 args[nargs++] = cpp_in;
                 if ( o_flag ) {                                                                 // location for output
 …
         // Run the C preprocessor and save the output in the given file.
         if ( fork() == 0 ) {                                                             // child process ?
+        if ( fork() == 0 ) {                                                            // child process ?
                 // -o xxx.ii cannot be used to write the output file from cpp because no output file is created if cpp detects
                 // an error (e.g., cannot find include file). Whereas, output is always generated, even when there is an error,
 …
                 args[0] = compiler_path.c_str();
+                suffix( cpp_in, args, nargs );                                  // check suffix
+                if ( lang.size() == 0 ) {
+                        suffix( cpp_in, args, nargs );                          // check suffix
+                } else {
+                        args[nargs++] = "-x";
+                        args[nargs++] = ( *new string( lang.c_str() ) ).c_str();
+                } // if
                 args[nargs++] = cpp_in;                                                 // input to cpp
                 args[nargs] = nullptr;                                                  // terminate argument list
 …
         if ( WIFSIGNALED(code) ) {                                                      // child failed ?
+                rmtmpfile();                                                                    // remove tmpname
                 cerr << "CC1 Translator error: stage 1, child failed " << WTERMSIG(code) << endl;
                 exit( EXIT_FAILURE );
         } // if
         exit( WEXITSTATUS(code) );                                                      // bad cpp result stops top-level gcc
+        exit( WEXITSTATUS( code ) );                                            // bad cpp result stops top-level gcc
 } // Stage1
 …
                         } else if ( arg == "-fno-diagnostics-color" ) {
                                 color_arg = Color_Auto;
+                        }
+                        } // if
                         if ( arg == "-quiet" || arg == "-version" || arg == "-fpreprocessed" ||
                                 // Currently CFA does not support precompiled .h files.
                                 prefix( arg, "--output-pch" ) ) {
+                                 // Currently CFA does not support precompiled .h files.
+                                 prefix( arg, "--output-pch" ) ) {
                                 // strip inappropriate flags with an argument
 …
                         } else {
                                 args[nargs++] = argv[i];                                // pass the flag along
+                                args[nargs++] = argv[i];                                // pass flag along
                                 if ( arg == "-o" ) {
                                         i += 1;
                                         cpp_out = argv[i];
                                         args[nargs++] = argv[i];                        // pass the argument along
+                                        args[nargs++] = argv[i];                        // pass argument along
                                         #ifdef __DEBUG_H__
                                         cerr << "arg:\"" << argv[i] << "\"" << endl;
 …
                         } // if
                         cfa_cpp_out = cfa_cpp_out.substr( 0, dot ) + ".ifa";
+                        cfa_cpp_out = cfa_cpp_out.substr( 0, dot ) + CFA_SUFFIX;
                         if ( creat( cfa_cpp_out.c_str(), 0666 ) == -1 ) {
                                 perror( "CC1 Translator error: stage 2, creat" );
 …
         // output.  Otherwise, run the cfa-cpp preprocessor on the temporary file and save the result into the output file.
         if ( fork() == 0 ) {                                                            // child runs CFA
+        if ( fork() == 0 ) {                                                            // child runs CFA preprocessor
                 cargs[0] = ( *new string( bprefix + "cfa-cpp" ) ).c_str();
                 cargs[ncargs++] = cpp_in;
 …
         #endif // __DEBUG_H__
         if ( fork() == 0 ) {                                                            // child runs CFA
+        if ( fork() == 0 ) {                                                            // child runs gcc
                 args[0] = compiler_path.c_str();
                 args[nargs++] = "-S";                                                   // only compile and put assembler output in specified file

driver/cfa.cc

-              r3c64c668
+              r58fe85a
 // Created On       : Tue Aug 20 13:44:49 2002
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Fri Jan 31 16:48:03 2020
 // Update Count     : 421
+// Last Modified On : Tue Nov 17 14:27:28 2020
+// Update Count     : 440
 //
 #include <iostream>
+#include <cstdio>      // perror
+#include <cstdlib>     // putenv, exit
+#include <climits>     // PATH_MAX
+#include <unistd.h>    // execvp
+#include <string>      // STL version
+#include <string.h>    // strcmp
+#include <algorithm>   // find
+#include <cstdio>                                                                               // perror
+#include <cstdlib>                                                                              // putenv, exit
+#include <climits>                                                                              // PATH_MAX
+#include <string>                                                                               // STL version
+#include <algorithm>                                                                    // find
+#include <unistd.h>                                                                             // execvp
 #include <sys/types.h>
 #include <sys/stat.h>
 …
 using std::to_string;
+// #define __DEBUG_H__
+// "N__=" suffix
+static string __CFA_FLAGPREFIX__( "__CFA_FLAG" );
+void Putenv( char * argv[], string arg ) {
+//#define __DEBUG_H__
+#define xstr(s) str(s)
+#define str(s) #s
+static string __CFA_FLAGPREFIX__( "__CFA_FLAG" );               // "__CFA_FLAG__=" suffix
+static void Putenv( char * argv[], string arg ) {
         // environment variables must have unique names
         static int flags = 0;
 …
 } // Putenv
+// check if string has prefix
+bool prefix( const string & arg, const string & pre ) {
+static bool prefix( const string & arg, const string & pre ) { // check if string has prefix
         return arg.substr( 0, pre.size() ) == pre;
 } // prefix
 inline bool ends_with(const string & str, const string & sfix) {
+static inline bool ends_with(const string & str, const string & sfix) {
         if (sfix.size() > str.size()) return false;
         return std::equal(str.rbegin(), str.rbegin() + sfix.size(), sfix.rbegin(), sfix.rend());
 …
 // check if string has suffix
 bool suffix( const string & arg ) {
+static bool suffix( const string & arg ) {
         enum { NumSuffixes = 3 };
         static const string suffixes[NumSuffixes] = { "cfa", "hfa", "ifa" };
 …
 } // suffix
 static inline bool dirExists( const string & path ) {   // check if directory exists
     struct stat info;
 …
 static inline string dir(const string & path) {
         return path.substr(0, path.find_last_of('/'));
+}
+} // dir
 // Different path modes
 …
+}
-#define xstr(s) str(s)
-#define str(s) #s
 int main( int argc, char * argv[] ) {
 …
         PathMode path = FromProc();
         const char *args[argc + 100];                                           // cfa command line values, plus some space for additional flags
+        const char * args[argc + 100];                                          // cfa command line values, plus some space for additional flags
         int sargs = 1;                                                                          // starting location for arguments in args list
         int nargs = sargs;                                                                      // number of arguments in args list; 0 => command name
         const char *libs[argc + 20];                                            // non-user libraries must come separately, plus some added libraries and flags
+        const char * libs[argc + 20];                                           // non-user libraries must come separately, plus some added libraries and flags
         int nlibs = 0;
 …
                         if ( arg == "-Xlinker" || arg == "-o" ) {
                                 args[nargs++] = argv[i];                                // pass argument along
+                                args[nargs++] = argv[i];                                // pass flag along
                                 i += 1;
                                 if ( i == argc ) continue;                              // next argument available ?
                                 args[nargs++] = argv[i];                                // pass argument along
                                 if ( arg == "-o" ) o_file = i;                  // remember file
-                        } else if ( arg == "-XCFA" ) {                          // CFA pass through
-                                i += 1;
-                                if ( i == argc ) continue;                              // next argument available ?
-                                Putenv( argv, argv[i] );
                                 // CFA specific arguments
+                        } else if ( strncmp(arg.c_str(), "-XCFA", 5) == 0 ) { // CFA pass through
+                                if ( arg.size() == 5 ) {
+                                        i += 1;
+                                        if ( i == argc ) continue;                      // next argument available ?
+                                        Putenv( argv, argv[i] );
+                                } else if ( arg[5] == ',' ) {                   // CFA specific arguments
+                                        Putenv( argv, argv[i] + 6 );
+                                } else {                                                                // CFA specific arguments
+                                        args[nargs++] = argv[i];
+                                } // if
                         } else if ( arg == "-CFA" ) {
                                 CFA_flag = true;                                                // strip the -CFA flag
 …
                         } else if ( arg == "-nodebug" ) {
                                 debug = false;                                                  // strip the nodebug flag
-                        } else if ( arg == "-nolib" ) {
-                                nolib = true;                                                   // strip the nodebug flag
                         } else if ( arg == "-quiet" ) {
                                 quiet = true;                                                   // strip the quiet flag
                         } else if ( arg == "-noquiet" ) {
                                 quiet = false;                                                  // strip the noquiet flag
+                        } else if ( arg == "-no-include-stdhdr" ) {
+                                noincstd_flag = true;                                   // strip the no-include-stdhdr flag
+                        } else if ( arg == "-nolib" ) {
+                                nolib = true;                                                   // strip the nolib flag
                         } else if ( arg == "-help" ) {
                                 help = true;                                                    // strip the help flag
                         } else if ( arg == "-nohelp" ) {
                                 help = false;                                                   // strip the nohelp flag
-                        } else if ( arg == "-no-include-stdhdr" ) {
-                                noincstd_flag = true;                                   // strip the no-include-stdhdr flag
                         } else if ( arg == "-cfalib") {
                                 compiling_libs = true;
 …
                         } else if ( arg == "-v" ) {
                                 verbose = true;                                                 // verbosity required
                                 args[nargs++] = argv[i];                                // pass argument along
+                                args[nargs++] = argv[i];                                // pass flag along
                         } else if ( arg == "-g" ) {
                                 debugging = true;                                               // symbolic debugging required
                                 args[nargs++] = argv[i];                                // pass argument along
                         } else if ( arg == "-save-temps" ) {
                                 args[nargs++] = argv[i];                                // pass argument along
+                                args[nargs++] = argv[i];                                // pass flag along
+                        } else if ( arg == "-save-temps" || arg == "--save-temps" ) {
+                                args[nargs++] = argv[i];                                // pass flag along
                                 Putenv( argv, arg );                                    // save cfa-cpp output
                         } else if ( prefix( arg, "-x" ) ) {                     // file suffix ?
                                 string lang;
                                 args[nargs++] = argv[i];                                // pass argument along
+                                args[nargs++] = argv[i];                                // pass flag along
                                 if ( arg.length() == 2 ) {                              // separate argument ?
                                         i += 1;
 …
                                         lang = arg.substr( 2 );
                                 } // if
+                                x_flag = lang != "none";
+                                if ( x_flag ) {
+                                        cerr << argv[0] << " warning, only one -x flag per compile, ignoring subsequent flag." << endl;
+                                } else {
+                                        x_flag = true;
+                                        Putenv( argv, string( "-x=" ) + lang );
+                                } // if
                         } else if ( prefix( arg, "-std=" ) || prefix( arg, "--std=" ) ) {
                                 std_flag = true;                                                // -std=XX provided
                                 args[nargs++] = argv[i];                                // pass argument along
+                                args[nargs++] = argv[i];                                // pass flag along
                         } else if ( arg == "-w" ) {
                                 args[nargs++] = argv[i];                                // pass argument along
+                                args[nargs++] = argv[i];                                // pass flag along
                                 Putenv( argv, arg );
                         } else if ( prefix( arg, "-W" ) ) {                     // check before next tests
                                 if ( arg == "-Werror" || arg == "-Wall" ) {
                                         args[nargs++] = argv[i];                        // pass argument along
+                                        args[nargs++] = argv[i];                        // pass flag along
                                         Putenv( argv, argv[i] );
                                 } else {
 …
                                 bprefix = arg.substr(2);                                // strip the -B flag
                         } else if ( arg == "-c" || arg == "-S" || arg == "-E" || arg == "-M" || arg == "-MM" ) {
                                 args[nargs++] = argv[i];                                // pass argument along
+                                args[nargs++] = argv[i];                                // pass flag along
                                 if ( arg == "-E" || arg == "-M" || arg == "-MM" ) {
                                         cpp_flag = true;                                        // cpp only
                                 } // if
                                 link = false;                           // no linkage required
+                        } else if ( arg == "-D" || arg == "-U" || arg == "-I" || arg == "-MF" || arg == "-MT" || arg == "-MQ" ||
+                                                arg == "-include" || arg == "-imacros" || arg == "-idirafter" || arg == "-iprefix" ||
+                                                arg == "-iwithprefix" || arg == "-iwithprefixbefore" || arg == "-isystem" || arg == "-isysroot" ) {
+                                args[nargs++] = argv[i];                                // pass flag along
+                                i += 1;
+                                args[nargs++] = argv[i];                                // pass argument along
                         } else if ( arg[1] == 'l' ) {
                                 // if the user specifies a library, load it after user code
 …
         #ifdef __x86_64__
         args[nargs++] = "-mcx16";                                                       // allow double-wide CAA
+        args[nargs++] = "-mcx16";                                                       // allow double-wide CAS
         #endif // __x86_64__
 …
         string libbase;
         switch(path) {
         case Installed:
+          case Installed:
                 args[nargs++] = "-I" CFA_INCDIR;
                 // do not use during build
 …
                 libbase = CFA_LIBDIR;
                 break;
         case BuildTree:
         case Distributed:
+          case BuildTree:
+          case Distributed:
                 args[nargs++] = "-I" TOP_SRCDIR "libcfa/src";
                 // do not use during build
 …
         string libdir = libbase + arch + "-" + config;
         if (path != Distributed) {
+        if ( path != Distributed ) {
                 if ( ! nolib && ! dirExists( libdir ) ) {
                         cerr << argv[0] << " internal error, configuration " << config << " not installed." << endl;
 …
         } // if
+        string preludedir;
         switch(path) {
+        case Installed   : Putenv( argv, "--prelude-dir=" + libdir ); break;
+        case BuildTree   : Putenv( argv, "--prelude-dir=" + libdir + "/prelude" ); break;
+        case Distributed : Putenv( argv, "--prelude-dir=" + dir(argv[0]) ); break;
+        }
+          case Installed   : preludedir = libdir; break;
+          case BuildTree   : preludedir = libdir + "/prelude"; break;
+          case Distributed : preludedir = dir(argv[0]); break;
+        } // switch
+        Putenv( argv, "--prelude-dir=" + preludedir );
+        args[nargs++] = "-include";
+        args[nargs++] = (*new string(preludedir + "/defines.hfa")).c_str();
         for ( int i = 0; i < nlibs; i += 1 ) {                          // copy non-user libraries after all user libraries
 …
                 args[nargs++] = "-Wl,--pop-state";
                 args[nargs++] = "-pthread";
+                #if defined(  __x86_64__ ) || defined( __ARM_ARCH )
+                args[nargs++] = "-latomic";                                             // allow double-wide CAS
+                #endif // __x86_64__ || __ARM_ARCH
                 args[nargs++] = "-ldl";
-                args[nargs++] = "-lrt";
                 args[nargs++] = "-lm";
         } // if
 …
         if ( bprefix.length() == 0 ) {
                 switch(path) {
                 case Installed   : bprefix = installlibdir; break;
                 case BuildTree   : bprefix = srcdriverdir ; break;
                 case Distributed : bprefix = dir(argv[0]) ; break;
+                }
                 if ( bprefix[bprefix.length() - 1] != '/' ) bprefix += '/';
                 Putenv( argv, string("-B=") + bprefix );
         } // if
+                  case Installed   : bprefix = installlibdir; break;
+                  case BuildTree   : bprefix = srcdriverdir ; break;
+                  case Distributed : bprefix = dir(argv[0]) ; break;
+                } // switch
+        } // if
+        if ( bprefix[bprefix.length() - 1] != '/' ) bprefix += '/';
+        Putenv( argv, string("-B=") + bprefix );
         args[nargs++] = "-Xlinker";                                                     // used by backtrace
 …
                 args[nargs++] = "-Wno-cast-function-type";
                 #endif // HAVE_CAST_FUNCTION_TYPE
                 if ( ! std_flag ) {                                                             // default c11, if none specified
                         args[nargs++] = "-std=gnu11";
+                if ( ! std_flag && ! x_flag ) {
+                        args[nargs++] = "-std=gnu11";                           // default c11, if none specified
                 } // if
                 args[nargs++] = "-fgnu89-inline";
 …
         // execute the command and return the result
         execvp( args[0], (char *const *)args );                         // should not return
+        execvp( args[0], (char * const *)args );                        // should not return
         perror( "CFA Translator error: execvp" );
         exit( EXIT_FAILURE );

libcfa/configure.ac

-              r3c64c668
+              r58fe85a
 AC_PREREQ([2.68])
 AC_INIT([cfa-cc],[1.0.0.0],[cforall@plg.uwaterloo.ca])
+AC_INIT([cfa-cc],[1.0.0],[cforall@plg.uwaterloo.ca])
 AC_CONFIG_AUX_DIR([automake])
 AC_CONFIG_MACRO_DIRS([automake])
 AM_SILENT_RULES([yes])
 m4_include([../automake/cfa.m4])
+m4_include([../tools/build/cfa.m4])
 AM_INIT_AUTOMAKE([subdir-objects])
 …
         [  --enable-distcc     whether or not to enable distributed compilation],
         enable_distcc=$enableval, enable_distcc=no)
+AC_ARG_WITH(bwlimit,
+        [  --with-bwlimit=RATE     RATE the maximum rate at which rsync will be limited when using distributed builds],
+        DIST_BWLIMIT=$withval, DIST_BWLIMIT=0)
 echo -n "checking for distributated build... "
 …
 AC_SUBST(CFADIR_HASH)
 AC_SUBST(CFA_VERSION)
+AC_SUBST(DIST_BWLIMIT)
 #==============================================================================
 …
 AM_CONDITIONAL([BUILDLIB], [test "x${CONFIG_BUILDLIB}" = "xyes"])
+AM_T='$(T)'
+AC_SUBST(AM_T)
 #==============================================================================
 #Transforming cc1 will break compilation
 …
 # Checks for programs.
 LT_INIT
+LT_INIT([disable-static])
 AC_PROG_CXX
 …
 AC_PROG_MAKE_SET
+#io_uring 5.4 and earlier uses defines
+#io_uring 5.5 uses enum values
+#io_uring 5.6 and later uses probes
+AH_TEMPLATE([CFA_HAVE_LINUX_IO_URING_H],[Defined if io_uring support is present when compiling libcfathread.])
+AH_TEMPLATE([CFA_HAVE_IORING_OP_NOP],[Defined if io_uring support is present when compiling libcfathread and supports the operation IORING_OP_NOP.])
+AH_TEMPLATE([CFA_HAVE_IORING_OP_READV],[Defined if io_uring support is present when compiling libcfathread and supports the operation IORING_OP_READV.])
+AH_TEMPLATE([CFA_HAVE_IORING_OP_WRITEV],[Defined if io_uring support is present when compiling libcfathread and supports the operation IORING_OP_WRITEV.])
+AH_TEMPLATE([CFA_HAVE_IORING_OP_FSYNC],[Defined if io_uring support is present when compiling libcfathread and supports the operation IORING_OP_FSYNC.])
+AH_TEMPLATE([CFA_HAVE_IORING_OP_READ_FIXED],[Defined if io_uring support is present when compiling libcfathread and supports the operation IORING_OP_READ_FIXED.])
+AH_TEMPLATE([CFA_HAVE_IORING_OP_WRITE_FIXED],[Defined if io_uring support is present when compiling libcfathread and supports the operation IORING_OP_WRITE_FIXED.])
+AH_TEMPLATE([CFA_HAVE_IORING_OP_POLL_ADD],[Defined if io_uring support is present when compiling libcfathread and supports the operation IORING_OP_POLL_ADD.])
+AH_TEMPLATE([CFA_HAVE_IORING_OP_POLL_REMOVE],[Defined if io_uring support is present when compiling libcfathread and supports the operation IORING_OP_POLL_REMOVE.])
+AH_TEMPLATE([CFA_HAVE_IORING_OP_SYNC_FILE_RANGE],[Defined if io_uring support is present when compiling libcfathread and supports the operation IORING_OP_SYNC_FILE_RANGE.])
+AH_TEMPLATE([CFA_HAVE_IORING_OP_SENDMSG],[Defined if io_uring support is present when compiling libcfathread and supports the operation IORING_OP_SENDMSG.])
+AH_TEMPLATE([CFA_HAVE_IORING_OP_RECVMSG],[Defined if io_uring support is present when compiling libcfathread and supports the operation IORING_OP_RECVMSG.])
+AH_TEMPLATE([CFA_HAVE_IORING_OP_TIMEOUT],[Defined if io_uring support is present when compiling libcfathread and supports the operation IORING_OP_TIMEOUT.])
+AH_TEMPLATE([CFA_HAVE_IORING_OP_TIMEOUT_REMOVE],[Defined if io_uring support is present when compiling libcfathread and supports the operation IORING_OP_TIMEOUT_REMOVE.])
+AH_TEMPLATE([CFA_HAVE_IORING_OP_ACCEPT],[Defined if io_uring support is present when compiling libcfathread and supports the operation IORING_OP_ACCEPT.])
+AH_TEMPLATE([CFA_HAVE_IORING_OP_ASYNC_CANCEL],[Defined if io_uring support is present when compiling libcfathread and supports the operation IORING_OP_ASYNC_CANCEL.])
+AH_TEMPLATE([CFA_HAVE_IORING_OP_LINK_TIMEOUT],[Defined if io_uring support is present when compiling libcfathread and supports the operation IORING_OP_LINK_TIMEOUT.])
+AH_TEMPLATE([CFA_HAVE_IORING_OP_CONNECT],[Defined if io_uring support is present when compiling libcfathread and supports the operation IORING_OP_CONNECT.])
+AH_TEMPLATE([CFA_HAVE_IORING_OP_FALLOCATE],[Defined if io_uring support is present when compiling libcfathread and supports the operation IORING_OP_FALLOCATE.])
+AH_TEMPLATE([CFA_HAVE_IORING_OP_OPENAT],[Defined if io_uring support is present when compiling libcfathread and supports the operation IORING_OP_OPENAT.])
+AH_TEMPLATE([CFA_HAVE_IORING_OP_CLOSE],[Defined if io_uring support is present when compiling libcfathread and supports the operation IORING_OP_CLOSE.])
+AH_TEMPLATE([CFA_HAVE_IORING_OP_FILES_UPDATE],[Defined if io_uring support is present when compiling libcfathread and supports the operation IORING_OP_FILES_UPDATE.])
+AH_TEMPLATE([CFA_HAVE_IORING_OP_STATX],[Defined if io_uring support is present when compiling libcfathread and supports the operation IORING_OP_STATX.])
+AH_TEMPLATE([CFA_HAVE_IORING_OP_READ],[Defined if io_uring support is present when compiling libcfathread and supports the operation IORING_OP_READ.])
+AH_TEMPLATE([CFA_HAVE_IORING_OP_WRITE],[Defined if io_uring support is present when compiling libcfathread and supports the operation IORING_OP_WRITE.])
+AH_TEMPLATE([CFA_HAVE_IORING_OP_FADVISE],[Defined if io_uring support is present when compiling libcfathread and supports the operation IORING_OP_FADVISE.])
+AH_TEMPLATE([CFA_HAVE_IORING_OP_MADVISE],[Defined if io_uring support is present when compiling libcfathread and supports the operation IORING_OP_MADVISE.])
+AH_TEMPLATE([CFA_HAVE_IORING_OP_SEND],[Defined if io_uring support is present when compiling libcfathread and supports the operation IORING_OP_SEND.])
+AH_TEMPLATE([CFA_HAVE_IORING_OP_RECV],[Defined if io_uring support is present when compiling libcfathread and supports the operation IORING_OP_RECV.])
+AH_TEMPLATE([CFA_HAVE_IORING_OP_OPENAT2],[Defined if io_uring support is present when compiling libcfathread and supports the operation IORING_OP_OPENAT2.])
+AH_TEMPLATE([CFA_HAVE_IORING_OP_EPOLL_CTL],[Defined if io_uring support is present when compiling libcfathread and supports the operation IORING_OP_EPOLL_CTL.])
+AH_TEMPLATE([CFA_HAVE_IORING_OP_SPLICE],[Defined if io_uring support is present when compiling libcfathread and supports the operation IORING_OP_SPLICE.])
+AH_TEMPLATE([CFA_HAVE_IORING_OP_PROVIDE_BUFFERS],[Defined if io_uring support is present when compiling libcfathread and supports the operation IORING_OP_PROVIDE_BUFFERS.])
+AH_TEMPLATE([CFA_HAVE_IORING_OP_REMOVE_BUFFER],[Defined if io_uring support is present when compiling libcfathread and supports the operation IORING_OP_REMOVE_BUFFER.])
+AH_TEMPLATE([CFA_HAVE_IORING_OP_TEE],[Defined if io_uring support is present when compiling libcfathread and supports the operation IORING_OP_TEE.])
+AH_TEMPLATE([CFA_HAVE_IOSQE_FIXED_FILE],[Defined if io_uring support is present when compiling libcfathread and supports the flag FIXED_FILE.])
+AH_TEMPLATE([CFA_HAVE_IOSQE_IO_DRAIN],[Defined if io_uring support is present when compiling libcfathread and supports the flag IO_DRAIN.])
+AH_TEMPLATE([CFA_HAVE_IOSQE_ASYNC],[Defined if io_uring support is present when compiling libcfathread and supports the flag ASYNC.])
+AH_TEMPLATE([CFA_HAVE_IOSQE_IO_LINK],[Defined if io_uring support is present when compiling libcfathread and supports the flag IO_LINK.])
+AH_TEMPLATE([CFA_HAVE_IOSQE_IO_HARDLINK],[Defined if io_uring support is present when compiling libcfathread and supports the flag IO_HARDLINK.])
+AH_TEMPLATE([CFA_HAVE_SPLICE_F_FD_IN_FIXED],[Defined if io_uring support is present when compiling libcfathread and supports the flag SPLICE_F_FD_IN_FIXED.])
+AH_TEMPLATE([CFA_HAVE_IORING_SETUP_ATTACH_WQ],[Defined if io_uring support is present when compiling libcfathread and supports the flag IORING_SETUP_ATTACH_WQ.])
+AH_TEMPLATE([CFA_HAVE_PREADV2],[Defined if preadv2 support is present when compiling libcfathread.])
+AH_TEMPLATE([CFA_HAVE_PWRITEV2],[Defined if pwritev2 support is present when compiling libcfathread.])
+AH_TEMPLATE([CFA_HAVE_PWRITEV2],[Defined if pwritev2 support is present when compiling libcfathread.])
+AH_TEMPLATE([CFA_HAVE_STATX],[Defined if statx support is present when compiling libcfathread.])
+AH_TEMPLATE([CFA_HAVE_OPENAT2],[Defined if openat2 support is present when compiling libcfathread.])
+AH_TEMPLATE([__CFA_NO_STATISTICS__],[Defined if libcfathread was compiled without support for statistics.])
+define(ioring_ops, [IORING_OP_NOP,IORING_OP_READV,IORING_OP_WRITEV,IORING_OP_FSYNC,IORING_OP_READ_FIXED,IORING_OP_WRITE_FIXED,IORING_OP_POLL_ADD,IORING_OP_POLL_REMOVE,IORING_OP_SYNC_FILE_RANGE,IORING_OP_SENDMSG,IORING_OP_RECVMSG,IORING_OP_TIMEOUT,IORING_OP_TIMEOUT_REMOVE,IORING_OP_ACCEPT,IORING_OP_ASYNC_CANCEL,IORING_OP_LINK_TIMEOUT,IORING_OP_CONNECT,IORING_OP_FALLOCATE,IORING_OP_OPENAT,IORING_OP_CLOSE,IORING_OP_FILES_UPDATE,IORING_OP_STATX,IORING_OP_READ,IORING_OP_WRITE,IORING_OP_FADVISE,IORING_OP_MADVISE,IORING_OP_SEND,IORING_OP_RECV,IORING_OP_OPENAT2,IORING_OP_EPOLL_CTL,IORING_OP_SPLICE,IORING_OP_PROVIDE_BUFFERS,IORING_OP_REMOVE_BUFFER,IORING_OP_TEE])
+define(ioring_flags, [IOSQE_FIXED_FILE,IOSQE_IO_DRAIN,IOSQE_ASYNC,IOSQE_IO_LINK,IOSQE_IO_HARDLINK,SPLICE_F_FD_IN_FIXED,IORING_SETUP_ATTACH_WQ])
+define(ioring_from_decls, [
+        m4_foreach([op], [ioring_ops], [
+                AC_CHECK_DECL(op, [AC_DEFINE([CFA_HAVE_]op)], [], [[#include <linux/io_uring.h>]])
+        ])
+])
+AC_CHECK_HEADERS([linux/io_uring.h], [
+        AC_DEFINE(CFA_HAVE_LINUX_IO_URING_H)
+        AC_CHECK_HEADER([liburing.h], [
+                AC_CHECK_LIB([uring], [io_uring_get_probe], [
+                        m4_foreach([op], [ioring_ops], [
+                                AC_CHECK_DECL(op, [
+                                        AC_RUN_IFELSE([
+                                                AC_LANG_PROGRAM(
+                                                        [[#include <liburing.h>]],
+                                                        [[int main() {]]
+                                                        [[      struct io_uring_probe *probe = io_uring_get_probe();]]
+                                                        [[      if(io_uring_opcode_supported(probe, ]]op[[))]]
+                                                        [[              return 0;]]
+                                                        [[      else]]
+                                                        [[              return 1;]]
+                                                        [[}]]
+                                                )
+                                        ],[
+                                                AC_DEFINE([CFA_HAVE_]op)
+                                        ],[
+                                                AC_MSG_FAILURE([Check support for] op [ with liburing failed])
+                                        ])
+                                ], [], [[#include <linux/io_uring.h>]])
+                        ])
+                ], [
+                        ioring_from_decls
+                ])
+        ], [
+                ioring_from_decls
+        ])
+        # check support for various io_uring flags
+        m4_foreach([op], [ioring_flags], [
+                AC_CHECK_DECL(op, [AC_DEFINE([CFA_HAVE_]op)], [], [[#include <linux/io_uring.h>]])
+        ])
+])
+AC_CHECK_FUNC([preadv2], [AC_DEFINE([CFA_HAVE_PREADV2])])
+AC_CHECK_FUNC([pwritev2], [AC_DEFINE([CFA_HAVE_PWRITEV2])])
 AC_CONFIG_FILES([
         Makefile
 …
         prelude/Makefile
         ])
+AC_CONFIG_FILES([src/concurrency/io/call.cfa], [python3 ${srcdir}/src/concurrency/io/call.cfa.in > src/concurrency/io/call.cfa])
+AC_CONFIG_HEADERS(prelude/defines.hfa)
 AC_OUTPUT()

libcfa/prelude/Makefile.am

-              r3c64c668
+              r58fe85a
 # put into lib for now
 cfalibdir = ${CFA_LIBDIR}
+cfalib_DATA = gcc-builtins.cf builtins.cf extras.cf prelude.cfa bootloader.c
+cfalib_DATA = gcc-builtins.cf builtins.cf extras.cf prelude.cfa bootloader.c defines.hfa
+EXTRA_DIST = bootloader.cf builtins.c builtins.def extras.c extras.regx extras.regx2 prelude-gen.cc prototypes.awk prototypes.c prototypes.sed sync-builtins.cf
 CC = @LOCAL_CFACC@
 …
 MOSTLYCLEANFILES = bootloader.c builtins.cf extras.cf gcc-builtins.c gcc-builtins.cf prelude.cfa
+DISTCLEANFILES = $(DEPDIR)/builtins.Po
 MAINTAINERCLEANFILES = ${addprefix ${libdir}/,${cfalib_DATA}} ${addprefix ${libdir}/,${lib_LIBRARIES}}
 if ENABLE_DISTCC
 distribution: @LOCAL_CFACC@ @LOCAL_CC1@ @CFACPP@ gcc-builtins.cf builtins.cf extras.cf prelude.cfa bootloader.c $(srcdir)/../../tools/build/push2dist.sh
         ${AM_V_GEN}$(srcdir)/../../tools/build/push2dist.sh @CFADIR_HASH@
+        ${AM_V_GEN}$(srcdir)/../../tools/build/push2dist.sh @CFADIR_HASH@ @DIST_BWLIMIT@
         @echo "Dummy file to track distribution to remote hosts" > ${@}

libcfa/prelude/bootloader.cf

-              r3c64c668
+              r58fe85a
 extern "C" { static inline int invoke_main(int argc, char* argv[], char* envp[]); }
+int cfa_args_argc;
+char ** cfa_args_argv;
+char ** cfa_args_envp;
 int main(int argc, char* argv[], char* envp[]) {
+        cfa_args_argc = argc;
+        cfa_args_argv = argv;
+        cfa_args_envp = envp;
         return invoke_main(argc, argv, envp);
+}

libcfa/prelude/builtins.c

-              r3c64c668
+              r58fe85a
 // Author           : Peter A. Buhr
 // Created On       : Fri Jul 21 16:21:03 2017
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Thu Nov 21 16:31:39 2019
 // Update Count     : 101
+// Last Modified By : Andrew Beach
+// Last Modified On : Tue Oct 27 14:42:00 2020
+// Update Count     : 111
 //
+#define __cforall_builtins__
 // type that wraps a pointer and a destructor-like function - used in generating implicit destructor calls for struct members in user-defined functions
 …
 void abort( const char fmt[], ... ) __attribute__ (( format(printf, 1, 2), __nothrow__, __leaf__, __noreturn__ ));
+forall(dtype T)
+static inline T & identity(T & i) {
+        return i;
+}
+// generator support
+struct $generator {
+        inline int;
+};
+static inline void  ?{}($generator & this) { ((int&)this) = 0; }
+static inline void ^?{}($generator &) {}
+trait is_generator(dtype T) {
+      void main(T & this);
+      $generator * get_generator(T & this);
+};
+forall(dtype T | is_generator(T))
+static inline T & resume(T & gen) {
+        main(gen);
+        return gen;
+}
 // implicit increment, decrement if += defined, and implicit not if != defined
 …
 // universal typed pointer constant
 static inline forall( dtype DT ) DT * intptr( uintptr_t addr ) { return (DT *)addr; }
+static inline forall( ftype FT ) FT * intptr( uintptr_t addr ) { return (FT *)addr; }
+#if defined(__SIZEOF_INT128__)
+// constructor for 128-bit numbers (all constants are unsigned as +/- are operators)
+static inline void ?{}( unsigned int128 & this, unsigned long int h, unsigned long int l ) {
+        this = (unsigned int128)h << 64 | (unsigned int128)l;
+} // ?{}
+#endif // __SIZEOF_INT128__
 // exponentiation operator implementation

libcfa/src/Makefile.am

-              r3c64c668
+              r58fe85a
 ## Created On       : Sun May 31 08:54:01 2015
 ## Last Modified By : Peter A. Buhr
 ## Last Modified On : Mon Jul 15 22:43:27 2019
 ## Update Count     : 241
+## Last Modified On : Wed Dec  9 22:46:14 2020
+## Update Count     : 250
 ###############################################################################
 …
 ACLOCAL_AMFLAGS  = -I automake
 include $(srcdir)/../../src/cfa.make
+include $(top_srcdir)/../tools/build/cfa.make
 libdir = ${CFA_LIBDIR}
 …
 # AM_CFAFLAGS for only cfa source
 # use -no-include-stdhdr to prevent rebuild cycles
 # The built sources must not depend on the installed headers
 AM_CFAFLAGS = -quiet -cfalib -I$(srcdir)/stdhdr $(if $(findstring ${gdbwaittarget}, ${@}), -XCFA --gdb) @CONFIG_CFAFLAGS@
 AM_CFLAGS = -g -Wall -Wno-unused-function -fPIC -pthread @ARCH_FLAGS@ @CONFIG_CFLAGS@
+# The built sources must not depend on the installed headers
+AM_CFAFLAGS = -quiet -cfalib -I$(srcdir)/stdhdr -I$(srcdir)/concurrency $(if $(findstring ${gdbwaittarget}, ${@}), -XCFA --gdb) @CONFIG_CFAFLAGS@
+AM_CFLAGS = -g -Wall -Wno-unused-function -fPIC -fexceptions -pthread @ARCH_FLAGS@ @CONFIG_CFLAGS@
 AM_CCASFLAGS = -g -Wall -Wno-unused-function @ARCH_FLAGS@ @CONFIG_CFLAGS@
 CFACC = @CFACC@
 …
 #----------------------------------------------------------------------------------------------------------------
 if BUILDLIB
+headers_nosrc = math.hfa gmp.hfa time_t.hfa bits/align.hfa bits/containers.hfa bits/defs.hfa bits/debug.hfa bits/locks.hfa
+headers = fstream.hfa iostream.hfa iterator.hfa limits.hfa rational.hfa time.hfa stdlib.hfa common.hfa \
+          containers/maybe.hfa containers/pair.hfa containers/result.hfa containers/vector.hfa
+libsrc = startup.cfa interpose.cfa bits/debug.cfa assert.cfa exception.c virtual.c heap.cfa ${headers:.hfa=.cfa}
+inst_headers_nosrc = \
+        bitmanip.hfa \
+        clock.hfa \
+        exception.hfa \
+        exception.h \
+        gmp.hfa \
+        math.hfa \
+        time_t.hfa \
+        bits/align.hfa \
+        bits/containers.hfa \
+        bits/debug.hfa \
+        bits/defs.hfa \
+        bits/locks.hfa \
+        bits/collection.hfa \
+        bits/stack.hfa \
+        bits/queue.hfa \
+        bits/sequence.hfa \
+        concurrency/iofwd.hfa \
+        containers/list.hfa \
+        containers/stackLockFree.hfa \
+        vec/vec.hfa \
+        vec/vec2.hfa \
+        vec/vec3.hfa \
+        vec/vec4.hfa
+inst_headers_src = \
+        common.hfa \
+        fstream.hfa \
+        heap.hfa \
+        iostream.hfa \
+        iterator.hfa \
+        limits.hfa \
+        memory.hfa \
+        parseargs.hfa \
+        rational.hfa \
+        stdlib.hfa \
+        time.hfa \
+        containers/maybe.hfa \
+        containers/pair.hfa \
+        containers/result.hfa \
+        containers/vector.hfa
+libsrc = ${inst_headers_src} ${inst_headers_src:.hfa=.cfa} \
+        assert.cfa \
+        bits/algorithm.hfa \
+        bits/debug.cfa \
+        exception.c \
+        interpose.cfa \
+        lsda.h \
+        startup.cfa \
+        startup.hfa \
+        virtual.c \
+        virtual.h
 # not all platforms support concurrency, add option to disable it
+thread_headers_nosrc = concurrency/invoke.h
+thread_headers = concurrency/coroutine.hfa concurrency/thread.hfa concurrency/kernel.hfa concurrency/monitor.hfa concurrency/mutex.hfa
+thread_libsrc = concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa concurrency/invoke.c concurrency/preemption.cfa ${thread_headers:.hfa=.cfa}
+inst_thread_headers_nosrc = \
+        bits/random.hfa \
+        concurrency/clib/cfathread.h \
+        concurrency/invoke.h \
+        concurrency/future.hfa \
+        concurrency/kernel/fwd.hfa
+inst_thread_headers_src = \
+        concurrency/coroutine.hfa \
+        concurrency/exception.hfa \
+        concurrency/kernel.hfa \
+        concurrency/locks.hfa \
+        concurrency/monitor.hfa \
+        concurrency/mutex.hfa \
+        concurrency/thread.hfa
+thread_libsrc = ${inst_thread_headers_src} ${inst_thread_headers_src:.hfa=.cfa} \
+        bits/signal.hfa \
+        concurrency/alarm.cfa \
+        concurrency/alarm.hfa \
+        concurrency/clib/cfathread.cfa \
+        concurrency/CtxSwitch-@ARCHITECTURE@.S \
+        concurrency/invoke.c \
+        concurrency/io.cfa \
+        concurrency/io/setup.cfa \
+        concurrency/io/types.hfa \
+        concurrency/io/call.cfa \
+        concurrency/iofwd.hfa \
+        concurrency/kernel_private.hfa \
+        concurrency/kernel/startup.cfa \
+        concurrency/preemption.cfa \
+        concurrency/preemption.hfa \
+        concurrency/ready_queue.cfa \
+        concurrency/ready_subqueue.hfa \
+        concurrency/snzi.hfa \
+        concurrency/stats.cfa \
+        concurrency/stats.hfa \
+        concurrency/stats.hfa
 else
 headers =
 thread_headers =
 headers_nosrc =
 thread_headers_nosrc =
+inst_headers_src =
+inst_thread_headers_src =
+inst_headers_nosrc =
+inst_thread_headers_nosrc =
 libsrc =
 endif
 …
 prelude.o : prelude.cfa extras.cf gcc-builtins.cf builtins.cf @LOCAL_CFACC@ @CFACPP@
         ${AM_V_GEN}$(CFACOMPILE) -quiet -XCFA -l ${<} -c -o ${@}
+        ${AM_V_GEN}$(CFACOMPILE) -quiet -XCFA,-l ${<} -c -o ${@}
 prelude.lo: prelude.cfa extras.cf gcc-builtins.cf builtins.cf @LOCAL_CFACC@ @CFACPP@
         ${AM_V_GEN}$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile \
+        $(CFACOMPILE) -quiet -XCFA -l ${<} -c -o ${@}
+#----------------------------------------------------------------------------------------------------------------
+libcfa_la_SOURCES = prelude.cfa ${libsrc}
+        $(CFACOMPILE) -quiet -XCFA,-l ${<} -c -o ${@}
+#----------------------------------------------------------------------------------------------------------------
+libcfa_la_SOURCES = ${libsrc}
+nodist_libcfa_la_SOURCES = prelude.cfa
 libcfa_la_LDFLAGS = -version-info @CFA_VERSION@
 …
 cfa_includedir = $(CFA_INCDIR)
+nobase_cfa_include_HEADERS = ${stdhdr} ${headers} ${headers_nosrc} ${thread_headers} ${thread_headers_nosrc}
+nobase_cfa_include_HEADERS = ${stdhdr} ${inst_headers_src} ${inst_headers_nosrc} ${inst_thread_headers_src} ${inst_thread_headers_nosrc}
+EXTRA_DIST = stdhdr
 #----------------------------------------------------------------------------------------------------------------
 maintainer-clean-local:
         -rm -rf ${CFA_INCDIR} ${CFA_LIBDIR}
+distclean-local:
+        find ${builddir} -path '*.Plo' -delete

libcfa/src/bits/containers.hfa

-              r3c64c668
+              r58fe85a
 #include "bits/align.hfa"
 #include "bits/defs.hfa"
+#include <stdio.h>
 //-----------------------------------------------------------------------------
 // Array
 …
         #define __small_array_t(T) __small_array(T)
 #else
         #define __small_array_t(T) struct __small_array
+        #define __small_array_t(T) __small_array
 #endif
 …
         static inline forall( dtype T | is_node(T) ) {
                 void ?{}( __queue(T) & this ) with( this ) {
                         head{ 1p };
                         tail{ &head };
                         verify(*tail == 1p);
+                        (this.head){ 1p };
+                        (this.tail){ &this.head };
+                        verify(*this.tail == 1p);
+                }
                 void append( __queue(T) & this, T * val ) with( this ) {
+                        verify(tail != 0p);
+                        verify(*tail == 1p);
+                        *tail = val;
+                        tail = &get_next( *val );
+                        *tail = 1p;
+                        verify(this.tail != 0p);
+                        verify(*this.tail == 1p);
+                        *this.tail = val;
+                        this.tail = &get_next( *val );
+                        *this.tail = 1p;
+                }
+                T * peek( __queue(T) & this ) {
+                        verify(*this.tail == 1p);
+                        T * front = this.head;
+                        if( front != 1p ) {
+                                verify(*this.tail == 1p);
+                                return front;
+                        }
+                        verify(*this.tail == 1p);
+                        return 0p;
+                }
                 T * pop_head( __queue(T) & this ) {
                         verify(*this.tail == 1p);
                         T * head = this.head;
                         if( head != 1p ) {
                                 this.head = get_next( *head );
                                 if( get_next( *head ) == 1p ) {
+                        T * _head = this.head;
+                        if( _head != 1p ) {
+                                this.head = get_next( *_head );
+                                if( get_next( *_head ) == 1p ) {
                                         this.tail = &this.head;
+                                }
                                 get_next( *head ) = 0p;
+                                get_next( *_head ) = 0p;
                                 verify(*this.tail == 1p);
+                                return head;
+                                verify( get_next(*_head) == 0p );
+                                return _head;
+                        }
                         verify(*this.tail == 1p);
 …
                         (*it) = get_next( *val );
                         if( tail == &get_next( *val ) ) {
                                 tail = it;
+                        if( this.tail == &get_next( *val ) ) {
+                                this.tail = it;
+                        }
                         get_next( *val ) = 0p;
                         verify( (head == 1p) == (&head == tail) );
                         verify( *tail == 1p );
+                        verify( (this.head == 1p) == (&this.head == this.tail) );
+                        verify( *this.tail == 1p );
                         return val;
+                }
                 int ?!=?( const __queue(T) & this, __attribute__((unused)) zero_t zero ) {
                         return this.head != 0;
+                        return this.head != 1p;
+                }
+        }
 …
         forall(dtype T )
         static inline [void] ?{}( __dllist(T) & this, * [T * & next, T * & prev] ( T & ) __get ) {
                 this.head{ 0p };
+                (this.head){ 0p };
                 this.__get = __get;
+        }
 …
                 void push_front( __dllist(T) & this, T & node ) with( this ) {
                         verify(__get);
                         if ( head ) {
                                 __get( node ).next = head;
                                 __get( node ).prev = __get( *head ).prev;
+                        if ( this.head ) {
+                                __get( node ).next = this.head;
+                                __get( node ).prev = __get( *this.head ).prev;
                                 // inserted node must be consistent before it is seen
                                 // prevent code movement across barrier
                                 asm( "" : : : "memory" );
                                 __get( *head ).prev = &node;
+                                __get( *this.head ).prev = &node;
                                 T & _prev = *__get( node ).prev;
                                 __get( _prev ).next = &node;
 …
                         // prevent code movement across barrier
                         asm( "" : : : "memory" );
                         head = &node;
+                        this.head = &node;
+                }
                 void remove( __dllist(T) & this, T & node ) with( this ) {
                         verify(__get);
                         if ( &node == head ) {
                                 if ( __get( *head ).next == head ) {
                                         head = 0p;
+                        if ( &node == this.head ) {
+                                if ( __get( *this.head ).next == this.head ) {
+                                        this.head = 0p;
                                 } else {
                                         head = __get( *head ).next;
+                                        this.head = __get( *this.head ).next;
+                                }
+                        }
 …
                         return this.head != 0;
+                }
+                void move_to_front( __dllist(T) & src, __dllist(T) & dst, T & node ) {
+                        remove    (src, node);
+                        push_front(dst, node);
+                }
+        }
         #undef next

libcfa/src/bits/debug.cfa

-              r3c64c668
+              r58fe85a
 // Created On       : Thu Mar 30 12:30:01 2017
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Tue Feb  4 13:03:16 2020
 // Update Count     : 11
+// Last Modified On : Wed Jun 17 11:07:13 2020
+// Update Count     : 12
 //
-extern "C" {
 #include <stdio.h>
 #include <stdlib.h>
 …
 #include <stdarg.h>
 #include <unistd.h>
+}
 enum { buffer_size = 4096 };

libcfa/src/bits/debug.hfa

-              r3c64c668
+              r58fe85a
 // Author           : Thierry Delisle
 // Created On       : Mon Nov 28 12:27:26 2016
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Tue Feb  4 12:29:21 2020
 // Update Count     : 9
+// Last Modified By : Andrew Beach
+// Last Modified On : Mon Apr 27 10:15:00 2020
+// Update Count     : 10
 //
 #pragma once
+#include <assert.h>
 #ifdef __CFA_DEBUG__
 …
         #define __cfaabi_dbg_ctx_param const char caller[]
         #define __cfaabi_dbg_ctx_param2 , const char caller[]
+        #define __cfaabi_dbg_ctx_fwd caller
+        #define __cfaabi_dbg_ctx_fwd2 , caller
 #else
         #define __cfaabi_dbg_debug_do(...)
 …
         #define __cfaabi_dbg_ctx_param
         #define __cfaabi_dbg_ctx_param2
+        #define __cfaabi_dbg_ctx_fwd
+        #define __cfaabi_dbg_ctx_fwd2
 #endif
 …
 #endif
         #include <stdarg.h>
-        #include <stdio.h>
         extern void __cfaabi_bits_write( int fd, const char buffer[], int len );
 …
         extern void __cfaabi_bits_print_vararg( int fd, const char fmt[], va_list arg );
         extern void __cfaabi_bits_print_buffer( int fd, char buffer[], int buffer_size, const char fmt[], ... ) __attribute__(( format(printf, 4, 5) ));
+#if defined(__CFA_DEBUG_PRINT__) \
+                || defined(__CFA_DEBUG_PRINT_IO__) || defined(__CFA_DEBUG_PRINT_IO_CORE__) \
+                || defined(__CFA_DEBUG_PRINT_MONITOR__) || defined(__CFA_DEBUG_PRINT_PREEMPTION__) \
+                || defined(__CFA_DEBUG_PRINT_RUNTIME_CORE__) || defined(__CFA_DEBUG_PRINT_EXCEPTION__) \
+                || defined(__CFA_DEBUG_PRINT_READY_QUEUE__)
+        #include <stdio.h>
+        #include <unistd.h>
+#endif
 #ifdef __cforall
+}
 #endif
+// Deprecated: Use the versions with the new module names.
 #ifdef __CFA_DEBUG_PRINT__
         #define __cfaabi_dbg_write( buffer, len )         __cfaabi_bits_write( STDERR_FILENO, buffer, len )
         #define __cfaabi_dbg_acquire()                    __cfaabi_bits_acquire()
         #define __cfaabi_dbg_release()                    __cfaabi_bits_release()
         #define __cfaabi_dbg_print_safe(...)              __cfaabi_bits_print_safe   (__VA_ARGS__)
         #define __cfaabi_dbg_print_nolock(...)            __cfaabi_bits_print_nolock (__VA_ARGS__)
         #define __cfaabi_dbg_print_buffer(...)            __cfaabi_bits_print_buffer (__VA_ARGS__)
         #define __cfaabi_dbg_print_buffer_decl(...)       char __dbg_text[256]; int __dbg_len = snprintf( __dbg_text, 256, __VA_ARGS__ ); __cfaabi_bits_write( __dbg_text, __dbg_len );
         #define __cfaabi_dbg_print_buffer_local(...)      __dbg_len = snprintf( __dbg_text, 256, __VA_ARGS__ ); __cfaabi_dbg_write( __dbg_text, __dbg_len );
+        #define __cfaabi_dbg_print_safe(...)              __cfaabi_bits_print_safe   ( STDERR_FILENO, __VA_ARGS__ )
+        #define __cfaabi_dbg_print_nolock(...)            __cfaabi_bits_print_nolock ( STDERR_FILENO, __VA_ARGS__ )
+        #define __cfaabi_dbg_print_buffer(...)            __cfaabi_bits_print_buffer ( STDERR_FILENO, __VA_ARGS__ )
+        #define __cfaabi_dbg_print_buffer_decl(...)       char __dbg_text[256]; int __dbg_len = snprintf( __dbg_text, 256, __VA_ARGS__ ); __cfaabi_bits_write( STDERR_FILENO, __dbg_text, __dbg_len );
+        // Re-uses the __dbg_text/__dbg_len locals declared by __cfaabi_dbg_print_buffer_decl and writes the
+        // formatted text to stderr. Calls __cfaabi_bits_write directly: __cfaabi_dbg_write( buffer, len )
+        // takes only two arguments and already supplies STDERR_FILENO itself.
+        #define __cfaabi_dbg_print_buffer_local(...)      __dbg_len = snprintf( __dbg_text, 256, __VA_ARGS__ ); __cfaabi_bits_write( STDERR_FILENO, __dbg_text, __dbg_len );
 #else
         #define __cfaabi_dbg_write(...)               ((void)0)
 …
 #endif
+// Debug print functions and statements:
+// Most are wrappers around the bits printing function but are not always used.
+// Whether they are used depends on whether the group (first argument) is active. The group must be
+// one of those defined below. The other arguments depend on the wrapped function.
+#define __cfadbg_write(group, buffer, len) \
+        __CFADBG_PRINT_GROUP_##group(__cfaabi_bits_write(STDERR_FILENO, buffer, len))
+#define __cfadbg_acquire(group) \
+        __CFADBG_PRINT_GROUP_##group(__cfaabi_bits_acquire())
+#define __cfadbg_release(group) \
+        __CFADBG_PRINT_GROUP_##group(__cfaabi_bits_release())
+#define __cfadbg_print_safe(group, ...) \
+        __CFADBG_PRINT_GROUP_##group(__cfaabi_bits_print_safe(STDERR_FILENO, __VA_ARGS__))
+#define __cfadbg_print_nolock(group, ...) \
+        __CFADBG_PRINT_GROUP_##group(__cfaabi_bits_print_nolock(STDERR_FILENO, __VA_ARGS__))
+#define __cfadbg_print_buffer(group, ...) \
+        __CFADBG_PRINT_GROUP_##group(__cfaabi_bits_print_buffer(STDERR_FILENO, __VA_ARGS__))
+#define __cfadbg_print_buffer_decl(group, ...) \
+        __CFADBG_PRINT_GROUP_##group(char __dbg_text[256]; int __dbg_len = snprintf( __dbg_text, 256, __VA_ARGS__ ); __cfaabi_bits_write( __dbg_text, __dbg_len ))
+#define __cfadbg_print_buffer_local(group, ...) \
+        __CFADBG_PRINT_GROUP_##group(__dbg_len = snprintf( __dbg_text, 256, __VA_ARGS__ ); __cfaabi_bits_write(STDERR_FILENO, __dbg_text, __dbg_len))
+// The debug print groups:
+#if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_IO__)
+#       define __CFADBG_PRINT_GROUP_io(...) __VA_ARGS__
+#else
+#       define __CFADBG_PRINT_GROUP_io(...) ((void)0)
+#endif
+#if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_IO__) || defined(__CFA_DEBUG_PRINT_IO_CORE__)
+#       define __CFADBG_PRINT_GROUP_io_core(...) __VA_ARGS__
+#else
+#       define __CFADBG_PRINT_GROUP_io_core(...) ((void)0)
+#endif
+#if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_MONITOR__)
+#       define __CFADBG_PRINT_GROUP_monitor(...) __VA_ARGS__
+#else
+#       define __CFADBG_PRINT_GROUP_monitor(...) ((void)0)
+#endif
+#if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_PREEMPTION__)
+#       define __CFADBG_PRINT_GROUP_preemption(...) __VA_ARGS__
+#else
+#       define __CFADBG_PRINT_GROUP_preemption(...) ((void)0)
+#endif
+#if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_RUNTIME_CORE__)
+#       define __CFADBG_PRINT_GROUP_runtime_core(...) __VA_ARGS__
+#else
+#       define __CFADBG_PRINT_GROUP_runtime_core(...) ((void)0)
+#endif
+#if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_READY_QUEUE__)
+#       define __CFADBG_PRINT_GROUP_ready_queue(...) __VA_ARGS__
+#else
+#       define __CFADBG_PRINT_GROUP_ready_queue(...) ((void)0)
+#endif
+#if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_EXCEPTION__)
+#       define __CFADBG_PRINT_GROUP_exception(...) __VA_ARGS__
+#else
+#       define __CFADBG_PRINT_GROUP_exception(...) ((void)0)
+#endif
 // Local Variables: //
 // mode: c //

libcfa/src/bits/defs.hfa

-              r3c64c668
+              r58fe85a
 // Created On       : Thu Nov  9 13:24:10 2017
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Tue Jan 28 22:38:27 2020
 // Update Count     : 9
+// Last Modified On : Sat Oct 24 10:53:15 2020
+// Update Count     : 21
 //
 #pragma once
-#include <stdbool.h>
-#include <stddef.h>
 #include <stdint.h>
+#include <assert.h>
 #define likely(x)   __builtin_expect(!!(x), 1)
 …
 #define __cfa_anonymous_object(x) inline struct x
 #else
 #define __cfa_anonymous_object(x) x __cfa_anonymous_object
+#define __cfa_anonymous_object(x) struct x __cfa_anonymous_object
 #endif
 …
 #endif
+static inline long long rdtscl(void) {
+    unsigned int lo, hi;
+    __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
+    return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
+static inline long long int rdtscl(void) {
+        #if defined( __i386 ) || defined( __x86_64 )
+        unsigned int lo, hi;
+        __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
+        return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
+        #elif defined( __aarch64__ ) || defined( __arm__ )
+        // https://github.com/google/benchmark/blob/v1.1.0/src/cycleclock.h#L116
+        long long int virtual_timer_value;
+        asm volatile("mrs %0, cntvct_el0" : "=r"(virtual_timer_value));
+        return virtual_timer_value;
+        #else
+                #error unsupported hardware architecture
+        #endif
+}

libcfa/src/bits/locks.hfa

-              r3c64c668
+              r58fe85a
 // Created On       : Tue Oct 31 15:14:38 2017
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Tue Feb  4 13:03:19 2020
 // Update Count     : 11
+// Last Modified On : Wed Aug 12 14:18:07 2020
+// Update Count     : 13
 //
 …
 // pause to prevent excess processor bus usage
+#if defined( __sparc )
+        #define Pause() __asm__ __volatile__ ( "rd %ccr,%g0" )
+#elif defined( __i386 ) || defined( __x86_64 )
+#if defined( __i386 ) || defined( __x86_64 )
         #define Pause() __asm__ __volatile__ ( "pause" : : : )
 #elif defined( __ARM_ARCH )
         #define Pause() __asm__ __volatile__ ( "nop" : : : )
+        #define Pause() __asm__ __volatile__ ( "YIELD" : : : )
 #else
         #error unsupported architecture
 …
                 #ifdef __CFA_DEBUG__
                         void __cfaabi_dbg_record(__spinlock_t & this, const char prev_name[]);
+                        void __cfaabi_dbg_record_lock(__spinlock_t & this, const char prev_name[]);
                 #else
                         #define __cfaabi_dbg_record(x, y)
+                        #define __cfaabi_dbg_record_lock(x, y)
                 #endif
+        }
 …
                 bool result = (this.lock == 0) && (__atomic_test_and_set( &this.lock, __ATOMIC_ACQUIRE ) == 0);
                 if( result ) {
                         __cfaabi_dbg_record( this, caller );
+                        __cfaabi_dbg_record_lock( this, caller );
                 } else {
                         enable_interrupts_noPoll();
 …
                         #endif
+                }
                 __cfaabi_dbg_record( this, caller );
+                __cfaabi_dbg_record_lock( this, caller );
+        }
 …
         #endif
+        extern "C" {
+                char * strerror(int);
+        }
+        #define CHECKED(x) { int err = x; if( err != 0 ) abort("KERNEL ERROR: Operation \"" #x "\" return error %d - %s\n", err, strerror(err)); }
         struct __bin_sem_t {
-                bool                    signaled;
                 pthread_mutex_t         lock;
                 pthread_cond_t          cond;
+                int                     val;
         };
         static inline void ?{}(__bin_sem_t & this) with( this ) {
+                signaled = false;
+                pthread_mutex_init(&lock, NULL);
+                pthread_cond_init (&cond, NULL);
+                // Create the mutex with error checking
+                pthread_mutexattr_t mattr;
+                pthread_mutexattr_init( &mattr );
+                pthread_mutexattr_settype( &mattr, PTHREAD_MUTEX_ERRORCHECK_NP);
+                pthread_mutex_init(&lock, &mattr);
+                pthread_cond_init (&cond, (const pthread_condattr_t *)0p);  // workaround trac#208: cast should not be required
+                val = 0;
+        }
         static inline void ^?{}(__bin_sem_t & this) with( this ) {
                 pthread_mutex_destroy(&lock);
                 pthread_cond_destroy (&cond);
+                CHECKED( pthread_mutex_destroy(&lock) );
+                CHECKED( pthread_cond_destroy (&cond) );
+        }
         static inline void wait(__bin_sem_t & this) with( this ) {
                 verify(__cfaabi_dbg_in_kernel());
                 pthread_mutex_lock(&lock);
                         if(!signaled) {   // this must be a loop, not if!
+                CHECKED( pthread_mutex_lock(&lock) );
+                        while(val < 1) {
                                 pthread_cond_wait(&cond, &lock);
+                        }
+                        signaled = false;
+                pthread_mutex_unlock(&lock);
+        }
+        static inline void post(__bin_sem_t & this) with( this ) {
+                verify(__cfaabi_dbg_in_kernel());
+                pthread_mutex_lock(&lock);
+                        bool needs_signal = !signaled;
+                        signaled = true;
+                pthread_mutex_unlock(&lock);
+                if (needs_signal)
+                        pthread_cond_signal(&cond);
+                        val -= 1;
+                CHECKED( pthread_mutex_unlock(&lock) );
+        }
+        static inline bool post(__bin_sem_t & this) with( this ) {
+                bool needs_signal = false;
+                CHECKED( pthread_mutex_lock(&lock) );
+                        if(val < 1) {
+                                val += 1;
+                                pthread_cond_signal(&cond);
+                                needs_signal = true;
+                        }
+                CHECKED( pthread_mutex_unlock(&lock) );
+                return needs_signal;
+        }
+        #undef CHECKED
+        struct $thread;
+        extern void park( void );
+        extern void unpark( struct $thread * this );
+        static inline struct $thread * active_thread ();
+        // Semaphore which only supports a single thread
+        struct single_sem {
+                struct $thread * volatile ptr;
+        };
+        static inline {
+                void  ?{}(single_sem & this) {
+                        this.ptr = 0p;
+                }
+                void ^?{}(single_sem &) {}
+                bool wait(single_sem & this) {
+                        for() {
+                                struct $thread * expected = this.ptr;
+                                if(expected == 1p) {
+                                        if(__atomic_compare_exchange_n(&this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+                                                return false;
+                                        }
+                                }
+                                else {
+                                        /* paranoid */ verify( expected == 0p );
+                                        if(__atomic_compare_exchange_n(&this.ptr, &expected, active_thread(), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+                                                park();
+                                                return true;
+                                        }
+                                }
+                        }
+                }
+                bool post(single_sem & this) {
+                        for() {
+                                struct $thread * expected = this.ptr;
+                                if(expected == 1p) return false;
+                                if(expected == 0p) {
+                                        if(__atomic_compare_exchange_n(&this.ptr, &expected, 1p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+                                                return false;
+                                        }
+                                }
+                                else {
+                                        if(__atomic_compare_exchange_n(&this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+                                                unpark( expected );
+                                                return true;
+                                        }
+                                }
+                        }
+                }
+        }
+        // Synchronization primitive which only supports a single thread and one post
+        // Similar to a binary semaphore with a 'one shot' semantic
+        // It is expected to be discarded after each party has called its side
+        struct oneshot {
+                // Internal state :
+                //     0p     : is initial state (wait will block)
+                //     1p     : fulfilled (wait won't block)
+                // any thread : a thread is currently waiting
+                struct $thread * volatile ptr;
+        };
+        static inline {
+                void  ?{}(oneshot & this) {
+                        this.ptr = 0p;
+                }
+                void ^?{}(oneshot &) {}
+                // Wait for the post, return immediately if it already happened.
+                // return true if the thread was parked
+                bool wait(oneshot & this) {
+                        for() {
+                                struct $thread * expected = this.ptr;
+                                if(expected == 1p) return false;
+                                /* paranoid */ verify( expected == 0p );
+                                if(__atomic_compare_exchange_n(&this.ptr, &expected, active_thread(), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+                                        park();
+                                        /* paranoid */ verify( this.ptr == 1p );
+                                        return true;
+                                }
+                        }
+                }
+                // Mark as fulfilled, wake thread if needed
+                // return true if a thread was unparked
+                bool post(oneshot & this) {
+                        struct $thread * got = __atomic_exchange_n( &this.ptr, 1p, __ATOMIC_SEQ_CST);
+                        if( got == 0p ) return false;
+                        unpark( got );
+                        return true;
+                }
+        }
+        // base types for future to build upon
+        // It is based on the 'oneshot' type to allow multiple futures
+        // to block on the same instance, permitting users to block a single
+        // thread on "any of" [a given set of] futures.
+        // does not support multiple threads waiting on the same future
+        struct future_t {
+                // Internal state :
+                //     0p      : is initial state (wait will block)
+                //     1p      : fulfilled (wait won't block)
+                //     2p      : in progress (fulfil is posting the wait context)
+                //     3p      : abandoned, server should delete
+                // any oneshot : a context has been setup to wait, a thread could wait on it
+                struct oneshot * volatile ptr;
+        };
+        static inline {
+                void  ?{}(future_t & this) {
+                        this.ptr = 0p;
+                }
+                void ^?{}(future_t &) {}
+                void reset(future_t & this) {
+                        // needs to be in 0p or 1p
+                        __atomic_exchange_n( &this.ptr, 0p, __ATOMIC_SEQ_CST);
+                }
+                // check if the future is available
+                bool available( future_t & this ) {
+                        return this.ptr == 1p;
+                }
+                // Prepare the future to be waited on
+                // intended to be used by wait, wait_any, waitfor, etc. rather than used directly
+                bool setup( future_t & this, oneshot & wait_ctx ) {
+                        /* paranoid */ verify( wait_ctx.ptr == 0p );
+                        // The future needs to set the wait context
+                        for() {
+                                struct oneshot * expected = this.ptr;
+                                // Is the future already fulfilled?
+                                if(expected == 1p) return false; // Yes, just return false (didn't block)
+                                // The future is not fulfilled, try to setup the wait context
+                                /* paranoid */ verify( expected == 0p );
+                                if(__atomic_compare_exchange_n(&this.ptr, &expected, &wait_ctx, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+                                        return true;
+                                }
+                        }
+                }
+                // Stop waiting on a future
+                // When multiple futures are waited for together in "any of" pattern
+                // futures that weren't fulfilled before the thread woke up
+                // should retract the wait ctx
+                // intended to be used by wait, wait_any, waitfor, etc. rather than used directly
+                void retract( future_t & this, oneshot & wait_ctx ) {
+                        // Remove the wait context
+                        struct oneshot * got = __atomic_exchange_n( &this.ptr, 0p, __ATOMIC_SEQ_CST);
+                        // got == 0p: future was never actually setup, just return
+                        if( got == 0p ) return;
+                        // got == wait_ctx: since fulfil does an atomic_swap,
+                        // if we got back the original then no one else saw context
+                        // It is safe to delete (which could happen after the return)
+                        if( got == &wait_ctx ) return;
+                        // got == 1p: the future is ready and the context was fully consumed
+                        // the server won't use the pointer again
+                        // It is safe to delete (which could happen after the return)
+                        if( got == 1p ) return;
+                        // got == 2p: the future is ready but the context hasn't fully been consumed
+                        // spin until it is safe to move on
+                        if( got == 2p ) {
+                                while( this.ptr != 1p ) Pause();
+                                return;
+                        }
+                        // got == anything else: something went wrong here, abort
+                        abort("Future in unexpected state");
+                }
+                // Mark the future as abandoned, meaning it will be deleted by the server
+                bool abandon( future_t & this ) {
+                        /* paranoid */ verify( this.ptr != 3p );
+                        // Mark the future as abandoned
+                        struct oneshot * got = __atomic_exchange_n( &this.ptr, 3p, __ATOMIC_SEQ_CST);
+                        // If the future isn't already fulfilled, let the server delete it
+                        if( got == 0p ) return false;
+                        // got == 2p: the future is ready but the context hasn't fully been consumed
+                        // spin until it is safe to move on
+                        if( got == 2p ) {
+                                while( this.ptr != 1p ) Pause();
+                                got = 1p;
+                        }
+                        // The future is completed delete it now
+                        /* paranoid */ verify( this.ptr != 1p );
+                        free( &this );
+                        return true;
+                }
+                // from the server side, mark the future as fulfilled
+                // delete it if needed
+                bool fulfil( future_t & this ) {
+                        for() {
+                                struct oneshot * expected = this.ptr;
+                                // was this abandoned?
+                                #if defined(__GNUC__) && __GNUC__ >= 7
+                                        #pragma GCC diagnostic push
+                                        #pragma GCC diagnostic ignored "-Wfree-nonheap-object"
+                                #endif
+                                        if( expected == 3p ) { free( &this ); return false; }
+                                #if defined(__GNUC__) && __GNUC__ >= 7
+                                        #pragma GCC diagnostic pop
+                                #endif
+                                /* paranoid */ verify( expected != 1p ); // Future is already fulfilled, should not happen
+                                /* paranoid */ verify( expected != 2p ); // Future is being fulfilled by someone else, this is even less supported than the previous case.
+                                // If there is a wait context, we need to consume it and mark it as consumed after
+                                // If there is no context then we can skip the in progress phase
+                                struct oneshot * want = expected == 0p ? 1p : 2p;
+                                if(__atomic_compare_exchange_n(&this.ptr, &expected, want, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+                                        if( expected == 0p ) { /* paranoid */ verify( this.ptr == 1p); return false; }
+                                        bool ret = post( *expected );
+                                        __atomic_store_n( &this.ptr, 1p, __ATOMIC_SEQ_CST);
+                                        return ret;
+                                }
+                        }
+                }
+                // Wait for the future to be fulfilled
+                bool wait( future_t & this ) {
+                        oneshot temp;
+                        if( !setup(this, temp) ) return false;
+                        // Wait context is setup, just wait on it
+                        bool ret = wait( temp );
+                        // Wait for the server to finish fulfilling the future (leave the in-progress state 2p)
+                        while( this.ptr == 2p ) Pause();
+                        // Make sure the state makes sense
+                        // Should be fulfilled, could be in progress but it's out of date if so
+                        // since if that is the case, the oneshot was fulfilled (unparking this thread)
+                        // and the oneshot should not be needed any more
+                        __attribute__((unused)) struct oneshot * was = this.ptr;
+                        /* paranoid */ verifyf( was == 1p, "Expected this.ptr to be 1p, was %p\n", was );
+                        // Mark the future as fulfilled, to be consistent
+                        // with potential calls to avail
+                        // this.ptr = 1p;
+                        return ret;
+                }
+        }
 #endif

libcfa/src/bits/signal.hfa

-              r3c64c668
+              r58fe85a
 #include "bits/defs.hfa"
-extern "C" {
 #include <errno.h>
 #define __USE_GNU
 …
 #include <stdlib.h>
 #include <string.h>
+}
 // Short hands for signal context information
 …
                         sig, handler, flags, errno, strerror( errno )
                 );
                 _exit( EXIT_FAILURE );
+                _Exit( EXIT_FAILURE );
         } // if
+}

libcfa/src/common.hfa

-              r3c64c668
+              r58fe85a
 // Created On       : Wed Jul 11 17:54:36 2018
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Thu Jul 12 08:02:18 2018
 // Update Count     : 5
+// Last Modified On : Sat Aug 15 08:51:29 2020
+// Update Count     : 14
 //
 …
 static inline {
+        char min( char t1, char t2 ) { return t1 < t2 ? t1 : t2; } // optimization
+        intptr_t min( intptr_t t1, intptr_t t2 ) { return t1 < t2 ? t1 : t2; } // optimization
+        uintptr_t min( uintptr_t t1, uintptr_t t2 ) { return t1 < t2 ? t1 : t2; } // optimization
         forall( otype T | { int ?<?( T, T ); } )
         T min( T t1, T t2 ) { return t1 < t2 ? t1 : t2; }
+        char max( char t1, char t2 ) { return t1 > t2 ? t1 : t2; } // optimization
+        intptr_t max( intptr_t t1, intptr_t t2 ) { return t1 > t2 ? t1 : t2; } // optimization
+        uintptr_t max( uintptr_t t1, uintptr_t t2 ) { return t1 > t2 ? t1 : t2; } // optimization
         forall( otype T | { int ?>?( T, T ); } )
         T max( T t1, T t2 ) { return t1 > t2 ? t1 : t2; }

libcfa/src/concurrency/CtxSwitch-i386.S

-              r3c64c668
+              r58fe85a
 // Created On       : Tue Dec 6 12:27:26 2016
 // Last Modified By : Peter A. Buhr
+// Last Modified On : Fri Jul 21 22:29:25 2017
+// Update Count     : 1
+//
+// This  library is free  software; you  can redistribute  it and/or  modify it
+// under the terms of the GNU Lesser General Public License as published by the
+// Free Software  Foundation; either  version 2.1 of  the License, or  (at your
+// option) any later version.
+//
+// This library is distributed in the  hope that it will be useful, but WITHOUT
+// ANY  WARRANTY;  without even  the  implied  warranty  of MERCHANTABILITY  or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
+// for more details.
+//
+// You should  have received a  copy of the  GNU Lesser General  Public License
+// along  with this library.
+// Last Modified On : Sun Sep  6 18:23:37 2020
+// Update Count     : 5
 //
+// This context switch routine depends on the fact that the stack of a new
+// thread has been set up to look like the thread has saved its context in
+// the normal manner.
+//
+// void CtxSwitch( machine_context *from, machine_context *to );
+// The context switch routine requires the initial stack of a thread to
+// look like the thread has saved its context in the normal manner.
+// Offsets in the context structure. This needs to be synchronized with the
+// high level code a little better.
+// Offsets must be synchronized with the __stack_context_t in invoke.h.
 #define PTR_BYTE        4
 #define SP_OFFSET       ( 0 * PTR_BYTE )
 #define FP_OFFSET       ( 1 * PTR_BYTE )
-#define PC_OFFSET       ( 2 * PTR_BYTE )
+// Context switch between coroutines/tasks.
+//   void __cfactx_switch( struct __stack_context_t * from, struct __stack_context_t * to ) ;
+// Arguments are passed on the stack: "from" at 4(%esp) on entry, "to" at 20(%esp) after three words are pushed
+        .file "CtxSwitch-i386.S"
         .text
         .align 2
         .globl __cfactx_switch
         .type  __cfactx_switch, @function
+        .global __cfactx_switch
+        .type __cfactx_switch, @function
 __cfactx_switch:
         // Copy the "from" context argument from the stack to register eax
         // Return address is at 0(%esp), with parameters following
+        // Return address is at 0(%esp), with parameters following.
         movl 4(%esp),%eax
 …
         movl %ebp,FP_OFFSET(%eax)
         // Copy the "to" context argument from the stack to register eax
         // Having pushed three words (= 12 bytes) on the stack, the
         // argument is now at 8 + 12 = 20(%esp)
+        // Copy the "to" context argument from the stack to register eax. Having
+        // pushed 3 words (= 12 bytes) on the stack, the argument is now at
+        // 8 + 12 = 20(%esp).
         movl 20(%esp),%eax
 …
         ret
         .size  __cfactx_switch, .-__cfactx_switch
+        .size __cfactx_switch, .-__cfactx_switch
 // Local Variables: //

libcfa/src/concurrency/CtxSwitch-x86_64.S

-              r3c64c668
+              r58fe85a
 // CtxSwitch-x86_64.S --
 //
 // Author           : Thierry Delisle
 // Created On       : Mon Nov 28 12:27:26 2016
+// Author           : Peter A. Buhr
+// Created On       : Mon Aug 10 08:10:26 2020
 // Last Modified By : Peter A. Buhr
+// Last Modified On : Fri Jul 21 22:28:11 2017
+// Update Count     : 1
+//
+// This  library is free  software; you  can redistribute  it and/or  modify it
+// under the terms of the GNU Lesser General Public License as published by the
+// Free Software  Foundation; either  version 2.1 of  the License, or  (at your
+// option) any later version.
+//
+// This library is distributed in the  hope that it will be useful, but WITHOUT
+// ANY  WARRANTY;  without even  the  implied  warranty  of MERCHANTABILITY  or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
+// for more details.
+//
+// You should  have received a  copy of the  GNU Lesser General  Public License
+// along  with this library.
+// Last Modified On : Sat Oct 24 14:36:25 2020
+// Update Count     : 10
 //
+// This context switch routine depends on the fact that the stack of a new
+// thread has been set up to look like the thread has saved its context in
+// the normal manner.
+//
+// void CtxSwitch( machine_context *from, machine_context *to );
+// The context switch routine requires the initial stack of a thread to
+// look like the thread has saved its context in the normal manner.
+// Offsets in the context structure. This needs to be synchronized with the
+// high level code a little better.
+// Offsets must be synchronized with the __stack_context_t in invoke.h.
 #define PTR_BYTE        8
 …
 #define FP_OFFSET       ( 1 * PTR_BYTE )
+//-----------------------------------------------------------------------------
+// Regular context switch routine which enables switching from one context to another
+// Context switch between coroutines/tasks.
+//   void __cfactx_switch( struct __stack_context_t * from, struct __stack_context_t * to ) ;
+// Arguments "from" in register rdi, "to" in register rsi.
+        .file "CtxSwitch-x86_64.S"
         .text
         .align 2
         .globl __cfactx_switch
         .type  __cfactx_switch, @function
+        .global __cfactx_switch
+        .type __cfactx_switch, @function
 __cfactx_switch:
 …
         ret
         .size  __cfactx_switch, .-__cfactx_switch
+        .size __cfactx_switch, .-__cfactx_switch
+//-----------------------------------------------------------------------------
+// Stub used to create new stacks which are ready to be context switched to
+// Stub to create new stacks which can be context switched to
+//   void __cfactx_invoke_stub( void );
         .text
         .align 2
         .globl __cfactx_invoke_stub
         .type    __cfactx_invoke_stub, @function
+        .global __cfactx_invoke_stub
+        .type __cfactx_invoke_stub, @function
 __cfactx_invoke_stub:
         movq %rbx, %rdi
+        movq %rbx, %rdi                                         // move main and this to first two arguments
         movq %r12, %rsi
         jmp *%r13
         .size  __cfactx_invoke_stub, .-__cfactx_invoke_stub
+        jmp *%r13                                                       // jmp to invoke
+        .size __cfactx_invoke_stub, .-__cfactx_invoke_stub
 // Local Variables: //
 // mode: c //
+// mode: asm //
 // tab-width: 4 //
 // End: //

libcfa/src/concurrency/alarm.cfa

-              r3c64c668
+              r58fe85a
 // Created On       : Fri Jun 2 11:31:25 2017
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Sun Jan  5 08:41:36 2020
 // Update Count     : 69
+// Last Modified On : Wed Jun 17 16:11:35 2020
+// Update Count     : 75
 //
 #define __cforall_thread__
-extern "C" {
 #include <errno.h>
 #include <stdio.h>
+#include <unistd.h>
 #include <string.h>
-#include <unistd.h>
 #include <sys/time.h>
+}
 #include "alarm.hfa"
 #include "kernel_private.hfa"
+#include "kernel/fwd.hfa"
 #include "preemption.hfa"
 …
 //=============================================================================================
 void ?{}( alarm_node_t & this, $thread * thrd, Time alarm, Duration period ) with( this ) {
+void ?{}( alarm_node_t & this, $thread * thrd, Time alarm, Duration period) with( this ) {
         this.thrd = thrd;
         this.alarm = alarm;
         this.period = period;
-        next = 0;
         set = false;
         kernel_alarm = false;
+        type = User;
+}
 void ?{}( alarm_node_t & this, processor   * proc, Time alarm, Duration period ) with( this ) {
+void ?{}( alarm_node_t & this, processor * proc, Time alarm, Duration period ) with( this ) {
         this.proc = proc;
         this.alarm = alarm;
         this.period = period;
-        next = 0;
         set = false;
+        kernel_alarm = true;
+        type = Kernel;
+}
+void ?{}( alarm_node_t & this, Alarm_Callback callback, Time alarm, Duration period ) with( this ) {
+        this.alarm = alarm;
+        this.period = period;
+        this.callback = callback;
+        set = false;
+        type = Callback;
+}
 …
+}
+#if !defined(NDEBUG) && (defined(__CFA_DEBUG__) || defined(__CFA_VERIFY__))
+bool validate( alarm_list_t * this ) {
+        alarm_node_t ** it = &this->head;
+        while( (*it) ) {
+                it = &(*it)->next;
+void insert( alarm_list_t * this, alarm_node_t * n ) {
+        alarm_node_t * it = & (*this)`first;
+        while( it && (n->alarm > it->alarm) ) {
+                it = & (*it)`next;
+        }
+        if ( it ) {
+                insert_before( *it, *n );
+        } else {
+                insert_last(*this, *n);
+        }
+        return it == this->tail;
+}
+#endif
+static inline void insert_at( alarm_list_t * this, alarm_node_t * n, __alarm_it_t p ) {
+        verify( !n->next );
+        if( p == this->tail ) {
+                this->tail = &n->next;
+        }
+        else {
+                n->next = *p;
+        }
+        *p = n;
+        verify( validate( this ) );
+}
+void insert( alarm_list_t * this, alarm_node_t * n ) {
+        alarm_node_t ** it = &this->head;
+        while( (*it) && (n->alarm > (*it)->alarm) ) {
+                it = &(*it)->next;
+        }
+        insert_at( this, n, it );
+        verify( validate( this ) );
+        verify( validate( *this ) );
+}
 alarm_node_t * pop( alarm_list_t * this ) {
+        alarm_node_t * head = this->head;
+        verify( validate( *this ) );
+        alarm_node_t * head = & (*this)`first;
         if( head ) {
+                this->head = head->next;
+                if( !head->next ) {
+                        this->tail = &this->head;
+                }
+                head->next = 0p;
+                remove(*head);
+        }
         verify( validate( this ) );
+        verify( validate( *this ) );
         return head;
+}
-static inline void remove_at( alarm_list_t * this, alarm_node_t * n, __alarm_it_t it ) {
-        verify( it );
-        verify( (*it) == n );
-        (*it) = n->next;
-        if( !n-> next ) {
-                this->tail = it;
+        }
-        n->next = 0p;
-        verify( validate( this ) );
+}
-static inline void remove( alarm_list_t * this, alarm_node_t * n ) {
-        alarm_node_t ** it = &this->head;
-        while( (*it) && (*it) != n ) {
-                it = &(*it)->next;
+        }
-        verify( validate( this ) );
-        if( *it ) { remove_at( this, n, it ); }
-        verify( validate( this ) );
+}
 void register_self( alarm_node_t * this ) {
         alarm_list_t * alarms = &event_kernel->alarms;
+        alarm_list_t & alarms = event_kernel->alarms;
         disable_interrupts();
 …
+        {
                 verify( validate( alarms ) );
                 bool first = !alarms->head;
+                bool first = ! & alarms`first;
                 insert( alarms, this );
+                insert( &alarms, this );
                 if( first ) {
                         __kernel_set_timer( alarms->head->alarm - __kernel_get_time() );
+                        __kernel_set_timer( alarms`first.alarm - __kernel_get_time() );
+                }
+        }
 …
         lock( event_kernel->lock __cfaabi_dbg_ctx2 );
+        {
                 verify( validate( &event_kernel->alarms ) );
                 remove( &event_kernel->alarms, this );
+                verify( validate( event_kernel->alarms ) );
+                remove( *this );
+        }
         unlock( event_kernel->lock );
 …
+}
+//=============================================================================================
+// Utilities
+//=============================================================================================
+void sleep( Duration duration ) {
+        alarm_node_t node = { active_thread(), __kernel_get_time() + duration, 0`s };
+        register_self( &node );
+        park();
+        /* paranoid */ verify( !node.set );
+        /* paranoid */ verify( & node`next == 0p );
+        /* paranoid */ verify( & node`prev == 0p );
+}
 // Local Variables: //
 // mode: c //

libcfa/src/concurrency/alarm.hfa

-              r3c64c668
+              r58fe85a
 #include "time.hfa"
+#include "containers/list.hfa"
 struct $thread;
 struct processor;
 …
 //=============================================================================================
+enum alarm_type{ Kernel = 0, User = 1, Callback = 2 };
+struct alarm_node_t;
+typedef void (*Alarm_Callback)(alarm_node_t & );
 struct alarm_node_t {
         Time alarm;                             // time when alarm goes off
         Duration period;                        // if > 0 => period of alarm
+        alarm_node_t * next;            // intrusive link list field
+        DLISTED_MGD_IMPL_IN(alarm_node_t)
         union {
+                $thread * thrd; // thrd who created event
+                processor * proc;               // proc who created event
+                $thread * thrd;                                 // thrd who created event
+                processor * proc;                               // proc who created event
+                Alarm_Callback callback;                // callback to handle event
         };
         bool set                :1;             // whether or not the alarm has been registered
         bool kernel_alarm       :1;             // true if this is not a user defined alarm
+        enum alarm_type type;           // kind of alarm: Kernel, User or Callback
 };
+typedef alarm_node_t ** __alarm_it_t;
+DLISTED_MGD_IMPL_OUT(alarm_node_t)
 void ?{}( alarm_node_t & this, $thread * thrd, Time alarm, Duration period );
 void ?{}( alarm_node_t & this, processor   * proc, Time alarm, Duration period );
+void ?{}( alarm_node_t & this, Alarm_Callback callback, Time alarm, Duration period );
 void ^?{}( alarm_node_t & this );
+struct alarm_list_t {
+        alarm_node_t * head;
+        __alarm_it_t tail;
+};
+static inline void ?{}( alarm_list_t & this ) with( this ) {
+        head = 0;
+        tail = &head;
+}
+typedef dlist(alarm_node_t, alarm_node_t) alarm_list_t;
 void insert( alarm_list_t * this, alarm_node_t * n );

libcfa/src/concurrency/coroutine.cfa

-              r3c64c668
+              r58fe85a
 // Created On       : Mon Nov 28 12:27:26 2016
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Tue Feb  4 12:29:25 2020
 // Update Count     : 16
+// Last Modified On : Tue Dec 15 12:06:04 2020
+// Update Count     : 23
 //
 …
 #include "coroutine.hfa"
-extern "C" {
 #include <stddef.h>
 #include <malloc.h>
 …
 #include <string.h>
 #include <unistd.h>
+// use this define to make unwind.h play nice, definitely a hack
+#define HIDE_EXPORTS
+#include <sys/mman.h>                                                                   // mprotect
 #include <unwind.h>
-#undef HIDE_EXPORTS
-#include <sys/mman.h>
+}
 #include "kernel_private.hfa"
+#include "exception.hfa"
+#include "math.hfa"
+#define CFA_COROUTINE_USE_MMAP 0
 #define __CFA_INVOKE_PRIVATE__
 …
 //-----------------------------------------------------------------------------
+FORALL_DATA_INSTANCE(CoroutineCancelled, (dtype coroutine_t), (coroutine_t))
+forall(dtype T)
+void mark_exception(CoroutineCancelled(T) *) {}
+forall(dtype T)
+void copy(CoroutineCancelled(T) * dst, CoroutineCancelled(T) * src) {
+        dst->virtual_table = src->virtual_table;
+        dst->the_coroutine = src->the_coroutine;
+        dst->the_exception = src->the_exception;
+}
+forall(dtype T)
+const char * msg(CoroutineCancelled(T) *) {
+        return "CoroutineCancelled(...)";
+}
+// This code should not be inlined. It is the error path on resume.
+forall(dtype T | is_coroutine(T))
+void __cfaehm_cancelled_coroutine( T & cor, $coroutine * desc ) {
+        verify( desc->cancellation );
+        desc->state = Cancelled;
+        exception_t * except = __cfaehm_cancellation_exception( desc->cancellation );
+        // TODO: Remove explicit vtable set once trac#186 is fixed.
+        CoroutineCancelled(T) except;
+        except.virtual_table = &get_exception_vtable(&except);
+        except.the_coroutine = &cor;
+        except.the_exception = except;
+        throwResume except;
+        except->virtual_table->free( except );
+        free( desc->cancellation );
+        desc->cancellation = 0p;
+}
+//-----------------------------------------------------------------------------
 // Global state variables
 // minimum feasible stack size in bytes
+#define MinStackSize 1000
+static const size_t MinStackSize = 1000;
 extern size_t __page_size;                              // architecture pagesize HACK, should go in proper runtime singleton
+extern int __map_prot;
 void __stack_prepare( __stack_info_t * this, size_t create_size );
+void __stack_clean  ( __stack_info_t * this );
 //-----------------------------------------------------------------------------
 …
         bool userStack = ((intptr_t)this.storage & 0x1) != 0;
         if ( ! userStack && this.storage ) {
+                __attribute__((may_alias)) intptr_t * istorage = (intptr_t *)&this.storage;
+                *istorage &= (intptr_t)-1;
+                void * storage = this.storage->limit;
+                __cfaabi_dbg_debug_do(
+                        storage = (char*)(storage) - __page_size;
+                        if ( mprotect( storage, __page_size, PROT_READ | PROT_WRITE ) == -1 ) {
+                                abort( "(coStack_t *)%p.^?{}() : internal error, mprotect failure, error(%d) %s.", &this, errno, strerror( errno ) );
+                        }
+                );
+                __cfaabi_dbg_print_safe("Kernel : Deleting stack %p\n", storage);
+                free( storage );
+                __stack_clean( &this );
+        }
+}
 …
 void ^?{}($coroutine& this) {
         if(this.state != Halted && this.state != Start && this.state != Primed) {
                 $coroutine * src = TL_GET( this_thread )->curr_cor;
+                $coroutine * src = active_coroutine();
                 $coroutine * dst = &this;
 …
         assert(__page_size != 0l);
         size_t size = libCeiling( storageSize, 16 ) + stack_data_size;
+        size = ceiling(size, __page_size);
         // If we are running debug, we also need to allocate a guardpage to catch stack overflows.
         void * storage;
+        __cfaabi_dbg_debug_do(
+                storage = memalign( __page_size, size + __page_size );
+        );
+        __cfaabi_dbg_no_debug_do(
+                storage = (void*)malloc(size);
+        );
+        #if CFA_COROUTINE_USE_MMAP
+                storage = mmap(0p, size + __page_size, PROT_EXEC | PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
+                if(storage == ((void*)-1)) {
+                        abort( "coroutine stack creation : internal error, mmap failure, error(%d) %s.", errno, strerror( errno ) );
+                }
+                if ( mprotect( storage, __page_size, PROT_NONE ) == -1 ) {
+                        abort( "coroutine stack creation : internal error, mprotect failure, error(%d) %s.", errno, strerror( errno ) );
+                } // if
+                storage = (void *)(((intptr_t)storage) + __page_size);
+        #else
+                __cfaabi_dbg_debug_do(
+                        storage = memalign( __page_size, size + __page_size );
+                );
+                __cfaabi_dbg_no_debug_do(
+                        storage = (void*)malloc(size);
+                );
+                __cfaabi_dbg_debug_do(
+                        if ( mprotect( storage, __page_size, PROT_NONE ) == -1 ) {
+                                abort( "__stack_alloc : internal error, mprotect failure, error(%d) %s.", (int)errno, strerror( (int)errno ) );
+                        }
+                        storage = (void *)(((intptr_t)storage) + __page_size);
+                );
+        #endif
         __cfaabi_dbg_print_safe("Kernel : Created stack %p of size %zu\n", storage, size);
-        __cfaabi_dbg_debug_do(
-                if ( mprotect( storage, __page_size, PROT_NONE ) == -1 ) {
-                        abort( "__stack_alloc : internal error, mprotect failure, error(%d) %s.", (int)errno, strerror( (int)errno ) );
+                }
-                storage = (void *)(((intptr_t)storage) + __page_size);
-        );
         verify( ((intptr_t)storage & (libAlign() - 1)) == 0ul );
         return [storage, size];
+}
+void __stack_clean  ( __stack_info_t * this ) {
+        size_t size = ((intptr_t)this->storage->base) - ((intptr_t)this->storage->limit) + sizeof(__stack_t);
+        void * storage = this->storage->limit;
+        #if CFA_COROUTINE_USE_MMAP
+                storage = (void *)(((intptr_t)storage) - __page_size);
+                if(munmap(storage, size + __page_size) == -1) {
+                        abort( "coroutine stack destruction : internal error, munmap failure, error(%d) %s.", errno, strerror( errno ) );
+                }
+        #else
+                __cfaabi_dbg_debug_do(
+                        storage = (char*)(storage) - __page_size;
+                        if ( mprotect( storage, __page_size, __map_prot ) == -1 ) {
+                                abort( "(coStack_t *)%p.^?{}() : internal error, mprotect failure, error(%d) %s.", &this, errno, strerror( errno ) );
+                        }
+                );
+                free( storage );
+        #endif
+        __cfaabi_dbg_print_safe("Kernel : Deleting stack %p\n", storage);
+}
 …
                 size = libFloor(create_size - stack_data_size - diff, libAlign());
         } // if
         assertf( size >= MinStackSize, "Stack size %zd provides less than minimum of %d bytes for a stack.", size, MinStackSize );
         this->storage = (__stack_t *)((intptr_t)storage + size);
+        assertf( size >= MinStackSize, "Stack size %zd provides less than minimum of %zd bytes for a stack.", size, MinStackSize );
+        this->storage = (__stack_t *)((intptr_t)storage + size - sizeof(__stack_t));
         this->storage->limit = storage;
+        this->storage->base  = (void*)((intptr_t)storage + size);
+        this->storage->base  = (void*)((intptr_t)storage + size - sizeof(__stack_t));
+        this->storage->exception_context.top_resume = 0p;
+        this->storage->exception_context.current_exception = 0p;
         __attribute__((may_alias)) intptr_t * istorage = (intptr_t*)&this->storage;
         *istorage |= userStack ? 0x1 : 0x0;
 …
         struct $coroutine * __cfactx_cor_finish(void) {
                 struct $coroutine * cor = kernelTLS.this_thread->curr_cor;
+                struct $coroutine * cor = active_coroutine();
                 if(cor->state == Primed) {
                         suspend();
+                        __cfactx_suspend();
+                }

libcfa/src/concurrency/coroutine.hfa

-              r3c64c668
+              r58fe85a
 #include <assert.h>
 #include "invoke.h"
+#include "../exception.hfa"
+//-----------------------------------------------------------------------------
+// Exception thrown from resume when a coroutine stack is cancelled.
+FORALL_DATA_EXCEPTION(CoroutineCancelled, (dtype coroutine_t), (coroutine_t)) (
+        coroutine_t * the_coroutine;
+        exception_t * the_exception;
+);
+forall(dtype T)
+void copy(CoroutineCancelled(T) * dst, CoroutineCancelled(T) * src);
+forall(dtype T)
+const char * msg(CoroutineCancelled(T) *);
 //-----------------------------------------------------------------------------
 …
 // Anything that implements this trait can be resumed.
 // Anything that is resumed is a coroutine.
 trait is_coroutine(dtype T) {
       void main(T & this);
       $coroutine * get_coroutine(T & this);
+trait is_coroutine(dtype T | IS_RESUMPTION_EXCEPTION(CoroutineCancelled, (T))) {
+        void main(T & this);
+        $coroutine * get_coroutine(T & this);
 };
 …
 //-----------------------------------------------------------------------------
 // Public coroutine API
-static inline void suspend(void);
-forall(dtype T | is_coroutine(T))
-static inline T & resume(T & cor);
 forall(dtype T | is_coroutine(T))
 void prime(T & cor);
 static inline struct $coroutine * active_coroutine() { return TL_GET( this_thread )->curr_cor; }
+static inline struct $coroutine * active_coroutine() { return active_thread()->curr_cor; }
 //-----------------------------------------------------------------------------
 …
 static inline void $ctx_switch( $coroutine * src, $coroutine * dst ) __attribute__((nonnull (1, 2))) {
         // set state of current coroutine to inactive
         src->state = src->state == Halted ? Halted : Inactive;
+        src->state = src->state == Halted ? Halted : Blocked;
         // set new coroutine that task is executing
         TL_GET( this_thread )->curr_cor = dst;
+        active_thread()->curr_cor = dst;
         // context switch to specified coroutine
 …
+}
+extern void __stack_prepare   ( __stack_info_t * this, size_t size /* ignored if storage already allocated */);
+extern void __stack_prepare( __stack_info_t * this, size_t size /* ignored if storage already allocated */);
+extern void __stack_clean  ( __stack_info_t * this );
 // Suspend implementation inlined for performance
+static inline void suspend(void) {
+        // optimization : read TLS once and reuse it
+        // Safety note: this is preemption safe since if
+        // preemption occurs after this line, the pointer
+        // will also migrate which means this value will
+        // stay in sync with the TLS
+        $coroutine * src = TL_GET( this_thread )->curr_cor;
+extern "C" {
+        static inline void __cfactx_suspend(void) {
+                // optimization : read TLS once and reuse it
+                // Safety note: this is preemption safe since if
+                // preemption occurs after this line, the pointer
+                // will also migrate which means this value will
+                // stay in sync with the TLS
+                $coroutine * src = active_coroutine();
         assertf( src->last != 0,
                 "Attempt to suspend coroutine \"%.256s\" (%p) that has never been resumed.\n"
                 "Possible cause is a suspend executed in a member called by a coroutine user rather than by the coroutine main.",
                 src->name, src );
         assertf( src->last->state != Halted,
                 "Attempt by coroutine \"%.256s\" (%p) to suspend back to terminated coroutine \"%.256s\" (%p).\n"
                 "Possible cause is terminated coroutine's main routine has already returned.",
                 src->name, src, src->last->name, src->last );
+                assertf( src->last != 0,
+                        "Attempt to suspend coroutine \"%.256s\" (%p) that has never been resumed.\n"
+                        "Possible cause is a suspend executed in a member called by a coroutine user rather than by the coroutine main.",
+                        src->name, src );
+                assertf( src->last->state != Halted,
+                        "Attempt by coroutine \"%.256s\" (%p) to suspend back to terminated coroutine \"%.256s\" (%p).\n"
+                        "Possible cause is terminated coroutine's main routine has already returned.",
+                        src->name, src, src->last->name, src->last );
+        $ctx_switch( src, src->last );
+                $ctx_switch( src, src->last );
+        }
+}
+forall(dtype T | is_coroutine(T))
+void __cfaehm_cancelled_coroutine( T & cor, $coroutine * desc );
 // Resume implementation inlined for performance
 …
         // will also migrate which means this value will
         // stay in sync with the TLS
         $coroutine * src = TL_GET( this_thread )->curr_cor;
+        $coroutine * src = active_coroutine();
         $coroutine * dst = get_coroutine(cor);
         if( unlikely(dst->context.SP == 0p) ) {
-                TL_GET( this_thread )->curr_cor = dst;
                 __stack_prepare(&dst->stack, 65000);
                 __cfactx_start(main, dst, cor, __cfactx_invoke_coroutine);
-                TL_GET( this_thread )->curr_cor = src;
+        }
 …
         // always done for performance testing
         $ctx_switch( src, dst );
+        if ( unlikely(dst->cancellation) ) {
+                __cfaehm_cancelled_coroutine( cor, dst );
+        }
         return cor;
 …
         // will also migrate which means this value will
         // stay in sync with the TLS
         $coroutine * src = TL_GET( this_thread )->curr_cor;
+        $coroutine * src = active_coroutine();
         // not resuming self ?

libcfa/src/concurrency/invoke.c

-              r3c64c668
+              r58fe85a
 // Created On       : Tue Jan 17 12:27:26 2016
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Fri Feb  9 16:37:42 2018
 // Update Count     : 5
+// Last Modified On : Sat Oct 24 14:35:28 2020
+// Update Count     : 32
 //
 …
         struct FakeStack {
             void *fixedRegisters[3];              // fixed registers ebx, edi, esi (popped on 1st uSwitch, values unimportant)
             void *rturn;                          // where to go on return from uSwitch
             void *dummyReturn;                    // fake return compiler would have pushed on call to uInvoke
             void *argument[3];                    // for 16-byte ABI, 16-byte alignment starts here
             void *padding;                        // padding to force 16-byte alignment, as "base" is 16-byte aligned
+            void *fixedRegisters[3];                                            // fixed registers ebx, edi, esi (popped on 1st uSwitch, values unimportant)
+            void *rturn;                                                                        // where to go on return from uSwitch
+            void *dummyReturn;                                                          // fake return compiler would have pushed on call to uInvoke
+            void *argument[3];                                                          // for 16-byte ABI, 16-byte alignment starts here
+            void *padding;                                                                      // padding to force 16-byte alignment, as "base" is 16-byte aligned
         };
 …
         fs->dummyReturn = NULL;
         fs->argument[0] = main;     // argument to invoke
         fs->argument[1] = this;     // argument to invoke
+        fs->argument[0] = main;                                                         // argument to invoke
+        fs->argument[1] = this;                                                         // argument to invoke
         fs->rturn = invoke;
 …
         struct FakeStack {
                 void *fixedRegisters[5];            // fixed registers rbx, r12, r13, r14, r15
                 void *rturn;                        // where to go on return from uSwitch
                 void *dummyReturn;                  // NULL return address to provide proper alignment
+                void *fixedRegisters[5];                                                // fixed registers rbx, r12, r13, r14, r15
+                void *rturn;                                                                    // where to go on return from uSwitch
+                void *dummyReturn;                                                              // NULL return address to provide proper alignment
         };
         cor->context.SP = (char *)stack->base - sizeof( struct FakeStack );
         cor->context.FP = NULL;         // terminate stack with NULL fp
+        cor->context.FP = NULL;                                                         // terminate stack with NULL fp
         struct FakeStack *fs = (struct FakeStack *)cor->context.SP;
 …
         fs->dummyReturn = NULL;
         fs->rturn = __cfactx_invoke_stub;
         fs->fixedRegisters[0] = main;
         fs->fixedRegisters[1] = this;
+        fs->fixedRegisters[0] = main;                                           // argument to invoke
+        fs->fixedRegisters[1] = this;                                           // argument to invoke
         fs->fixedRegisters[2] = invoke;
+#elif defined( __ARM_ARCH )
+#error ARM needs to be upgrade to use to parameters like X86/X64 (A.K.A. : I broke this and do not know how to fix it)
+#elif defined( __ARM_ARCH_32 )
+#error ARM needs to be upgrade to use two parameters like X86/X64 (A.K.A. : I broke this and do not know how to fix it)
+        // More details about the error:
+        // To avoid the thunk problem, I changed the invoke routine to pass the main explicitly
+        // instead of relying on an assertion. This effectively hoists any required thunk one level
+        // which was enough to get to global scope in most cases.
+        // This means that __cfactx_invoke_... now takes two parameters and the FakeStack needs
+        // to be adjusted as a consequence of that.
+        // I don't know how to do that for ARM, hence the #error
         struct FakeStack {
                 float fpRegs[16];                       // floating point registers
                 void *intRegs[9];                       // integer/pointer registers
                 void *arg[2];                           // placeholder for this pointer
+                float fpRegs[16];                                                               // floating point registers
+                void * intRegs[9];                                                              // integer/pointer registers
+                void * arg[2];                                                                  // placeholder for this pointer
         };
 …
         fs->arg[1] = invoke;
+#elif defined( __ARM_ARCH )
+        struct FakeStack {
+                void * intRegs[12];                                                             // x19-x30 integer registers
+                double fpRegs[8];                                                               // v8-v15 floating point
+        };
+        cor->context.SP = (char *)stack->base - sizeof( struct FakeStack );
+        cor->context.FP = NULL;
+        struct FakeStack *fs = (struct FakeStack *)cor->context.SP;
+        fs->intRegs[0] = main;                                                          // argument to invoke x19 => x0
+        fs->intRegs[1] = this;                                                          // argument to invoke x20 => x1
+        fs->intRegs[2] = invoke;
+        fs->intRegs[11] = __cfactx_invoke_stub;                         // link register x30 => ret moves to pc
 #else
         #error uknown hardware architecture

libcfa/src/concurrency/invoke.h

-              r3c64c668
+              r58fe85a
 #include "bits/defs.hfa"
 #include "bits/locks.hfa"
+#include "kernel/fwd.hfa"
 #ifdef __cforall
 …
 #define _INVOKE_H_
+// TL_GET / TL_SET: read / write one member of the thread-local kernelTLS structure.
+// On ARM the access is bracketed by disable_global_interrupts() / enable_global_interrupts();
+// on every other architecture it is a plain member access.
+#ifdef __ARM_ARCH
+        // function prototypes are only really used by these macros on ARM
+        void disable_global_interrupts();
+        void enable_global_interrupts();
+        // GNU statement expression: evaluates to the value of kernelTLS.member read while interrupts are disabled
+        #define TL_GET( member ) ( { __typeof__( kernelTLS.member ) target; \
+                disable_global_interrupts(); \
+                target = kernelTLS.member; \
+                enable_global_interrupts(); \
+                target; } )
+        // NOTE(review): expands to three separate statements and already ends in ';', so it is not
+        // safe as the sole body of an unbraced if/else — confirm every use is a full statement
+        #define TL_SET( member, value ) disable_global_interrupts(); \
+                kernelTLS.member = value; \
+                enable_global_interrupts();
+#else
+        #define TL_GET( member ) kernelTLS.member
+        // the expansion carries its own trailing ';'
+        #define TL_SET( member, value ) kernelTLS.member = value;
+#endif
+        // Declaration of the per-kernel-thread (thread_local) runtime state, kernelTLS.
+        // Only visible to Cforall translation units; the object is 128-byte aligned and
+        // uses the initial-exec TLS model.
+        #ifdef __cforall
+        extern "Cforall" {
+                extern __attribute__((aligned(128))) thread_local struct KernelThreadData {
+                        // user-level thread and processor associated with this kernel thread
+                        struct $thread    * volatile this_thread;
+                        struct processor      * volatile this_processor;
+                        // preemption bookkeeping
+                        // NOTE(review): presumably disable_count is a nesting counter for "disable" calls — confirm
+                        struct {
+                                volatile unsigned short disable_count;
+                                volatile bool enabled;
+                                volatile bool in_progress;
+                        } preemption_state;
+                        // seed for the per-kernel-thread random number generator
+                        uint32_t rand_seed;
+                } kernelTLS __attribute__ ((tls_model ( "initial-exec" )));
+        }
+        #endif
+        // Exception-handling state, stored per stack (it is embedded in __stack_context_t
+        // as the exception_context member).  The two pointed-to types are only forward
+        // declared here, so this header does not depend on their definitions.
+        struct __cfaehm_try_resume_node;
+        struct __cfaehm_base_exception_t;
+        struct exception_context_t {
+                // NOTE(review): presumably the head of the chain of active resumption handlers — confirm
+                struct __cfaehm_try_resume_node * top_resume;
+                // NOTE(review): presumably the exception currently being handled, if any — confirm
+                struct __cfaehm_base_exception_t * current_exception;
+        };
         struct __stack_context_t {
 …
                 // base of stack
                 void * base;
+                // Information for exception handling.
+                struct exception_context_t exception_context;
         };
 …
         };
+        enum coroutine_state { Halted, Start, Primed, Inactive, Active, Rerun };
+        enum __Preemption_Reason { __NO_PREEMPTION, __ALARM_PREEMPTION, __POLL_PREEMPTION, __MANUAL_PREEMPTION };
+        enum __Coroutine_State { Halted, Start, Primed, Blocked, Ready, Active, Cancelled, Halting };
         struct $coroutine {
 …
                 // current execution status for coroutine
                 enum coroutine_state state;
+                enum __Coroutine_State state;
                 // first coroutine to resume this one
 …
         };
+        // Wrapper for gdb
+        struct cfathread_coroutine_t { struct $coroutine debug; };
+        // Returns the stack descriptor of a coroutine.  The least-significant bit of
+        // stack.storage is cleared before the cast, so the field can carry a 1-bit tag
+        // alongside the pointer (the main-thread coroutine constructor in kernel.cfa sets that bit).
+        // NOTE(review): the meaning of the tag bit is not visible here — confirm
+        static inline struct __stack_t * __get_stack( struct $coroutine * cor ) {
+                return (struct __stack_t*)(((uintptr_t)cor->stack.storage) & ((uintptr_t)-2));
+        }
         // struct which calls the monitor is accepting
 …
                 struct __condition_node_t * dtor_node;
         };
+        // Wrapper for gdb
+        struct cfathread_monitor_t { struct $monitor debug; };
         struct __monitor_group_t {
 …
                 // last function that acquired monitors
                 fptr_t func;
+        };
+        // Link lists fields
+        // intrusive link field for threads (embedded in $thread as the member "link")
+        struct __thread_desc_link {
+                // neighbouring threads in the list
+                struct $thread * next;
+                struct $thread * prev;
+                // NOTE(review): presumably a timestamp used by the scheduler — confirm
+                volatile unsigned long long ts;
+                // NOTE(review): role of "preferred" is not visible here — confirm
+                int preferred;
         };
 …
                 // current execution status for coroutine
+                volatile int state;
+                enum __Preemption_Reason preempted;
+                // Possible values are:
+                //    - TICKET_BLOCKED (-1) thread is blocked
+                //    - TICKET_RUNNING ( 0) thread is running
+                //    - TICKET_UNBLOCK ( 1) thread should ignore next block
+                volatile int ticket;
+                enum __Coroutine_State state:8;
+                enum __Preemption_Reason preempted:8;
                 //SKULLDUGGERY errno is not saved in the thread data structure because returnToKernel appears to be the only function to require saving and restoring it
+                // pointer to the cluster on which the thread is running
+                struct cluster * curr_cluster;
+                // Link lists fields
+                // intrusive link field for threads
+                struct __thread_desc_link link;
                 // coroutine body used to store context
 …
                 struct $monitor *  self_mon_p;
-                // pointer to the cluster on which the thread is running
-                struct cluster * curr_cluster;
                 // monitors currently held by this thread
                 struct __monitor_group_t monitors;
+                // Link lists fields
+                // intrusive link field for threads
+                struct $thread * next;
+                // used to put threads on user data structures
+                struct {
+                        struct $thread * next;
+                        struct $thread * back;
+                } seqable;
                 struct {
 …
                         struct $thread * prev;
                 } node;
+        };
+                #if defined( __CFA_WITH_VERIFY__ )
+                        void * canary;
+                #endif
+        };
+        // Wrapper for gdb
+        struct cfathread_thread_t { struct $thread debug; };
+        #ifdef __CFA_DEBUG__
+                void __cfaabi_dbg_record_thrd($thread & this, bool park, const char prev_name[]);
+        #else
+                #define __cfaabi_dbg_record_thrd(x, y, z)
+        #endif
         #ifdef __cforall
         extern "Cforall" {
                 static inline $thread *& get_next( $thread & this ) __attribute__((const)) {
                         return this.next;
+                        return this.link.next;
+                }
                 static inline [$thread *&, $thread *& ] __get( $thread & this ) __attribute__((const)) {
                         return this.node.[next, prev];
+                }
+                // Accessor for the "back" link of the user-data-structure (seqable) fields of a thread;
+                // returns a reference so the caller can both read and assign the link
+                static inline $thread *& Back( $thread * this ) __attribute__((const)) {
+                        return this->seqable.back;
+                }
+                // Accessor for the "next" link of the user-data-structure (seqable) fields of a thread;
+                // returns a reference so the caller can both read and assign the link
+                static inline $thread *& Next( $thread * this ) __attribute__((const)) {
+                        return this->seqable.next;
+                }
+                // True when the thread's seqable.next link is non-null
+                // NOTE(review): presumably this means the thread is currently on a user data structure — confirm
+                static inline bool listed( $thread * this ) {
+                        return this->seqable.next != 0p;
+                }

libcfa/src/concurrency/kernel.cfa

-              r3c64c668
+              r58fe85a
 // Created On       : Tue Jan 17 12:27:26 2017
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Tue Feb  4 13:03:15 2020
 // Update Count     : 58
+// Last Modified On : Mon Aug 31 07:08:20 2020
+// Update Count     : 71
 //
 #define __cforall_thread__
+// #define __CFA_DEBUG_PRINT_RUNTIME_CORE__
 //C Includes
-#include <stddef.h>
 #include <errno.h>
-#include <string.h>
-extern "C" {
 #include <stdio.h>
-#include <fenv.h>
-#include <sys/resource.h>
 #include <signal.h>
 #include <unistd.h>
-#include <limits.h>                                                                             // PTHREAD_STACK_MIN
-#include <sys/mman.h>                                                                   // mprotect
+}
 //CFA Includes
-#include "time.hfa"
 #include "kernel_private.hfa"
 #include "preemption.hfa"
-#include "startup.hfa"
 //Private includes
 …
 #include "invoke.h"
 //-----------------------------------------------------------------------------
 // Some assembly required
 #if defined( __i386 )
-        #define CtxGet( ctx )        \
-                __asm__ volatile (     \
-                        "movl %%esp,%0\n"\
-                        "movl %%ebp,%1\n"\
-                        : "=rm" (ctx.SP),\
-                                "=rm" (ctx.FP) \
+                )
         // mxcr : SSE Status and Control bits (control bits are preserved across function calls)
         // fcw  : X87 FPU control word (preserved across function calls)
 …
 #elif defined( __x86_64 )
-        #define CtxGet( ctx )        \
-                __asm__ volatile (     \
-                        "movq %%rsp,%0\n"\
-                        "movq %%rbp,%1\n"\
-                        : "=rm" (ctx.SP),\
-                                "=rm" (ctx.FP) \
+                )
         #define __x87_store         \
                 uint32_t __mxcr;      \
 …
+                )
+#elif defined( __ARM_ARCH )
+// Capture the current stack pointer into ctx.SP and register r11 into ctx.FP
+// NOTE(review): r11 is presumably the frame pointer under the targeted ARM ABI — confirm
+#define CtxGet( ctx ) __asm__ ( \
+                "mov %0,%%sp\n"   \
+                "mov %1,%%r11\n"   \
+        : "=rm" (ctx.SP), "=rm" (ctx.FP) )
+#elif defined( __arm__ )
+        #define __x87_store
+        #define __x87_load
+#elif defined( __aarch64__ )
+        // Save / restore the AArch64 floating-point control (FPCR) and status (FPSR) registers.
+        // __x87_store declares the local buffer __fpcntl and fills it; __x87_load must be
+        // expanded later in the same scope to write the saved values back.  The "x87" names
+        // are kept so the macros match the x86 variants defined above.
+        // MRS/MSR require 64-bit general registers, but the pair is stored and loaded
+        // through the 32-bit views w9/w10 so that exactly sizeof(__fpcntl) == 8 bytes are
+        // accessed (a store of x9/x10 would write 16 bytes and overrun the buffer).
+        #define __x87_store              \
+                uint32_t __fpcntl[2];    \
+                __asm__ volatile (    \
+                        "mrs x9, FPCR\n" \
+                        "mrs x10, FPSR\n"  \
+                        "stp w9, w10, %0\n"  \
+                        : "=m" (__fpcntl) : : "x9", "x10" \
+                )
+        // __fpcntl is only read here, so it is passed as an input operand;
+        // loading into w9/w10 zero-extends into x9/x10 before the MSR writes
+        #define __x87_load         \
+                __asm__ volatile (    \
+                        "ldp w9, w10, %0\n"  \
+                        "msr FPSR, x10\n"  \
+                        "msr FPCR, x9\n" \
+                : : "m" (__fpcntl) : "x9", "x10" \
+                )
 #else
         #error unknown hardware architecture
+        #error unsupported hardware architecture
 #endif
+extern $thread * mainThread;
+extern processor * mainProcessor;
 //-----------------------------------------------------------------------------
+//Start and stop routine for the kernel, declared first to make sure they run first
+static void __kernel_startup (void) __attribute__(( constructor( STARTUP_PRIORITY_KERNEL ) ));
+static void __kernel_shutdown(void) __attribute__(( destructor ( STARTUP_PRIORITY_KERNEL ) ));
+//-----------------------------------------------------------------------------
+// Kernel storage
+KERNEL_STORAGE(cluster,         mainCluster);
+KERNEL_STORAGE(processor,       mainProcessor);
+KERNEL_STORAGE($thread, mainThread);
+KERNEL_STORAGE(__stack_t,       mainThreadCtx);
+cluster     * mainCluster;
+processor   * mainProcessor;
+$thread * mainThread;
+extern "C" {
+        struct { __dllist_t(cluster) list; __spinlock_t lock; } __cfa_dbg_global_clusters;
+}
+size_t __page_size = 0;
+//-----------------------------------------------------------------------------
+// Global state
+thread_local struct KernelThreadData kernelTLS __attribute__ ((tls_model ( "initial-exec" ))) = {
+        NULL,                                                                                           // cannot use 0p
+        NULL,
+        { 1, false, false },
+u //this should be seeded better but due to a bug calling rdtsc doesn't work
+};
+//-----------------------------------------------------------------------------
+// Struct to steal stack
+// Describes an already existing stack so that a coroutine/thread can be constructed on
+// top of it (see the $coroutine and $thread constructors taking a current_stack_info_t *).
+struct current_stack_info_t {
+        __stack_t * storage;                                                            // pointer to stack object
+        void * base;                                                                            // base of stack
+        void * limit;                                                                           // stack grows towards stack limit
+        void * context;                                                                         // address of cfa_context_t
+};
+// Constructor: describe the stack the caller is currently running on.
+// base  = current frame pointer (captured with CtxGet)
+// limit = base minus the soft RLIMIT_STACK limit
+// NOTE(review): the "storage" field is not assigned here — confirm the caller fills it in
+void ?{}( current_stack_info_t & this ) {
+        __stack_context_t ctx;
+        CtxGet( ctx );
+        this.base = ctx.FP;
+        rlimit r;
+        // NOTE(review): return value is not checked, and rlim_cur may be RLIM_INFINITY,
+        // in which case the computed limit is meaningless — confirm this is acceptable
+        getrlimit( RLIMIT_STACK, &r);
+        size_t size = r.rlim_cur;
+        this.limit = (void *)(((intptr_t)this.base) - size);
+        this.context = &storage_mainThreadCtx;
+}
+//-----------------------------------------------------------------------------
+// Main thread construction
+// Construct a coroutine on top of the existing stack described by info, instead of on a newly allocated one.
+void ?{}( $coroutine & this, current_stack_info_t * info) with( this ) {
+        // adopt the caller-provided stack object and record its bounds in it
+        stack.storage = info->storage;
+        with(*stack.storage) {
+                limit     = info->limit;
+                base      = info->base;
+        }
+        // set the low bit of the stored storage pointer as a tag (__get_stack masks it off again)
+        // NOTE(review): presumably marks the stack as not owned by the coroutine — confirm
+        __attribute__((may_alias)) intptr_t * istorage = (intptr_t*) &stack.storage;
+        *istorage |= 0x1;
+        name = "Main Thread";
+        state = Start;
+        starter = 0p;
+        last = 0p;
+        cancellation = 0p;
+}
+// Construct a thread whose coroutine runs on the existing stack described by info.
+// The thread is placed on mainCluster and registered with it.
+void ?{}( $thread & this, current_stack_info_t * info) with( this ) {
+        state = Start;
+        // build the thread's own coroutine on the provided stack and make it current
+        self_cor{ info };
+        curr_cor = &self_cor;
+        curr_cluster = mainCluster;
+        // the thread's own monitor starts out owned by the thread itself, with recursion depth 1
+        self_mon.owner = &this;
+        self_mon.recursion = 1;
+        self_mon_p = &self_mon;
+        // not linked into any list yet
+        next = 0p;
+        node.next = 0p;
+        node.prev = 0p;
+        doregister(curr_cluster, this);
+        // monitor group initially holds only the thread's own monitor, with no associated function
+        monitors{ &self_mon_p, 1, (fptr_t)0 };
+}
+//-----------------------------------------------------------------------------
+// Processor coroutine
+// Default constructor: performs no initialization
+void ?{}(processorCtx_t & this) {
+}
+// Construct the processor context of non-main processors
+// Builds the embedded coroutine on the existing stack described by info and records the owning processor
+static void ?{}(processorCtx_t & this, processor * proc, current_stack_info_t * info) {
+        (this.__cor){ info };
+        this.proc = proc;
+}
+static void * __invoke_processor(void * arg);
+// Construct a processor attached to cluster cltr and start the kernel thread that runs it.
+void ?{}(processor & this, const char name[], cluster & cltr) with( this ) {
+        this.name = name;
+        this.cltr = &cltr;
+        // "terminated" starts at 0: the destructor's P( terminated ) waits until the processor main does V
+        terminated{ 0 };
+        destroyer = 0p;
+        do_terminate = false;
+        preemption_alarm = 0p;
+        pending_preemption = false;
+        runner.proc = &this;
+        idleLock{};
+        __cfaabi_dbg_print_safe("Kernel : Starting core %p\n", &this);
+        // start the kernel thread at __invoke_processor with this processor as its argument;
+        // the returned pointer is kept in this.stack and freed in the destructor
+        this.stack = __create_pthread( &this.kernel_thread, __invoke_processor, (void *)&this );
+        __cfaabi_dbg_print_safe("Kernel : core %p started\n", &this);
+}
+// Destroy a processor: request termination if it has not been requested yet, wait for the
+// processor to finish, then join its kernel thread and release the stack pointer stored by the constructor.
+void ^?{}(processor & this) with( this ){
+        if( ! __atomic_load_n(&do_terminate, __ATOMIC_ACQUIRE) ) {
+                __cfaabi_dbg_print_safe("Kernel : core %p signaling termination\n", &this);
+                // raise the termination flag, wake the processor, then wait on "terminated"
+                // (the processor main performs V( terminated ) when it leaves its loop)
+                __atomic_store_n(&do_terminate, true, __ATOMIC_RELAXED);
+                wake( &this );
+                P( terminated );
+                // the destructor must not be running on the processor being destroyed
+                verify( kernelTLS.this_processor != &this);
+        }
+        pthread_join( kernel_thread, 0p );
+        free( this.stack );
+}
+// Construct a cluster: record its name and preemption rate, default-construct the ready
+// queue and its lock, construct the procs / idles / threads lists, and register the cluster.
+void ?{}(cluster & this, const char name[], Duration preemption_rate) with( this ) {
+        this.name = name;
+        this.preemption_rate = preemption_rate;
+        ready_queue{};
+        ready_queue_lock{};
+        // each list is constructed with __get
+        // NOTE(review): presumably the accessor for the elements' intrusive links — confirm
+        procs{ __get };
+        idles{ __get };
+        threads{ __get };
+        doregister(this);
+}
+// Destroy a cluster: undo the registration performed by the constructor's doregister(this)
+void ^?{}(cluster & this) {
+        unregister(this);
+}
+// Kernel Scheduling logic
+static $thread * __next_thread(cluster * this);
+static $thread * __next_thread_slow(cluster * this);
+static void __run_thread(processor * this, $thread * dst);
+static void __wake_one(cluster * cltr);
+static void push  (__cluster_idles & idles, processor & proc);
+static void remove(__cluster_idles & idles, processor & proc);
+static [unsigned idle, unsigned total, * processor] query( & __cluster_idles idles );
 //=============================================================================================
 // Kernel Scheduling logic
 //=============================================================================================
-static $thread * __next_thread(cluster * this);
-static void __run_thread(processor * this, $thread * dst);
-static void __halt(processor * this);
 //Main of the processor contexts
 void main(processorCtx_t & runner) {
         // Because of a bug, we couldn't initialize the seed on construction
         // Do it here
+        kernelTLS.rand_seed ^= rdtscl();
+        __cfaabi_tls.rand_seed ^= rdtscl();
+        __cfaabi_tls.ready_rng.fwd_seed = 25214903917_l64u * (rdtscl() ^ (uintptr_t)&runner);
+        __tls_rand_advance_bck();
         processor * this = runner.proc;
         verify(this);
+        __cfaabi_dbg_print_safe("Kernel : core %p starting\n", this);
+        doregister(this->cltr, this);
+        __cfadbg_print_safe(runtime_core, "Kernel : core %p starting\n", this);
+        #if !defined(__CFA_NO_STATISTICS__)
+                if( this->print_halts ) {
+                        __cfaabi_bits_print_safe( STDOUT_FILENO, "Processor : %d - %s (%p)\n", this->id, this->name, (void*)this);
+                }
+        #endif
+        {
 …
                 preemption_scope scope = { this };
                 __cfaabi_dbg_print_safe("Kernel : core %p started\n", this);
+                __cfadbg_print_safe(runtime_core, "Kernel : core %p started\n", this);
                 $thread * readyThread = 0p;
+                for( unsigned int spin_count = 0; ! __atomic_load_n(&this->do_terminate, __ATOMIC_SEQ_CST); spin_count++ ) {
+                MAIN_LOOP:
+                for() {
+                        // Try to get the next thread
                         readyThread = __next_thread( this->cltr );
+                        if(readyThread) {
+                                /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+                                /* paranoid */ verifyf( readyThread->state == Inactive || readyThread->state == Start || readyThread->preempted != __NO_PREEMPTION, "state : %d, preempted %d\n", readyThread->state, readyThread->preempted);
+                                /* paranoid */ verifyf( readyThread->next == 0p, "Expected null got %p", readyThread->next );
+                                __run_thread(this, readyThread);
+                                /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+                                spin_count = 0;
+                        } else {
+                                // spin(this, &spin_count);
+                                __halt(this);
+                        if( !readyThread ) {
+                                readyThread = __next_thread_slow( this->cltr );
+                        }
+                }
+                __cfaabi_dbg_print_safe("Kernel : core %p stopping\n", this);
+        }
+        unregister(this->cltr, this);
+                        HALT:
+                        if( !readyThread ) {
+                                // Don't block if we are done
+                                if( __atomic_load_n(&this->do_terminate, __ATOMIC_SEQ_CST) ) break MAIN_LOOP;
+                                #if !defined(__CFA_NO_STATISTICS__)
+                                        __tls_stats()->ready.sleep.halts++;
+                                #endif
+                                // Push self to idle stack
+                                push(this->cltr->idles, * this);
+                                // Confirm the ready-queue is empty
+                                readyThread = __next_thread_slow( this->cltr );
+                                if( readyThread ) {
+                                        // A thread was found, cancel the halt
+                                        remove(this->cltr->idles, * this);
+                                        #if !defined(__CFA_NO_STATISTICS__)
+                                                __tls_stats()->ready.sleep.cancels++;
+                                        #endif
+                                        // continue the main loop
+                                        break HALT;
+                                }
+                                #if !defined(__CFA_NO_STATISTICS__)
+                                        if(this->print_halts) {
+                                                __cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 0\n", this->id, rdtscl());
+                                        }
+                                #endif
+                                wait( this->idle );
+                                #if !defined(__CFA_NO_STATISTICS__)
+                                        if(this->print_halts) {
+                                                __cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 1\n", this->id, rdtscl());
+                                        }
+                                #endif
+                                // We were woken up, remove self from idle
+                                remove(this->cltr->idles, * this);
+                                // DON'T just proceed, start looking again
+                                continue MAIN_LOOP;
+                        }
+                        /* paranoid */ verify( readyThread );
+                        // We found a thread run it
+                        __run_thread(this, readyThread);
+                        // Are we done?
+                        if( __atomic_load_n(&this->do_terminate, __ATOMIC_SEQ_CST) ) break MAIN_LOOP;
+                }
+                __cfadbg_print_safe(runtime_core, "Kernel : core %p stopping\n", this);
+        }
         V( this->terminated );
+        __cfaabi_dbg_print_safe("Kernel : core %p terminated\n", this);
+        if(this == mainProcessor) {
+                // HACK : the coroutine context switch expects this_thread to be set
+                // and it makes sense for it to be set in all other cases except here
+                // fake it
+                __cfaabi_tls.this_thread = mainThread;
+        }
+        __cfadbg_print_safe(runtime_core, "Kernel : core %p terminated\n", this);
+}
 …
 // from the processor coroutine to the target thread
 static void __run_thread(processor * this, $thread * thrd_dst) {
+        /* paranoid */ verify( ! __preemption_enabled() );
+        /* paranoid */ verifyf( thrd_dst->state == Ready || thrd_dst->preempted != __NO_PREEMPTION, "state : %d, preempted %d\n", thrd_dst->state, thrd_dst->preempted);
+        /* paranoid */ verifyf( thrd_dst->link.next == 0p, "Expected null got %p", thrd_dst->link.next );
+        __builtin_prefetch( thrd_dst->context.SP );
         $coroutine * proc_cor = get_coroutine(this->runner);
-        // Update global state
-        kernelTLS.this_thread = thrd_dst;
         // set state of processor coroutine to inactive
         verify(proc_cor->state == Active);
         proc_cor->state = Inactive;
+        proc_cor->state = Blocked;
         // Actually run the thread
         RUNNING:  while(true) {
+                if(unlikely(thrd_dst->preempted)) {
+                        thrd_dst->preempted = __NO_PREEMPTION;
+                        verify(thrd_dst->state == Active || thrd_dst->state == Rerun);
+                } else {
+                        verify(thrd_dst->state == Start || thrd_dst->state == Primed || thrd_dst->state == Inactive);
+                        thrd_dst->state = Active;
+                }
+                /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+                thrd_dst->preempted = __NO_PREEMPTION;
+                thrd_dst->state = Active;
+                // Update global state
+                kernelTLS().this_thread = thrd_dst;
+                /* paranoid */ verify( ! __preemption_enabled() );
+                /* paranoid */ verify( kernelTLS().this_thread == thrd_dst );
+                /* paranoid */ verify( thrd_dst->curr_cluster == this->cltr );
+                /* paranoid */ verify( thrd_dst->context.SP );
+                /* paranoid */ verify( thrd_dst->state != Halted );
+                /* paranoid */ verifyf( ((uintptr_t)thrd_dst->context.SP) < ((uintptr_t)__get_stack(thrd_dst->curr_cor)->base ) || thrd_dst->curr_cor == proc_cor, "ERROR : Destination $thread %p has been corrupted.\n StackPointer too small.\n", thrd_dst ); // add escape condition if we are setting up the processor
+                /* paranoid */ verifyf( ((uintptr_t)thrd_dst->context.SP) > ((uintptr_t)__get_stack(thrd_dst->curr_cor)->limit) || thrd_dst->curr_cor == proc_cor, "ERROR : Destination $thread %p has been corrupted.\n StackPointer too large.\n", thrd_dst ); // add escape condition if we are setting up the processor
+                /* paranoid */ verify( 0x0D15EA5E0D15EA5Ep == thrd_dst->canary );
                 // set context switch to the thread that the processor is executing
-                verify( thrd_dst->context.SP );
                 __cfactx_switch( &proc_cor->context, &thrd_dst->context );
                 // when __cfactx_switch returns we are back in the processor coroutine
+                /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+                /* paranoid */ verify( 0x0D15EA5E0D15EA5Ep == thrd_dst->canary );
+                /* paranoid */ verifyf( ((uintptr_t)thrd_dst->context.SP) > ((uintptr_t)__get_stack(thrd_dst->curr_cor)->limit), "ERROR : Destination $thread %p has been corrupted.\n StackPointer too large.\n", thrd_dst );
+                /* paranoid */ verifyf( ((uintptr_t)thrd_dst->context.SP) < ((uintptr_t)__get_stack(thrd_dst->curr_cor)->base ), "ERROR : Destination $thread %p has been corrupted.\n StackPointer too small.\n", thrd_dst );
+                /* paranoid */ verify( thrd_dst->context.SP );
+                /* paranoid */ verify( thrd_dst->curr_cluster == this->cltr );
+                /* paranoid */ verify( kernelTLS().this_thread == thrd_dst );
+                /* paranoid */ verify( ! __preemption_enabled() );
+                // Reset global state
+                kernelTLS().this_thread = 0p;
                 // We just finished running a thread, there are a few things that could have happened.
                 // 1 - Regular case : the thread has blocked and no one has scheduled it yet.
                 // 2 - Racy case    : the thread has blocked but someone has already tried to schedule it.
-                // 3 - Polite Racy case : the thread has blocked, someone has already tried to schedule it, but the thread is nice and wants to go through the ready-queue any way
                 // 4 - Preempted
                 // In case 1, we may have won a race so we can't write to the state again.
                 // In case 2, we lost the race so we now own the thread.
-                // In case 3, we lost the race but can just reschedule the thread.
                 if(unlikely(thrd_dst->preempted != __NO_PREEMPTION)) {
 …
+                }
+                if(unlikely(thrd_dst->state == Halting)) {
+                        // The thread has halted, it should never be scheduled/run again
+                        // finish the thread
+                        __thread_finish( thrd_dst );
+                        break RUNNING;
+                }
+                /* paranoid */ verify( thrd_dst->state == Active );
+                thrd_dst->state = Blocked;
                 // set state of processor coroutine to active and the thread to inactive
+                static_assert(sizeof(thrd_dst->state) == sizeof(int));
+                enum coroutine_state old_state = __atomic_exchange_n(&thrd_dst->state, Inactive, __ATOMIC_SEQ_CST);
+                switch(old_state) {
+                        case Halted:
+                                // The thread has halted, it should never be scheduled/run again, leave it back to Halted and move on
+                                thrd_dst->state = Halted;
+                                // We may need to wake someone up here since
+                                unpark( this->destroyer );
+                                this->destroyer = 0p;
+                                break RUNNING;
+                        case Active:
+                int old_ticket = __atomic_fetch_sub(&thrd_dst->ticket, 1, __ATOMIC_SEQ_CST);
+                switch(old_ticket) {
+                        case TICKET_RUNNING:
                                 // This is case 1, the regular case, nothing more is needed
                                 break RUNNING;
                         case Rerun:
+                        case TICKET_UNBLOCK:
                                 // This is case 2, the racy case, someone tried to run this thread before it finished blocking
                                 // In this case, just run it again.
 …
                         default:
                                 // This makes no sense, something is wrong abort
                                 abort("Finished running a thread that was Inactive/Start/Primed %d\n", old_state);
+                                abort();
+                }
+        }
 …
         // Just before returning to the processor, set the processor coroutine to active
         proc_cor->state = Active;
+        /* paranoid */ verify( ! __preemption_enabled() );
+}
 // KERNEL_ONLY
 void returnToKernel() {
+        /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+        $coroutine * proc_cor = get_coroutine(kernelTLS.this_processor->runner);
+        $thread * thrd_src = kernelTLS.this_thread;
+        /* paranoid */ verify( ! __preemption_enabled() );
+        $coroutine * proc_cor = get_coroutine(kernelTLS().this_processor->runner);
+        $thread * thrd_src = kernelTLS().this_thread;
+        #if !defined(__CFA_NO_STATISTICS__)
+                struct processor * last_proc = kernelTLS().this_processor;
+        #endif
         // Run the thread on this processor
 …
                         __x87_store;
                 #endif
+                verify( proc_cor->context.SP );
+                /* paranoid */ verify( proc_cor->context.SP );
+                /* paranoid */ verify( 0x0D15EA5E0D15EA5Ep == thrd_src->canary );
                 __cfactx_switch( &thrd_src->context, &proc_cor->context );
+                /* paranoid */ verify( 0x0D15EA5E0D15EA5Ep == thrd_src->canary );
                 #if defined( __i386 ) || defined( __x86_64 )
                         __x87_load;
 …
+        }
+        /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+}
+// KERNEL_ONLY
+// Context invoker for processors
+// This is the entry point for processors (kernel threads); arg is cast to the processor to run and 0p is always returned
+// It effectively constructs a coroutine by stealing the pthread stack
+static void * __invoke_processor(void * arg) {
+        processor * proc = (processor *) arg;
+        kernelTLS.this_processor = proc;
+        kernelTLS.this_thread    = 0p;
+        kernelTLS.preemption_state.[enabled, disable_count] = [false, 1]; // preemption starts disabled, with a disable count of 1
+        // SKULLDUGGERY: We want to create a context for the processor coroutine
+        // which is needed for the 2-step context switch. However, there is no reason
+        // to waste the perfectly valid stack created by pthread.
+        current_stack_info_t info;
+        __stack_t ctx;
+        info.storage = &ctx;
+        (proc->runner){ proc, &info }; // construct the runner coroutine in place, handing it the stack info built above
+        __cfaabi_dbg_print_safe("Coroutine : created stack %p\n", get_coroutine(proc->runner)->stack.storage);
+        //Set global state
+        kernelTLS.this_thread = 0p; // NOTE(review): this_thread was already set to 0p above; this second store looks redundant
+        //We now have a proper context from which to schedule threads
+        __cfaabi_dbg_print_safe("Kernel : core %p created (%p, %p)\n", proc, &proc->runner, &ctx);
+        // SKULLDUGGERY: Since the coroutine doesn't have its own stack, we can't
+        // resume it to start it like it normally would, it will just context switch
+        // back to here. Instead directly call the main since we already are on the
+        // appropriate stack.
+        get_coroutine(proc->runner)->state = Active;
+        main( proc->runner );
+        get_coroutine(proc->runner)->state = Halted; // main returned, so the runner coroutine is marked finished
+        // Main routine of the core returned, the core is now fully terminated
+        __cfaabi_dbg_print_safe("Kernel : core %p main ended (%p)\n", proc, &proc->runner);
+        return 0p; // no result is reported through the start routine's return value
+}
+// Check the status code of a pthread call: a zero code is success and returns silently,
+// any other code is an errno value and terminates the program with a message naming func.
+static void Abort( int ret, const char func[] ) {
+        if ( ret == 0 ) return;                                                                 // success, nothing to report
+        abort( "%s : internal error, error(%d) %s.", func, ret, strerror( ret ) );
+} // Abort
+// Create a kernel thread (pthread) that runs start( arg ) on a stack allocated by this routine.
+//   pthread : out-parameter receiving the handle of the new thread
+//   start   : routine executed by the new thread
+//   arg     : opaque argument forwarded to start
+// Returns the address of the allocated stack storage. Nothing here frees it, so the caller is
+// presumably expected to release it once the thread has terminated.
+// Any failing allocation, mprotect or pthread call aborts the program.
+void * __create_pthread( pthread_t * pthread, void * (*start)(void *), void * arg ) {
+        pthread_attr_t attr;
+        Abort( pthread_attr_init( &attr ), "pthread_attr_init" ); // initialize attribute
+        size_t stacksize;
+        // default stack size, normally defined by shell limit
+        Abort( pthread_attr_getstacksize( &attr, &stacksize ), "pthread_attr_getstacksize" );
+        assert( stacksize >= PTHREAD_STACK_MIN );
+        void * stack;
+        __cfaabi_dbg_debug_do(
+                stack = memalign( __page_size, stacksize + __page_size );
+                // fail loudly on allocation failure instead of handing a null stack to mprotect/pthread
+                if ( stack == 0p ) {
+                        abort( "memalign : internal error, stack allocation failure, error(%d) %s.", errno, strerror( errno ) );
+                } // if
+                // pthread has no mechanism to create the guard page in user supplied stack.
+                if ( mprotect( stack, __page_size, PROT_NONE ) == -1 ) {
+                        abort( "mprotect : internal error, mprotect failure, error(%d) %s.", errno, strerror( errno ) );
+                } // if
+        );
+        __cfaabi_dbg_no_debug_do(
+                stack = malloc( stacksize );
+                // fail loudly on allocation failure instead of handing a null stack to pthread
+                if ( stack == 0p ) {
+                        abort( "malloc : internal error, stack allocation failure, error(%d) %s.", errno, strerror( errno ) );
+                } // if
+        );
+        Abort( pthread_attr_setstack( &attr, stack, stacksize ), "pthread_attr_setstack" );
+        Abort( pthread_create( pthread, &attr, start, arg ), "pthread_create" );
+        // the attribute object is only needed for creation; destroying it does not affect the running thread
+        Abort( pthread_attr_destroy( &attr ), "pthread_attr_destroy" );
+        return stack;
+}
+// KERNEL_ONLY
+static void __kernel_first_resume( processor * this ) { // first context switch from mainThread into this processor's runner coroutine
+        $thread * src = mainThread;
+        $coroutine * dst = get_coroutine(this->runner);
+        verify( ! kernelTLS.preemption_state.enabled );
+        kernelTLS.this_thread->curr_cor = dst;
+        __stack_prepare( &dst->stack, 65000 ); // NOTE(review): 65000 is presumably the requested stack size — confirm against __stack_prepare
+        __cfactx_start(main, dst, this->runner, __cfactx_invoke_coroutine);
+        verify( ! kernelTLS.preemption_state.enabled );
+        dst->last = &src->self_cor;
+        dst->starter = dst->starter ? dst->starter : &src->self_cor; // keep an existing starter, otherwise record the main thread's coroutine
+        // set state of current coroutine to inactive
+        src->state = src->state == Halted ? Halted : Inactive; // a Halted source stays Halted
+        // context switch to specified coroutine
+        verify( dst->context.SP );
+        __cfactx_switch( &src->context, &dst->context );
+        // when __cfactx_switch returns we are back in the src coroutine
+        mainThread->curr_cor = &mainThread->self_cor;
+        // set state of the resumed (source) coroutine back to active
+        src->state = Active;
+        verify( ! kernelTLS.preemption_state.enabled );
+}
+// KERNEL_ONLY
+static void __kernel_last_resume( processor * this ) {
+        $coroutine * src = &mainThread->self_cor;
+        $coroutine * dst = get_coroutine(this->runner);
+        verify( ! kernelTLS.preemption_state.enabled );
+        verify( dst->starter == src );
+        verify( dst->context.SP );
+        // context switch to the processor
+        __cfactx_switch( &src->context, &dst->context );
+        #if !defined(__CFA_NO_STATISTICS__)
+                if(last_proc != kernelTLS().this_processor) {
+                        __tls_stats()->ready.threads.migration++;
+                }
+        #endif
+        /* paranoid */ verify( ! __preemption_enabled() );
+        /* paranoid */ verifyf( ((uintptr_t)thrd_src->context.SP) < ((uintptr_t)__get_stack(thrd_src->curr_cor)->base ), "ERROR : Returning $thread %p has been corrupted.\n StackPointer too small.\n", thrd_src );
+        /* paranoid */ verifyf( ((uintptr_t)thrd_src->context.SP) > ((uintptr_t)__get_stack(thrd_src->curr_cor)->limit), "ERROR : Returning $thread %p has been corrupted.\n StackPointer too large.\n", thrd_src );
+}
 …
 // Scheduler routines
 // KERNEL ONLY
+void __schedule_thread( $thread * thrd ) with( *thrd->curr_cluster ) {
+        /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+void __schedule_thread( $thread * thrd ) {
+        /* paranoid */ verify( ! __preemption_enabled() );
+        /* paranoid */ verify( kernelTLS().this_proc_id );
+        /* paranoid */ verify( thrd );
+        /* paranoid */ verify( thrd->state != Halted );
+        /* paranoid */ verify( thrd->curr_cluster );
         /* paranoid */ #if defined( __CFA_WITH_VERIFY__ )
         /* paranoid */ if( thrd->state == Inactive || thrd->state == Start ) assertf( thrd->preempted == __NO_PREEMPTION,
                           "Error inactive thread marked as preempted, state %d, preemption %d\n", thrd->state, thrd->preempted );
         /* paranoid */ if( thrd->preempted != __NO_PREEMPTION ) assertf(thrd->state == Active || thrd->state == Rerun,
                           "Error preempted thread marked as not currently running, state %d, preemption %d\n", thrd->state, thrd->preempted );
+        /* paranoid */  if( thrd->state == Blocked || thrd->state == Start ) assertf( thrd->preempted == __NO_PREEMPTION,
+                                        "Error inactive thread marked as preempted, state %d, preemption %d\n", thrd->state, thrd->preempted );
+        /* paranoid */  if( thrd->preempted != __NO_PREEMPTION ) assertf(thrd->state == Active,
+                                        "Error preempted thread marked as not currently running, state %d, preemption %d\n", thrd->state, thrd->preempted );
         /* paranoid */ #endif
+        /* paranoid */ verifyf( thrd->next == 0p, "Expected null got %p", thrd->next );
+        lock  ( ready_queue_lock __cfaabi_dbg_ctx2 );
+        bool was_empty = !(ready_queue != 0);
+        append( ready_queue, thrd );
+        unlock( ready_queue_lock );
+        if(was_empty) {
+                lock      (proc_list_lock __cfaabi_dbg_ctx2);
+                if(idles) {
+                        wake_fast(idles.head);
+                }
+                unlock    (proc_list_lock);
+        }
+        else if( struct processor * idle = idles.head ) {
+                wake_fast(idle);
+        }
+        /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+        /* paranoid */ verifyf( thrd->link.next == 0p, "Expected null got %p", thrd->link.next );
+        /* paranoid */ verify( 0x0D15EA5E0D15EA5Ep == thrd->canary );
+        if (thrd->preempted == __NO_PREEMPTION) thrd->state = Ready;
+        ready_schedule_lock();
+                // Dereference the thread now because once we push it, there is not guaranteed it's still valid.
+                struct cluster * cl = thrd->curr_cluster;
+                // push the thread to the cluster ready-queue
+                push( cl, thrd );
+                // variable thrd is no longer safe to use
+                // wake the cluster using the save variable.
+                __wake_one( cl );
+        ready_schedule_unlock();
+        /* paranoid */ verify( ! __preemption_enabled() );
+}
 // KERNEL ONLY
+static $thread * __next_thread(cluster * this) with( *this ) {
+        /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+        lock( ready_queue_lock __cfaabi_dbg_ctx2 );
+        $thread * head = pop_head( ready_queue );
+        unlock( ready_queue_lock );
+        /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+        return head;
+static inline $thread * __next_thread(cluster * this) with( *this ) { // pop one thread from this cluster via pop(); must be called with preemption disabled
+        /* paranoid */ verify( ! __preemption_enabled() );
+        /* paranoid */ verify( kernelTLS().this_proc_id );
+        ready_schedule_lock(); // the pop is bracketed by the ready-schedule lock
+                $thread * thrd = pop( this );
+        ready_schedule_unlock();
+        /* paranoid */ verify( kernelTLS().this_proc_id );
+        /* paranoid */ verify( ! __preemption_enabled() );
+        return thrd; // NOTE(review): presumably 0p when nothing is ready — confirm against pop()
+}
+// KERNEL ONLY
+static inline $thread * __next_thread_slow(cluster * this) with( *this ) { // same protocol as __next_thread but uses pop_slow(); must be called with preemption disabled
+        /* paranoid */ verify( ! __preemption_enabled() );
+        /* paranoid */ verify( kernelTLS().this_proc_id );
+        ready_schedule_lock(); // the pop is bracketed by the ready-schedule lock
+                $thread * thrd = pop_slow( this );
+        ready_schedule_unlock();
+        /* paranoid */ verify( kernelTLS().this_proc_id );
+        /* paranoid */ verify( ! __preemption_enabled() );
+        return thrd; // NOTE(review): presumably 0p when nothing is ready — confirm against pop_slow()
+}
 …
         if( !thrd ) return;
+        disable_interrupts();
+        static_assert(sizeof(thrd->state) == sizeof(int));
+        enum coroutine_state old_state = __atomic_exchange_n(&thrd->state, Rerun, __ATOMIC_SEQ_CST);
+        switch(old_state) {
+                case Active:
+        int old_ticket = __atomic_fetch_add(&thrd->ticket, 1, __ATOMIC_SEQ_CST);
+        switch(old_ticket) {
+                case TICKET_RUNNING:
                         // Wake won the race, the thread will reschedule/rerun itself
                         break;
                 case Inactive:
+                case TICKET_BLOCKED:
                         /* paranoid */ verify( ! thrd->preempted != __NO_PREEMPTION );
+                        // Wake lost the race,
+                        thrd->state = Inactive;
+                        __schedule_thread( thrd );
+                        /* paranoid */ verify( thrd->state == Blocked );
+                        {
+                                /* paranoid */ verify( publicTLS_get(this_proc_id) );
+                                bool full = publicTLS_get(this_proc_id)->full_proc;
+                                if(full) disable_interrupts();
+                                /* paranoid */ verify( ! __preemption_enabled() );
+                                // Wake lost the race,
+                                __schedule_thread( thrd );
+                                /* paranoid */ verify( ! __preemption_enabled() );
+                                if(full) enable_interrupts( __cfaabi_dbg_ctx );
+                                /* paranoid */ verify( publicTLS_get(this_proc_id) );
+                        }
                         break;
-                case Rerun:
-                        abort("More than one thread attempted to schedule thread %p\n", thrd);
-                        break;
-                case Halted:
-                case Start:
-                case Primed:
                 default:
                         // This makes no sense, something is wrong abort
+                        abort();
+        }
+                        abort("Thread %p (%s) has mismatch park/unpark\n", thrd, thrd->self_cor.name);
+        }
+}
+void park( void ) { // block the calling thread: disable interrupts, switch back to the kernel, re-enable interrupts once resumed
+        /* paranoid */ verify( __preemption_enabled() );
+        disable_interrupts();
+        /* paranoid */ verify( ! __preemption_enabled() );
+        /* paranoid */ verify( kernelTLS().this_thread->preempted == __NO_PREEMPTION );
+        returnToKernel(); // execution continues here only after the thread is run again
+        /* paranoid */ verify( ! __preemption_enabled() );
         enable_interrupts( __cfaabi_dbg_ctx );
+}
+void park( void ) { // block the calling thread: disable interrupts, switch back to the kernel, re-enable interrupts once resumed
+        /* paranoid */ verify( kernelTLS.preemption_state.enabled );
+        disable_interrupts();
+        /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+        /* paranoid */ verify( kernelTLS.this_thread->preempted == __NO_PREEMPTION );
+        returnToKernel(); // execution continues here only after the thread is run again
+        /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+        enable_interrupts( __cfaabi_dbg_ctx );
+        /* paranoid */ verify( kernelTLS.preemption_state.enabled );
+}
+// KERNEL ONLY
+void __leave_thread() {
+        /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+        returnToKernel();
+        abort();
+        /* paranoid */ verify( __preemption_enabled() );
+}
+extern "C" {
+        // Leave the thread monitor
+        // last routine called by a thread.
+        // Should never return
+        void __cfactx_thrd_leave() {
+                $thread * thrd = active_thread();
+                $monitor * this = &thrd->self_mon;
+                // Lock the monitor now
+                lock( this->lock __cfaabi_dbg_ctx2 ); // NOTE(review): no matching unlock in this routine; presumably released on the kernel side after the switch — confirm
+                disable_interrupts();
+                /* paranoid */ verify( ! __preemption_enabled() );
+                /* paranoid */ verify( thrd->state == Active );
+                /* paranoid */ verify( 0x0D15EA5E0D15EA5Ep == thrd->canary );
+                /* paranoid */ verify( kernelTLS().this_thread == thrd );
+                /* paranoid */ verify( thrd->context.SP );
+                /* paranoid */ verifyf( ((uintptr_t)thrd->context.SP) > ((uintptr_t)__get_stack(thrd->curr_cor)->limit), "ERROR : $thread %p has been corrupted.\n StackPointer too large.\n", thrd );
+                /* paranoid */ verifyf( ((uintptr_t)thrd->context.SP) < ((uintptr_t)__get_stack(thrd->curr_cor)->base ), "ERROR : $thread %p has been corrupted.\n StackPointer too small.\n", thrd );
+                thrd->state = Halting; // mark the thread as terminating before handing control back to the kernel
+                if( TICKET_RUNNING != thrd->ticket ) { abort( "Thread terminated with pending unpark" ); } // any ticket other than TICKET_RUNNING means an unpark is still outstanding
+                if( thrd != this->owner ) { abort( "Thread internal monitor has incorrect owner" ); } // the thread must own its own monitor
+                if( this->recursion != 1) { abort( "Thread internal monitor has unbalanced recursion" ); } // and hold it exactly once
+                // Leave the thread
+                returnToKernel();
+                // Control flow should never reach here!
+                abort();
+        }
+}
 // KERNEL ONLY
 bool force_yield( __Preemption_Reason reason ) {
         /* paranoid */ verify( kernelTLS.preemption_state.enabled );
+        /* paranoid */ verify( __preemption_enabled() );
         disable_interrupts();
         /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
         $thread * thrd = kernelTLS.this_thread;
         /* paranoid */ verify(thrd->state == Active || thrd->state == Rerun);
+        /* paranoid */ verify( ! __preemption_enabled() );
+        $thread * thrd = kernelTLS().this_thread;
+        /* paranoid */ verify(thrd->state == Active);
         // SKULLDUGGERY: It is possible that we are preempting this thread just before
 …
         // If that is the case, abandon the preemption.
         bool preempted = false;
         if(thrd->next == 0p) {
+        if(thrd->link.next == 0p) {
                 preempted = true;
                 thrd->preempted = reason;
 …
+        }
         /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+        /* paranoid */ verify( ! __preemption_enabled() );
         enable_interrupts_noPoll();
         /* paranoid */ verify( kernelTLS.preemption_state.enabled );
+        /* paranoid */ verify( __preemption_enabled() );
         return preempted;
 …
 //=============================================================================================
 // Kernel Setup logic
+// Kernel Idle Sleep
 //=============================================================================================
+//-----------------------------------------------------------------------------
+// Kernel boot procedures
+static void __kernel_startup(void) {
+        verify( ! kernelTLS.preemption_state.enabled );
+        __cfaabi_dbg_print_safe("Kernel : Starting\n");
+        __page_size = sysconf( _SC_PAGESIZE );
+        __cfa_dbg_global_clusters.list{ __get };
+        __cfa_dbg_global_clusters.lock{};
+        // Initialize the main cluster
+        mainCluster = (cluster *)&storage_mainCluster;
+        (*mainCluster){"Main Cluster"};
+        __cfaabi_dbg_print_safe("Kernel : Main cluster ready\n");
+        // Start by initializing the main thread
+        // SKULLDUGGERY: the mainThread steals the process main thread
+        // which will then be scheduled by the mainProcessor normally
+        mainThread = ($thread *)&storage_mainThread;
+        current_stack_info_t info;
+        info.storage = (__stack_t*)&storage_mainThreadCtx;
+        (*mainThread){ &info };
+        __cfaabi_dbg_print_safe("Kernel : Main thread ready\n");
+        // Construct the processor context of the main processor
+        void ?{}(processorCtx_t & this, processor * proc) {
+                (this.__cor){ "Processor" };
+                this.__cor.starter = 0p;
+                this.proc = proc;
+        }
+        void ?{}(processor & this) with( this ) {
+                name = "Main Processor";
+                cltr = mainCluster;
+                terminated{ 0 };
+                do_terminate = false;
+                preemption_alarm = 0p;
+                pending_preemption = false;
+                kernel_thread = pthread_self();
+                runner{ &this };
+                __cfaabi_dbg_print_safe("Kernel : constructed main processor context %p\n", &runner);
+        }
+        // Initialize the main processor and the main processor ctx
+        // (the coroutine that contains the processing control flow)
+        mainProcessor = (processor *)&storage_mainProcessor;
+        (*mainProcessor){};
+        //initialize the global state variables
+        kernelTLS.this_processor = mainProcessor;
+        kernelTLS.this_thread    = mainThread;
+        // Enable preemption
+        kernel_start_preemption();
+        // Add the main thread to the ready queue
+        // once resume is called on mainProcessor->runner the mainThread needs to be scheduled like any normal thread
+        __schedule_thread(mainThread);
+        // SKULLDUGGERY: Force a context switch to the main processor to set the main thread's context to the current UNIX
+        // context. Hence, the main thread does not begin through __cfactx_invoke_thread, like all other threads. The trick here is that
+        // mainThread is on the ready queue when this call is made.
+        __kernel_first_resume( kernelTLS.this_processor );
+        // THE SYSTEM IS NOW COMPLETELY RUNNING
+        __cfaabi_dbg_print_safe("Kernel : Started\n--------------------------------------------------\n\n");
+        verify( ! kernelTLS.preemption_state.enabled );
+// Wake the idle processor at the front of the cluster's idle list, if there is one
+static void __wake_one(cluster * this) { // must be called with preemption disabled and the ready-schedule lock held (both verified below)
+        /* paranoid */ verify( ! __preemption_enabled() );
+        /* paranoid */ verify( ready_schedule_islocked() );
+        // Check if there is a sleeping processor
+        processor * p;
+        unsigned idle;
+        unsigned total; // filled in by query() but not used afterwards
+        [idle, total, p] = query(this->idles);
+        // If no one is sleeping, we are done
+        if( idle == 0 ) return;
+        // We found a processor, wake it up
+        post( p->idle );
+        #if !defined(__CFA_NO_STATISTICS__)
+                __tls_stats()->ready.sleep.wakes++;
+        #endif
+        /* paranoid */ verify( ready_schedule_islocked() );
+        /* paranoid */ verify( ! __preemption_enabled() );
+        return;
+}
+// Unconditionally wake a processor
+void __wake_proc(processor * this) {
+        __cfadbg_print_safe(runtime_core, "Kernel : waking Processor %p\n", this);
+        disable_interrupts();
+                /* paranoid */ verify( ! __preemption_enabled() );
+                post( this->idle );
         enable_interrupts( __cfaabi_dbg_ctx );
+        verify( TL_GET( preemption_state.enabled ) );
+}
+static void __kernel_shutdown(void) { // stop the main processor, then tear down the manually constructed kernel objects
+        __cfaabi_dbg_print_safe("\n--------------------------------------------------\nKernel : Shutting down\n");
+        verify( TL_GET( preemption_state.enabled ) );
+        disable_interrupts();
+        verify( ! kernelTLS.preemption_state.enabled );
+        // SKULLDUGGERY: Notify the mainProcessor it needs to terminate.
+        // When its coroutine terminates, it returns control to the mainThread
+        // which is currently here
+        __atomic_store_n(&mainProcessor->do_terminate, true, __ATOMIC_RELEASE);
+        __kernel_last_resume( kernelTLS.this_processor );
+        mainThread->self_cor.state = Halted;
+        // THE SYSTEM IS NOW COMPLETELY STOPPED
+        // Disable preemption
+        kernel_stop_preemption();
+        // Destroy the main processor and its context in reverse order of construction
+        // These were manually constructed so we need to manually destroy them
+        ^(mainProcessor->runner){};
+        ^(mainProcessor){};
+        // Final step, destroy the main thread since it is no longer needed
+        // Since we provided a stack to this task it will not destroy anything
+        ^(mainThread){};
+        ^(__cfa_dbg_global_clusters.list){};
+        ^(__cfa_dbg_global_clusters.lock){};
+        __cfaabi_dbg_print_safe("Kernel : Shutdown complete\n");
+}
+//=============================================================================================
+// Kernel Quiescing
+//=============================================================================================
+static void __halt(processor * this) with( *this ) {
+        // verify( ! __atomic_load_n(&do_terminate, __ATOMIC_SEQ_CST) );
+        with( *cltr ) {
+                lock      (proc_list_lock __cfaabi_dbg_ctx2);
+                remove    (procs, *this);
+                push_front(idles, *this);
+                unlock    (proc_list_lock);
+        }
+        __cfaabi_dbg_print_safe("Kernel : Processor %p ready to sleep\n", this);
+        wait( idleLock );
+        __cfaabi_dbg_print_safe("Kernel : Processor %p woke up and ready to run\n", this);
+        with( *cltr ) {
+                lock      (proc_list_lock __cfaabi_dbg_ctx2);
+                remove    (idles, *this);
+                push_front(procs, *this);
+                unlock    (proc_list_lock);
+}
+static void push  (__cluster_idles & this, processor & proc) { // record proc as idle: bump the idle count and insert it at the front of the list
+        /* paranoid */ verify( ! __preemption_enabled() );
+        lock( this );
+                this.idle++;
+                /* paranoid */ verify( this.idle <= this.total ); // there can never be more idle processors than processors in total
+                insert_first(this.list, proc);
+        unlock( this );
+        /* paranoid */ verify( ! __preemption_enabled() );
+}
+static void remove(__cluster_idles & this, processor & proc) { // proc is no longer idle: drop the idle count and unlink it from the list
+        /* paranoid */ verify( this.idle >= 0 ); // NOTE(review): if idle is an unsigned field this check can never fail — confirm the field type
+                remove(proc);
+        unlock( this );
+        /* paranoid */ verify( ! __preemption_enabled() );
+}
+static [unsigned idle, unsigned total, * processor] query( & __cluster_idles this ) { // read (idle, total, first processor) without taking the lock, retrying until the snapshot is consistent
+        for() {
+                uint64_t l = __atomic_load_n(&this.lock, __ATOMIC_SEQ_CST);
+                if( 1 == (l % 2) ) { Pause(); continue; } // odd lock value: retry (presumably a writer is inside lock()/unlock() — confirm)
+                unsigned idle    = this.idle;
+                unsigned total   = this.total;
+                processor * proc = &this.list`first;
+                // Compiler fence is unnecessary, but gcc-8 and older incorrectly reorder code without it
+                asm volatile("": : :"memory");
+                if(l != __atomic_load_n(&this.lock, __ATOMIC_SEQ_CST)) { Pause(); continue; } // lock value changed while reading: discard the snapshot and retry
+                return [idle, total, proc];
+        }
+}
 …
         // the globalAbort flag is true.
         lock( kernel_abort_lock __cfaabi_dbg_ctx2 );
+        // disable interrupts, it no longer makes sense to try to interrupt this processor
+        disable_interrupts();
         // first task to abort ?
 …
+        }
         return kernelTLS.this_thread;
+        return __cfaabi_tls.this_thread;
+}
 void kernel_abort_msg( void * kernel_data, char * abort_text, int abort_text_size ) {
         $thread * thrd = kernel_data;
+        $thread * thrd = ( $thread * ) kernel_data;
         if(thrd) {
 …
 int kernel_abort_lastframe( void ) __attribute__ ((__nothrow__)) {
         return get_coroutine(kernelTLS.this_thread) == get_coroutine(mainThread) ? 4 : 2;
+        return get_coroutine(kernelTLS().this_thread) == get_coroutine(mainThread) ? 4 : 2;
+}
 …
 void ^?{}(semaphore & this) {}
 void P(semaphore & this) with( this ){
+bool P(semaphore & this) with( this ){
         lock( lock __cfaabi_dbg_ctx2 );
         count -= 1;
         if ( count < 0 ) {
                 // queue current task
                 append( waiting, kernelTLS.this_thread );
+                append( waiting, active_thread() );
                 // atomically release spin lock and block
                 unlock( lock );
                 park();
+                return true;
+        }
         else {
             unlock( lock );
+        }
+}
+void V(semaphore & this) with( this ) {
+            return false;
+        }
+}
+bool V(semaphore & this) with( this ) {
         $thread * thrd = 0p;
         lock( lock __cfaabi_dbg_ctx2 );
 …
         // make new owner
         unpark( thrd );
+}
+//-----------------------------------------------------------------------------
+// Global Queues
+void doregister( cluster     & cltr ) { // add cltr to the front of the global debug cluster list, under that list's lock
+        lock      ( __cfa_dbg_global_clusters.lock __cfaabi_dbg_ctx2);
+        push_front( __cfa_dbg_global_clusters.list, cltr );
+        unlock    ( __cfa_dbg_global_clusters.lock );
+}
+void unregister( cluster     & cltr ) { // remove cltr from the global debug cluster list, under that list's lock
+        lock  ( __cfa_dbg_global_clusters.lock __cfaabi_dbg_ctx2);
+        remove( __cfa_dbg_global_clusters.list, cltr );
+        unlock( __cfa_dbg_global_clusters.lock );
+}
+void doregister( cluster * cltr, $thread & thrd ) { // add thrd to the cluster's thread list and bump nthreads, under thread_list_lock
+        lock      (cltr->thread_list_lock __cfaabi_dbg_ctx2);
+        cltr->nthreads += 1;
+        push_front(cltr->threads, thrd);
+        unlock    (cltr->thread_list_lock);
+}
+void unregister( cluster * cltr, $thread & thrd ) { // remove thrd from the cluster's thread list and decrement nthreads, under thread_list_lock
+        lock  (cltr->thread_list_lock __cfaabi_dbg_ctx2);
+        remove(cltr->threads, thrd );
+        cltr->nthreads -= 1;
+        unlock(cltr->thread_list_lock);
+}
+void doregister( cluster * cltr, processor * proc ) { // add *proc to the cluster's processor list and bump nprocessors, under proc_list_lock
+        lock      (cltr->proc_list_lock __cfaabi_dbg_ctx2);
+        cltr->nprocessors += 1;
+        push_front(cltr->procs, *proc);
+        unlock    (cltr->proc_list_lock);
+}
+void unregister( cluster * cltr, processor * proc ) {
+        lock  (cltr->proc_list_lock __cfaabi_dbg_ctx2);
+        remove(cltr->procs, *proc );
+        cltr->nprocessors -= 1;
+        unlock(cltr->proc_list_lock);
+        return thrd != 0p;
+}
+// Release diff units of the semaphore at once, waking as many blocked threads as those units satisfy.
+// Returns true when at least one waiting thread was unparked, false otherwise.
+bool V(semaphore & this, unsigned diff) with( this ) {
+        lock( lock __cfaabi_dbg_ctx2 );
+        // P() decrements count before queueing, so a negative count is the number of threads blocked on the semaphore
+        int blocked = count < 0 ? -count : 0;
+        // wake no more threads than are waiting and no more than the units being released
+        int release = (int)diff < blocked ? (int)diff : blocked;
+        count += diff;
+        for(release) {
+                unpark( pop_head( waiting ) );
+        }
+        unlock( lock );
+        return release > 0;
+}
 …
 __cfaabi_dbg_debug_do(
         extern "C" {
                 void __cfaabi_dbg_record(__spinlock_t & this, const char prev_name[]) {
+                void __cfaabi_dbg_record_lock(__spinlock_t & this, const char prev_name[]) {
                         this.prev_name = prev_name;
                         this.prev_thrd = kernelTLS.this_thread;
+                        this.prev_thrd = kernelTLS().this_thread;
+                }
+        }
 …
         return true;
+}
+//-----------------------------------------------------------------------------
+// Statistics
+#if !defined(__CFA_NO_STATISTICS__)
+        void print_halts( processor & this ) { // turn on the print_halts flag of this processor
+                this.print_halts = true;
+        }
+        // Print the statistics of cluster this immediately.
+        //   flags : selects what is printed; forwarded to __print_stats in place of the cluster's stored print_stats setting
+        void print_stats_now( cluster & this, int flags ) {
+                __print_stats( this.stats, flags, "Cluster", this.name, (void*)&this );
+        }
+        extern int __print_alarm_stats; // defined elsewhere
+        void print_alarm_stats() { // request alarm statistics by setting the global flag to -1
+                __print_alarm_stats = -1; // NOTE(review): -1 presumably means "all flags set" — confirm where the flag is read
+        }
+#endif
 // Local Variables: //
 // mode: c //

libcfa/src/concurrency/kernel.hfa

-              r3c64c668
+              r58fe85a
 #pragma once
-#include <stdbool.h>
 #include "invoke.h"
 #include "time_t.hfa"
 #include "coroutine.hfa"
+#include "containers/list.hfa"
 extern "C" {
 #include <pthread.h>
 #include <semaphore.h>
+        #include <bits/pthreadtypes.h>
+        #include <linux/types.h>
+}
 …
 void  ?{}(semaphore & this, int count = 1);
 void ^?{}(semaphore & this);
+void   P (semaphore & this);
+void   V (semaphore & this);
+bool   P (semaphore & this);
+bool   V (semaphore & this);
+bool   V (semaphore & this, unsigned count);
 …
 extern struct cluster * mainCluster;
+// Processor
+// Processor id, required for scheduling threads
+struct __processor_id_t {
+        unsigned id:24; // 24-bit identifier
+        bool full_proc:1; // NOTE(review): presumably true for a full processor as opposed to a bare id holder — confirm against users of full_proc
+        #if !defined(__CFA_NO_STATISTICS__)
+                struct __stats_t * stats; // only present when statistics are compiled in
+        #endif
+};
 coroutine processorCtx_t {
         struct processor * proc;
 …
 // Wrapper around kernel threads
 struct processor {
+struct __attribute__((aligned(128))) processor {
         // Main state
+        inline __processor_id_t;
+        // Cluster from which to get threads
+        struct cluster * cltr;
+        // Set to true to notify the processor should terminate
+        volatile bool do_terminate;
         // Coroutine ctx who does keeps the state of the processor
         struct processorCtx_t runner;
-        // Cluster from which to get threads
-        struct cluster * cltr;
         // Name of the processor
         const char * name;
 …
         // Handle to pthreads
         pthread_t kernel_thread;
-        // RunThread data
-        // Action to do after a thread is ran
-        $thread * destroyer;
         // Preemption data
 …
         bool pending_preemption;
+        // Idle lock
+        __bin_sem_t idleLock;
+        // Termination
+        // Set to true to notify the processor should terminate
+        volatile bool do_terminate;
+        // Termination synchronisation
+        // Idle lock (kernel semaphore)
+        __bin_sem_t idle;
+        // Termination synchronisation (user semaphore)
         semaphore terminated;
 …
         // Link lists fields
+        struct __dbg_node_proc {
+                struct processor * next;
+                struct processor * prev;
+        } node;
+        DLISTED_MGD_IMPL_IN(processor)
+        #if !defined(__CFA_NO_STATISTICS__)
+                int print_stats;
+                bool print_halts;
+        #endif
 #ifdef __CFA_DEBUG__
 …
 static inline void  ?{}(processor & this, const char name[]) { this{name, *mainCluster }; }
+static inline [processor *&, processor *& ] __get( processor & this ) __attribute__((const)) { return this.node.[next, prev]; }
+DLISTED_MGD_IMPL_OUT(processor)
+//-----------------------------------------------------------------------------
+// I/O
+struct __io_data;
+// IO poller user-thread
+// Not using the "thread" keyword because we want to control
+// more carefully when to start/stop it
+struct $io_ctx_thread {
+        struct __io_data * ring;
+        single_sem sem;
+        volatile bool done;
+        $thread self;
+};
+struct io_context {
+        $io_ctx_thread thrd;
+};
+struct io_context_params {
+        int num_entries;
+        int num_ready;
+        int submit_aff;
+        bool eager_submits:1;
+        bool poller_submits:1;
+        bool poll_submit:1;
+        bool poll_complete:1;
+};
+void  ?{}(io_context_params & this);
+void  ?{}(io_context & this, struct cluster & cl);
+void  ?{}(io_context & this, struct cluster & cl, const io_context_params & params);
+void ^?{}(io_context & this);
+struct io_cancellation {
+        __u64 target;
+};
+static inline void  ?{}(io_cancellation & this) { this.target = -1u; } // Default ctor: marks the cancellation as having no target. NOTE(review): -1u is an unsigned int constant, so with a 32-bit unsigned the __u64 target becomes 0x00000000FFFFFFFF, not all ones — confirm this matches the sentinel check in cancel()
+static inline void ^?{}(io_cancellation &) {}
+bool cancel(io_cancellation & this);
+//-----------------------------------------------------------------------------
+// Cluster Tools
+// Intrusives lanes which are used by the relaxed ready queue
+struct __attribute__((aligned(128))) __intrusive_lane_t;
+void  ?{}(__intrusive_lane_t & this);
+void ^?{}(__intrusive_lane_t & this);
+// Counter used for whether or not the lanes are all empty
+struct __attribute__((aligned(128))) __snzi_node_t;
+struct __snzi_t {
+        unsigned mask;
+        int root;
+        __snzi_node_t * nodes;
+};
+void  ?{}( __snzi_t & this, unsigned depth );
+void ^?{}( __snzi_t & this );
+//TODO adjust cache size to ARCHITECTURE
+// Structure holding the relaxed ready queue
+struct __ready_queue_t {
+        // Data tracking how many/which lanes are used
+        // Aligned to 128 for cache locality (NOTE(review): in this header only __snzi_node_t carries aligned(128), not __snzi_t itself — confirm)
+        __snzi_t snzi;
+        // Data tracking the actual lanes
+        // On a separate cacheline from the used struct since
+        // used can change on each push/pop but this data
+        // only changes on shrink/grow
+        struct {
+                // Array of lanes
+                __intrusive_lane_t * volatile data;
+                // Number of lanes (empty or not)
+                volatile size_t count;
+        } lanes;
+};
+void  ?{}(__ready_queue_t & this);
+void ^?{}(__ready_queue_t & this);
+// Idle Sleep
+struct __cluster_idles {
+        // Spin lock protecting the queue; a counter where an odd value means held and an even value means free (see lock()/unlock() on __cluster_idles in kernel_private.hfa)
+        volatile uint64_t lock;
+        // Total number of processors
+        unsigned total;
+        // Total number of idle processors
+        unsigned idle;
+        // List of idle processors (intrusive dlist; processor is made listable by DLISTED_MGD_IMPL_IN/OUT)
+        dlist(processor, processor) list;
+};
 //-----------------------------------------------------------------------------
 // Cluster
+struct cluster {
+        // Ready queue locks
+        __spinlock_t ready_queue_lock;
+struct __attribute__((aligned(128))) cluster {
         // Ready queue for threads
         __queue_t($thread) ready_queue;
+        __ready_queue_t ready_queue;
         // Name of the cluster
 …
         Duration preemption_rate;
+        // List of processors
+        __spinlock_t proc_list_lock;
+        __dllist_t(struct processor) procs;
+        __dllist_t(struct processor) idles;
+        unsigned int nprocessors;
+        // List of idle processors
+        __cluster_idles idles;
         // List of threads
 …
                 cluster * prev;
         } node;
+        struct {
+                io_context * ctxs;
+                unsigned cnt;
+        } io;
+        #if !defined(__CFA_NO_STATISTICS__)
+                struct __stats_t * stats;
+                int print_stats;
+        #endif
 };
 extern Duration default_preemption();
 void ?{} (cluster & this, const char name[], Duration preemption_rate);
+void ?{} (cluster & this, const char name[], Duration preemption_rate, unsigned num_io, const io_context_params & io_params);
 void ^?{}(cluster & this);
+static inline void ?{} (cluster & this)                           { this{"Anonymous Cluster", default_preemption()}; }
+static inline void ?{} (cluster & this, Duration preemption_rate) { this{"Anonymous Cluster", preemption_rate}; }
+static inline void ?{} (cluster & this, const char name[])        { this{name, default_preemption()}; }
+static inline void ?{} (cluster & this)                                            { io_context_params default_params;    this{"Anonymous Cluster", default_preemption(), 1, default_params}; }
+static inline void ?{} (cluster & this, Duration preemption_rate)                  { io_context_params default_params;    this{"Anonymous Cluster", preemption_rate, 1, default_params}; }
+static inline void ?{} (cluster & this, const char name[])                         { io_context_params default_params;    this{name, default_preemption(), 1, default_params}; }
+static inline void ?{} (cluster & this, unsigned num_io)                           { io_context_params default_params;    this{"Anonymous Cluster", default_preemption(), num_io, default_params}; }
+static inline void ?{} (cluster & this, Duration preemption_rate, unsigned num_io) { io_context_params default_params;    this{"Anonymous Cluster", preemption_rate, num_io, default_params}; }
+static inline void ?{} (cluster & this, const char name[], unsigned num_io)        { io_context_params default_params;    this{name, default_preemption(), num_io, default_params}; }
+static inline void ?{} (cluster & this, const io_context_params & io_params)                                            { this{"Anonymous Cluster", default_preemption(), 1, io_params}; }
+static inline void ?{} (cluster & this, Duration preemption_rate, const io_context_params & io_params)                  { this{"Anonymous Cluster", preemption_rate, 1, io_params}; }
+static inline void ?{} (cluster & this, const char name[], const io_context_params & io_params)                         { this{name, default_preemption(), 1, io_params}; }
+static inline void ?{} (cluster & this, unsigned num_io, const io_context_params & io_params)                           { this{"Anonymous Cluster", default_preemption(), num_io, io_params}; }
+static inline void ?{} (cluster & this, Duration preemption_rate, unsigned num_io, const io_context_params & io_params) { this{"Anonymous Cluster", preemption_rate, num_io, io_params}; }
+static inline void ?{} (cluster & this, const char name[], unsigned num_io, const io_context_params & io_params)        { this{name, default_preemption(), num_io, io_params}; }
 static inline [cluster *&, cluster *& ] __get( cluster & this ) __attribute__((const)) { return this.node.[next, prev]; }
+static inline struct processor * active_processor() { return TL_GET( this_processor ); } // UNSAFE
+static inline struct cluster   * active_cluster  () { return TL_GET( this_processor )->cltr; }
+static inline struct processor * active_processor() { return publicTLS_get( this_processor ); } // UNSAFE
+static inline struct cluster   * active_cluster  () { return publicTLS_get( this_processor )->cltr; }
+#if !defined(__CFA_NO_STATISTICS__)
+        void print_stats_now( cluster & this, int flags );
+        static inline void print_stats_at_exit( cluster & this, int flags ) { // Record which statistics to print for this cluster; presumably consumed at teardown, as the name suggests — TODO confirm
+                this.print_stats |= flags; // OR-ed in, so flags from repeated calls accumulate rather than replace each other
+        }
+        static inline void print_stats_at_exit( processor & this, int flags ) { // Processor overload: record which statistics to print for this processor
+                this.print_stats |= flags; // OR-ed in, so flags from repeated calls accumulate rather than replace each other
+        }
+        void print_halts( processor & this );
+#endif
 // Local Variables: //

libcfa/src/concurrency/kernel_private.hfa

-              r3c64c668
+              r58fe85a
 // Created On       : Mon Feb 13 12:27:26 2017
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Sat Nov 30 19:25:02 2019
 // Update Count     : 8
+// Last Modified On : Wed Aug 12 08:21:33 2020
+// Update Count     : 9
 //
 …
 #include "alarm.hfa"
+#include "stats.hfa"
 //-----------------------------------------------------------------------------
 // Scheduler
+struct __attribute__((aligned(128))) __scheduler_lock_id_t;
 extern "C" {
 …
+}
+void __schedule_thread( $thread * ) __attribute__((nonnull (1)));
+//Block current thread and release/wake-up the following resources
+void __leave_thread() __attribute__((noreturn));
+void __schedule_thread( $thread * )
+#if defined(NDEBUG) || (!defined(__CFA_DEBUG__) && !defined(__CFA_VERIFY__))
+        __attribute__((nonnull (1)))
+#endif
+;
+extern bool __preemption_enabled();
+//release/wake-up the following resources
+void __thread_finish( $thread * thrd );
 //-----------------------------------------------------------------------------
 …
 void * __create_pthread( pthread_t *, void * (*)(void *), void * );
+static inline void wake_fast(processor * this) { // Wake a processor by posting its idleLock; does not touch the interrupt state itself (wake() is the variant that disables interrupts around this call)
+        __cfaabi_dbg_print_safe("Kernel : Waking up processor %p\n", this);
+        post( this->idleLock ); // idleLock is the processor's __bin_sem_t "Idle lock" field
+}
+static inline void wake(processor * this) { // Wake a processor with interrupts disabled for the duration of the post
+        disable_interrupts();
+        wake_fast(this); // the actual wake-up: posts this->idleLock
+        enable_interrupts( __cfaabi_dbg_ctx ); // re-enable after the post, passing the debug context
+}
+struct event_kernel_t {
+        alarm_list_t alarms;
+        __spinlock_t lock;
+};
+extern event_kernel_t * event_kernel;
+struct __cfa_kernel_preemption_state_t {
+        bool enabled;
+        bool in_progress;
+        unsigned short disable_count;
+};
+extern volatile thread_local __cfa_kernel_preemption_state_t preemption_state __attribute__ ((tls_model ( "initial-exec" )));
+void __destroy_pthread( pthread_t pthread, void * stack, void ** retval );
+extern cluster * mainCluster;
 //-----------------------------------------------------------------------------
 …
+)
+#define TICKET_BLOCKED (-1) // thread is blocked
+#define TICKET_RUNNING ( 0) // thread is running
+#define TICKET_UNBLOCK ( 1) // thread should ignore next block
 //-----------------------------------------------------------------------------
 // Utils
-#define KERNEL_STORAGE(T,X) static char storage_##X[sizeof(T)]
-static inline uint32_t __tls_rand() {
-        kernelTLS.rand_seed ^= kernelTLS.rand_seed << 6;
-        kernelTLS.rand_seed ^= kernelTLS.rand_seed >> 21;
-        kernelTLS.rand_seed ^= kernelTLS.rand_seed << 7;
-        return kernelTLS.rand_seed;
+}
-void doregister( struct cluster & cltr );
-void unregister( struct cluster & cltr );
 void doregister( struct cluster * cltr, struct $thread & thrd );
 void unregister( struct cluster * cltr, struct $thread & thrd );
+void doregister( struct cluster * cltr, struct processor * proc );
+void unregister( struct cluster * cltr, struct processor * proc );
+//-----------------------------------------------------------------------------
+// I/O
+void ^?{}(io_context & this, bool );
+//=======================================================================
+// Cluster lock API
+//=======================================================================
+// Cells used by the reader-writer lock
+// while not generic, it only relies on an opaque pointer
+struct __attribute__((aligned(128))) __scheduler_lock_id_t {
+        // Spin lock used as the underlying lock
+        volatile bool lock;
+        // Handle pointing to the proc owning this cell
+        // Used for allocating cells and debugging
+        __processor_id_t * volatile handle;
+        #ifdef __CFA_WITH_VERIFY__
+                // Debug, check if this is owned for reading (set in ready_schedule_lock, cleared in ready_schedule_unlock)
+                bool owned;
+        #endif
+};
+static_assert( sizeof(struct __scheduler_lock_id_t) <= __alignof(struct __scheduler_lock_id_t));
+// Lock-Free registering/unregistering of threads
+// Register a processor to a given cluster and get its unique id in return
+unsigned doregister( struct __processor_id_t * proc );
+// Unregister a processor from a given cluster using its id, getting back the original pointer
+void     unregister( struct __processor_id_t * proc );
+//-----------------------------------------------------------------------
+// Cluster idle lock/unlock
+static inline void lock(__cluster_idles & this) { // Spin until the lock counter is even (free) and a compare-exchange bumps it to odd (held)
+        for() {
+                uint64_t l = this.lock; // snapshot of the counter; also the expected value for the CAS below
+                if(
+                        (0 == (l % 2)) // even means nobody holds the lock
+                        && __atomic_compare_exchange_n(&this.lock, &l, l + 1, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST) // strong CAS l -> l+1; fails if another thread changed the counter since the snapshot
+                ) return;
+                Pause(); // back off before re-reading the counter
+        }
+}
+static inline void unlock(__cluster_idles & this) { // Release the lock taken by lock(): increment the counter from odd (held) back to even (free)
+        /* paranoid */ verify( 1 == (this.lock % 2) ); // must currently be held
+        __atomic_fetch_add( &this.lock, 1, __ATOMIC_SEQ_CST ); // the counter only ever increases, one step per lock and one per unlock
+}
+//=======================================================================
+// Reader-writer lock implementation
+// Concurrent with doregister/unregister,
+//    i.e., threads can be added at any point during or between the entry/exit
+//-----------------------------------------------------------------------
+// simple spinlock underlying the RWLock
+// Blocking acquire: spins until this caller is the one that flips *ll from false to true
+static inline void __atomic_acquire(volatile bool * ll) {
+        while( __builtin_expect(__atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST), false) ) { // exchange returns the previous value; true means someone else holds it (hinted as the unlikely case)
+                while(__atomic_load_n(ll, (int)__ATOMIC_RELAXED)) // wait with plain relaxed loads until the flag reads false, then retry the exchange
+                        Pause();
+        }
+        /* paranoid */ verify(*ll);
+}
+// Non-Blocking acquire: single attempt, returns true only if the lock was free and is now held by the caller
+static inline bool __atomic_try_acquire(volatile bool * ll) {
+        return !__atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST); // previous value false => we acquired it; previous value true => already held, and the flag is left set
+}
+// Release: clears the flag set by __atomic_acquire/__atomic_try_acquire
+static inline void __atomic_unlock(volatile bool * ll) {
+        /* paranoid */ verify(*ll); // must currently be held
+        __atomic_store_n(ll, (bool)false, __ATOMIC_RELEASE); // release store, unlike the SEQ_CST exchange used to acquire
+}
+//-----------------------------------------------------------------------
+// Reader-Writer lock protecting the ready-queues
+// while this lock is mostly generic, some aspects
+// have been hard-coded for the ready-queue for
+// simplicity and performance
+struct __scheduler_RWLock_t {
+        // total cachelines allocated
+        unsigned int max;
+        // cachelines currently in use
+        volatile unsigned int alloc;
+        // cachelines ready to iterate over
+        // (!= to alloc when thread is in second half of doregister)
+        volatile unsigned int ready;
+        // writer lock; readers spin while it is set (see ready_schedule_lock)
+        volatile bool lock;
+        // data pointer: per-processor cells, indexed by __processor_id_t.id (see ready_schedule_lock)
+        __scheduler_lock_id_t * data;
+};
+void  ?{}(__scheduler_RWLock_t & this);
+void ^?{}(__scheduler_RWLock_t & this);
+extern __scheduler_RWLock_t * __scheduler_lock;
+//-----------------------------------------------------------------------
+// Reader side : acquire when using the ready queue to schedule but not
+//  creating/destroying queues. Must be called with preemption disabled and a registered this_proc_id (both verified below).
+static inline void ready_schedule_lock(void) with(*__scheduler_lock) {
+        /* paranoid */ verify( ! __preemption_enabled() );
+        /* paranoid */ verify( kernelTLS().this_proc_id );
+        unsigned iproc = kernelTLS().this_proc_id->id; // this processor's id doubles as the index of its cell in data[]
+        /*paranoid*/ verify(data[iproc].handle == kernelTLS().this_proc_id);
+        /*paranoid*/ verify(iproc < ready);
+        // Step 1 : make sure no writer is in the middle of the critical section
+        while(__atomic_load_n(&lock, (int)__ATOMIC_RELAXED))
+                Pause();
+        // Fence needed because we don't want to start trying to acquire the lock
+        // before we read a false.
+        // Not needed on x86
+        // std::atomic_thread_fence(std::memory_order_seq_cst);
+        // Step 2 : acquire our local lock
+        __atomic_acquire( &data[iproc].lock );
+        /*paranoid*/ verify(data[iproc].lock);
+        #ifdef __CFA_WITH_VERIFY__
+                // Debug, check if this is owned for reading
+                data[iproc].owned = true;
+        #endif
+}
+static inline void ready_schedule_unlock(void) with(*__scheduler_lock) { // Reader side release: drops the per-processor cell lock taken by ready_schedule_lock
+        /* paranoid */ verify( ! __preemption_enabled() );
+        /* paranoid */ verify( kernelTLS().this_proc_id );
+        unsigned iproc = kernelTLS().this_proc_id->id; // same cell index as used when locking
+        /*paranoid*/ verify(data[iproc].handle == kernelTLS().this_proc_id);
+        /*paranoid*/ verify(iproc < ready);
+        /*paranoid*/ verify(data[iproc].lock);
+        /*paranoid*/ verify(data[iproc].owned); // NOTE(review): `owned` only exists under __CFA_WITH_VERIFY__; this relies on verify() expanding to nothing otherwise — confirm
+        #ifdef __CFA_WITH_VERIFY__
+                // Debug, check if this is owned for reading
+                data[iproc].owned = false;
+        #endif
+        __atomic_unlock(&data[iproc].lock);
+}
+#ifdef __CFA_WITH_VERIFY__
+        static inline bool ready_schedule_islocked(void) { // Debug-only query: does the calling processor currently hold its reader cell? Only compiled under __CFA_WITH_VERIFY__
+                /* paranoid */ verify( ! __preemption_enabled() );
+                /*paranoid*/ verify( kernelTLS().this_proc_id );
+                __processor_id_t * proc = kernelTLS().this_proc_id;
+                return __scheduler_lock->data[proc->id].owned; // `owned` is set by ready_schedule_lock and cleared by ready_schedule_unlock
+        }
+        static inline bool ready_mutate_islocked() { // Debug-only query: is the writer lock flag currently set? Reports the flag only, not which thread holds it
+                return __scheduler_lock->lock; // plain read of the writer lock flag
+        }
+#endif
+//-----------------------------------------------------------------------
+// Writer side : acquire when changing the ready queue, e.g. adding more
+//  queues or removing them.
+uint_fast32_t ready_mutate_lock( void );
+void ready_mutate_unlock( uint_fast32_t /* value returned by lock */ );
+//=======================================================================
+// Ready-Queue API
+//-----------------------------------------------------------------------
+// pop thread from the ready queue of a cluster
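+// returns 0p if empty (NOTE(review): query() below is declared to return bool, so this description looks copied from pop() — confirm what query() reports)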
+__attribute__((hot)) bool query(struct cluster * cltr);
+//-----------------------------------------------------------------------
+// push thread onto a ready queue for a cluster
+// returns true if the list was previously empty, false otherwise
+__attribute__((hot)) bool push(struct cluster * cltr, struct $thread * thrd);
+//-----------------------------------------------------------------------
+// pop thread from the ready queue of a cluster
+// returns 0p if empty
+// May return 0p spuriously
+__attribute__((hot)) struct $thread * pop(struct cluster * cltr);
+//-----------------------------------------------------------------------
+// pop thread from the ready queue of a cluster
+// returns 0p if empty
+// guaranteed to find any threads added before this call
+__attribute__((hot)) struct $thread * pop_slow(struct cluster * cltr);
+//-----------------------------------------------------------------------
+// remove thread from the ready queue of a cluster
+// returns bool if it wasn't found
+bool remove_head(struct cluster * cltr, struct $thread * thrd);
+//-----------------------------------------------------------------------
+// Increase the width of the ready queue (number of lanes) by 4
+void ready_queue_grow  (struct cluster * cltr, int target);
+//-----------------------------------------------------------------------
+// Decrease the width of the ready queue (number of lanes) by 4
+void ready_queue_shrink(struct cluster * cltr, int target);
 // Local Variables: //

libcfa/src/concurrency/monitor.cfa

-              r3c64c668
+              r58fe85a
 // Enter single monitor
 static void __enter( $monitor * this, const __monitor_group_t & group ) {
+        $thread * thrd = active_thread();
         // Lock the monitor spinlock
         lock( this->lock __cfaabi_dbg_ctx2 );
-        // Interrupts disable inside critical section
-        $thread * thrd = kernelTLS.this_thread;
         __cfaabi_dbg_print_safe( "Kernel : %10p Entering mon %p (%p)\n", thrd, this, this->owner);
+        if( !this->owner ) {
+        if( unlikely(0 != (0x1 & (uintptr_t)this->owner)) ) {
+                abort( "Attempt by thread \"%.256s\" (%p) to access joined monitor %p.", thrd->self_cor.name, thrd, this );
+        }
+        else if( !this->owner ) {
                 // No one has the monitor, just take it
                 __set_owner( this, thrd );
 …
                 // Some one else has the monitor, wait in line for it
                 /* paranoid */ verify( thrd->next == 0p );
+                /* paranoid */ verify( thrd->link.next == 0p );
                 append( this->entry_queue, thrd );
                 /* paranoid */ verify( thrd->next == 1p );
+                /* paranoid */ verify( thrd->link.next == 1p );
                 unlock( this->lock );
 …
                 __cfaabi_dbg_print_safe( "Kernel : %10p Entered  mon %p\n", thrd, this);
                 /* paranoid */ verifyf( kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
+                /* paranoid */ verifyf( active_thread() == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this );
                 return;
+        }
 …
         __cfaabi_dbg_print_safe( "Kernel : %10p Entered  mon %p\n", thrd, this);
         /* paranoid */ verifyf( kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
+        /* paranoid */ verifyf( active_thread() == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this );
         /* paranoid */ verify( this->lock.lock );
 …
+}
+static void __dtor_enter( $monitor * this, fptr_t func ) {
+static void __dtor_enter( $monitor * this, fptr_t func, bool join ) {
+        $thread * thrd = active_thread();
+        #if defined( __CFA_WITH_VERIFY__ )
+                bool is_thrd = this == &thrd->self_mon;
+        #endif
         // Lock the monitor spinlock
         lock( this->lock __cfaabi_dbg_ctx2 );
-        // Interrupts disable inside critical section
-        $thread * thrd = kernelTLS.this_thread;
         __cfaabi_dbg_print_safe( "Kernel : %10p Entering dtor for mon %p (%p)\n", thrd, this, this->owner);
 …
                 __set_owner( this, thrd );
+                verifyf( kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
+                /* paranoid */ verifyf( active_thread() == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this );
+                /* paranoid */ verify( !is_thrd || thrd->state == Halted || thrd->state == Cancelled );
                 unlock( this->lock );
                 return;
+        }
         else if( this->owner == thrd) {
+        else if( this->owner == thrd && !join) {
                 // We already have the monitor... but we're about to destroy it so the nesting will fail
                 // Abort!
                 abort( "Attempt to destroy monitor %p by thread \"%.256s\" (%p) in nested mutex.", this, thrd->self_cor.name, thrd );
+        }
+        // SKULLDUGGERY: join will act as a dtor so it would normally trigger the above check
+        // because join will not release the monitor after it has executed.
+        // To avoid that, it sets the owner to the special value thrd | 1p before exiting
+        else if( this->owner == ($thread*)(1 | (uintptr_t)thrd) ) {
+                // restore the owner and just return
+                __cfaabi_dbg_print_safe( "Kernel : Destroying free mon %p\n", this);
+                // No one has the monitor, just take it
+                __set_owner( this, thrd );
+                /* paranoid */ verifyf( active_thread() == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this );
+                /* paranoid */ verify( !is_thrd || thrd->state == Halted || thrd->state == Cancelled );
+                unlock( this->lock );
+                return;
+        }
+        // The monitor is busy, if this is a thread and the thread owns itself, it better be active
+        /* paranoid */ verify( !is_thrd || this->owner != thrd || (thrd->state != Halted && thrd->state != Cancelled) );
         __lock_size_t count = 1;
 …
                 // Release the next thread
                 /* paranoid */ verifyf( urgent->owner->waiting_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
+                /* paranoid */ verifyf( urgent->owner->waiting_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this );
                 unpark( urgent->owner->waiting_thread );
 …
                 // Some one was waiting for us, enter
+                /* paranoid */ verifyf( kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
+                /* paranoid */ verifyf( active_thread() == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this );
+                __cfaabi_dbg_print_safe( "Kernel : Destroying %p\n", this);
+                return;
+        }
         else {
 …
                 // Some one else has the monitor, wait in line for it
                 /* paranoid */ verify( thrd->next == 0p );
+                /* paranoid */ verify( thrd->link.next == 0p );
                 append( this->entry_queue, thrd );
                 /* paranoid */ verify( thrd->next == 1p );
+                /* paranoid */ verify( thrd->link.next == 1p );
                 unlock( this->lock );
 …
                 park();
                 /* paranoid */ verifyf( kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
+                /* paranoid */ verifyf( active_thread() == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this );
                 return;
+        }
-        __cfaabi_dbg_print_safe( "Kernel : Destroying %p\n", this);
+}
 …
         lock( this->lock __cfaabi_dbg_ctx2 );
         __cfaabi_dbg_print_safe( "Kernel : %10p Leaving mon %p (%p)\n", kernelTLS.this_thread, this, this->owner);
         /* paranoid */ verifyf( kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
+        __cfaabi_dbg_print_safe( "Kernel : %10p Leaving mon %p (%p)\n", active_thread(), this, this->owner);
+        /* paranoid */ verifyf( active_thread() == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this );
         // Leaving a recursion level, decrement the counter
 …
 // Leave single monitor for the last time
 void __dtor_leave( $monitor * this ) {
+void __dtor_leave( $monitor * this, bool join ) {
         __cfaabi_dbg_debug_do(
                 if( TL_GET( this_thread ) != this->owner ) {
                         abort( "Destroyed monitor %p has inconsistent owner, expected %p got %p.\n", this, TL_GET( this_thread ), this->owner);
+                if( active_thread() != this->owner ) {
+                        abort( "Destroyed monitor %p has inconsistent owner, expected %p got %p.\n", this, active_thread(), this->owner);
+                }
                 if( this->recursion != 1 ) {
+                if( this->recursion != 1  && !join ) {
                         abort( "Destroyed monitor %p has %d outstanding nested calls.\n", this, this->recursion - 1);
+                }
+        )
+}
+extern "C" {
+        // Leave the thread monitor
+        // last routine called by a thread.
+        // Should never return
+        void __cfactx_thrd_leave() {
+                $thread * thrd = TL_GET( this_thread );
+                $monitor * this = &thrd->self_mon;
+                // Lock the monitor now
+                lock( this->lock __cfaabi_dbg_ctx2 );
+                disable_interrupts();
+                thrd->state = Halted;
+                /* paranoid */ verifyf( thrd == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", thrd, this->owner, this->recursion, this );
+                // Leaving a recursion level, decrement the counter
+                this->recursion -= 1;
+                // If we haven't left the last level of recursion
+                // it must mean there is an error
+                if( this->recursion != 0) { abort( "Thread internal monitor has unbalanced recursion" ); }
+                // Fetch the next thread, can be null
+                $thread * new_owner = next_thread( this );
+                // Release the monitor lock
+                unlock( this->lock );
+                // Unpark the next owner if needed
+                /* paranoid */ verifyf( !new_owner || new_owner == this->owner, "Expected owner to be %p, got %p (m: %p)", new_owner, this->owner, this );
+                /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+                /* paranoid */ verify( ! kernelTLS.this_processor->destroyer );
+                /* paranoid */ verify( thrd->state == Halted );
+                kernelTLS.this_processor->destroyer = new_owner;
+                // Leave the thread
+                __leave_thread();
+                // Control flow should never reach here!
+        }
+        this->owner = ($thread*)(1 | (uintptr_t)this->owner);
+}
+void __thread_finish( $thread * thrd ) { // Final hand-off of a halting thread's own monitor: releases it, marks the thread Halted, and unparks the next owner
+        $monitor * this = &thrd->self_mon;
+        // The monitor lock is not taken here: it must already be held on entry (checked by the verify on this->lock.lock below)
+        /* paranoid */ verify( 0x0D15EA5E0D15EA5Ep == thrd->canary );
+        /* paranoid */ verify( this->lock.lock );
+        /* paranoid */ verify( thrd->context.SP );
+        /* paranoid */ verifyf( ((uintptr_t)thrd->context.SP) > ((uintptr_t)__get_stack(thrd->curr_cor)->limit), "ERROR : $thread %p has been corrupted.\n StackPointer too large.\n", thrd );
+        /* paranoid */ verifyf( ((uintptr_t)thrd->context.SP) < ((uintptr_t)__get_stack(thrd->curr_cor)->base ), "ERROR : $thread %p has been corrupted.\n StackPointer too small.\n", thrd );
+        /* paranoid */ verify( ! __preemption_enabled() );
+        /* paranoid */ verifyf( thrd == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", thrd, this->owner, this->recursion, this );
+        /* paranoid */ verify( thrd->state == Halting );
+        /* paranoid */ verify( this->recursion == 1 );
+        // Leaving the last recursion level (verified to be 1 above), decrement the counter
+        this->recursion -= 1;
+        this->owner = 0p; // clear ownership before picking a successor; the verifyf after unlock expects next_thread() to leave this->owner equal to the thread it returns
+        // Fetch the next thread, can be null
+        $thread * new_owner = next_thread( this );
+        // Mark the state as fully halted
+        thrd->state = Halted;
+        // Release the monitor lock
+        unlock( this->lock );
+        // Unpark the next owner if needed
+        /* paranoid */ verifyf( !new_owner || new_owner == this->owner, "Expected owner to be %p, got %p (m: %p)", new_owner, this->owner, this );
+        /* paranoid */ verify( ! __preemption_enabled() );
+        /* paranoid */ verify( thrd->state == Halted );
+        unpark( new_owner ); // new_owner can be null (see above); NOTE(review): presumably unpark tolerates 0p — confirm
+}
 …
 // Sorts monitors before entering
 void ?{}( monitor_guard_t & this, $monitor * m [], __lock_size_t count, fptr_t func ) {
         $thread * thrd = TL_GET( this_thread );
+        $thread * thrd = active_thread();
         // Store current array
 …
         // Restore thread context
         TL_GET( this_thread )->monitors = this.prev;
+        active_thread()->monitors = this.prev;
+}
 // Ctor for monitor guard
 // Sorts monitors before entering
 void ?{}( monitor_dtor_guard_t & this, $monitor * m [], fptr_t func ) {
+void ?{}( monitor_dtor_guard_t & this, $monitor * m [], fptr_t func, bool join ) {
         // optimization
         $thread * thrd = TL_GET( this_thread );
+        $thread * thrd = active_thread();
         // Store current array
 …
         this.prev = thrd->monitors;
+        // Save whether we are in a join or not
+        this.join = join;
         // Update thread context (needed for conditions)
         (thrd->monitors){m, 1, func};
         __dtor_enter( this.m, func );
+        __dtor_enter( this.m, func, join );
+}
 …
 void ^?{}( monitor_dtor_guard_t & this ) {
         // Leave the monitors in order
         __dtor_leave( this.m );
+        __dtor_leave( this.m, this.join );
         // Restore thread context
         TL_GET( this_thread )->monitors = this.prev;
+        active_thread()->monitors = this.prev;
+}
 …
         // Create the node specific to this wait operation
         wait_ctx( TL_GET( this_thread ), user_info );
+        wait_ctx( active_thread(), user_info );
         // Append the current wait operation to the ones already queued on the condition
 …
         //Some more checking in debug
         __cfaabi_dbg_debug_do(
                 $thread * this_thrd = TL_GET( this_thread );
+                $thread * this_thrd = active_thread();
                 if ( this.monitor_count != this_thrd->monitors.size ) {
                         abort( "Signal on condition %p made with different number of monitor(s), expected %zi got %zi", &this, this.monitor_count, this_thrd->monitors.size );
 …
         // Create the node specific to this wait operation
         wait_ctx_primed( kernelTLS.this_thread, 0 )
+        wait_ctx_primed( active_thread(), 0 )
         //save contexts
 …
         //Find the thread to run
         $thread * signallee = pop_head( this.blocked )->waiting_thread;
-        /* paranoid */ verify( signallee->next == 0p );
         __set_owner( monitors, count, signallee );
 …
                                 // Create the node specific to this wait operation
                                 wait_ctx_primed( kernelTLS.this_thread, 0 );
+                                wait_ctx_primed( active_thread(), 0 );
                                 // Save monitor states
 …
         // Create the node specific to this wait operation
         wait_ctx_primed( kernelTLS.this_thread, 0 );
+        wait_ctx_primed( active_thread(), 0 );
         monitor_save;
 …
         for( __lock_size_t i = 0; i < count; i++) {
                 verify( monitors[i]->owner == kernelTLS.this_thread );
+                verify( monitors[i]->owner == active_thread() );
+        }
 …
 static inline void __set_owner( $monitor * monitors [], __lock_size_t count, $thread * owner ) {
         /* paranoid */ verify ( monitors[0]->lock.lock );
         /* paranoid */ verifyf( monitors[0]->owner == kernelTLS.this_thread, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, monitors[0]->owner, monitors[0]->recursion, monitors[0] );
+        /* paranoid */ verifyf( monitors[0]->owner == active_thread(), "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), monitors[0]->owner, monitors[0]->recursion, monitors[0] );
         monitors[0]->owner        = owner;
         monitors[0]->recursion    = 1;
         for( __lock_size_t i = 1; i < count; i++ ) {
                 /* paranoid */ verify ( monitors[i]->lock.lock );
                 /* paranoid */ verifyf( monitors[i]->owner == kernelTLS.this_thread, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, monitors[i]->owner, monitors[i]->recursion, monitors[i] );
+                /* paranoid */ verifyf( monitors[i]->owner == active_thread(), "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), monitors[i]->owner, monitors[i]->recursion, monitors[i] );
                 monitors[i]->owner        = owner;
                 monitors[i]->recursion    = 0;
 …
                 //regardless of if we are ready to baton pass,
                 //we need to set the monitor as in use
                 /* paranoid */ verifyf( !this->owner || kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
+                /* paranoid */ verifyf( !this->owner || active_thread() == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this );
                 __set_owner( this,  urgent->owner->waiting_thread );
 …
         // Get the next thread in the entry_queue
         $thread * new_owner = pop_head( this->entry_queue );
         /* paranoid */ verifyf( !this->owner || kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
         /* paranoid */ verify( !new_owner || new_owner->next == 0p );
+        /* paranoid */ verifyf( !this->owner || active_thread() == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this );
+        /* paranoid */ verify( !new_owner || new_owner->link.next == 0p );
         __set_owner( this, new_owner );
 …
+        }
         __cfaabi_dbg_print_safe( "Kernel :  Runing %i (%p)\n", ready2run, ready2run ? node->waiting_thread : 0p );
+        __cfaabi_dbg_print_safe( "Kernel :  Runing %i (%p)\n", ready2run, ready2run ? (thread*)node->waiting_thread : (thread*)0p );
         return ready2run ? node->waiting_thread : 0p;
+}
 static inline void brand_condition( condition & this ) {
         $thread * thrd = TL_GET( this_thread );
+        $thread * thrd = active_thread();
         if( !this.monitors ) {
                 // __cfaabi_dbg_print_safe( "Branding\n" );
 …
         // For each thread in the entry-queue
         for(    $thread ** thrd_it = &entry_queue.head;
                 *thrd_it != 1p;
                 thrd_it = &(*thrd_it)->next
+                (*thrd_it) != 1p;
+                thrd_it = &(*thrd_it)->link.next
         ) {
                 // For each acceptable check if it matches

libcfa/src/concurrency/monitor.hfa

-              r3c64c668
+              r58fe85a
         $monitor *    m;
         __monitor_group_t prev;
+        bool join;
 };
 void ?{}( monitor_dtor_guard_t & this, $monitor ** m, void (*func)() );
+void ?{}( monitor_dtor_guard_t & this, $monitor ** m, void (*func)(), bool join );
 void ^?{}( monitor_dtor_guard_t & this );
 …
               void wait        ( condition & this, uintptr_t user_info = 0 );
+static inline bool is_empty    ( condition & this ) { return this.blocked.head == 1p; }
               bool signal      ( condition & this );
               bool signal_block( condition & this );
 static inline bool is_empty    ( condition & this ) { return this.blocked.head == 1p; }
+static inline bool signal_all  ( condition & this ) { bool ret = false; while(!is_empty(this)) { ret = signal(this) || ret; } return ret; }
          uintptr_t front       ( condition & this );

libcfa/src/concurrency/mutex.cfa

-              r3c64c668
+              r58fe85a
         this.lock{};
         this.blocked_threads{};
+        this.is_locked = false;
+}
 …
         lock( lock __cfaabi_dbg_ctx2 );
         if( is_locked ) {
                 append( blocked_threads, kernelTLS.this_thread );
+                append( blocked_threads, active_thread() );
                 unlock( lock );
                 park();
 …
         lock( lock __cfaabi_dbg_ctx2 );
         if( owner == 0p ) {
                 owner = kernelTLS.this_thread;
+                owner = active_thread();
                 recursion_count = 1;
                 unlock( lock );
+        }
         else if( owner == kernelTLS.this_thread ) {
+        else if( owner == active_thread() ) {
                 recursion_count++;
                 unlock( lock );
+        }
         else {
                 append( blocked_threads, kernelTLS.this_thread );
+                append( blocked_threads, active_thread() );
                 unlock( lock );
                 park();
 …
         lock( lock __cfaabi_dbg_ctx2 );
         if( owner == 0p ) {
                 owner = kernelTLS.this_thread;
+                owner = active_thread();
                 recursion_count = 1;
                 ret = true;
+        }
         else if( owner == kernelTLS.this_thread ) {
+        else if( owner == active_thread() ) {
                 recursion_count++;
                 ret = true;
 …
 void wait(condition_variable & this) {
         lock( this.lock __cfaabi_dbg_ctx2 );
         append( this.blocked_threads, kernelTLS.this_thread );
+        append( this.blocked_threads, active_thread() );
         unlock( this.lock );
         park();
 …
 void wait(condition_variable & this, L & l) {
         lock( this.lock __cfaabi_dbg_ctx2 );
         append( this.blocked_threads, kernelTLS.this_thread );
+        append( this.blocked_threads, active_thread() );
         unlock(l);
         unlock(this.lock);

libcfa/src/concurrency/preemption.cfa

-              r3c64c668
+              r58fe85a
 // Created On       : Mon Jun 5 14:20:42 2017
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Thu Dec  5 16:34:05 2019
 // Update Count     : 43
+// Last Modified On : Fri Nov  6 07:42:13 2020
+// Update Count     : 54
 //
 …
 #include <assert.h>
-extern "C" {
 #include <errno.h>
 #include <stdio.h>
 …
 #include <unistd.h>
 #include <limits.h>                                                                             // PTHREAD_STACK_MIN
+}
 #include "bits/signal.hfa"
+#include "kernel_private.hfa"
 #if !defined(__CFA_DEFAULT_PREEMPTION__)
 …
 // FwdDeclarations : Signal handlers
 static void sigHandler_ctxSwitch( __CFA_SIGPARMS__ );
+static void sigHandler_alarm    ( __CFA_SIGPARMS__ );
 static void sigHandler_segv     ( __CFA_SIGPARMS__ );
 static void sigHandler_ill      ( __CFA_SIGPARMS__ );
 …
 #elif defined( __x86_64 )
 #define CFA_REG_IP gregs[REG_RIP]
 #elif defined( __ARM_ARCH )
+#elif defined( __arm__ )
 #define CFA_REG_IP arm_pc
+#elif defined( __aarch64__ )
+#define CFA_REG_IP pc
 #else
 #error unknown hardware architecture
+#error unsupported hardware architecture
 #endif
 …
 // Get next expired node
 static inline alarm_node_t * get_expired( alarm_list_t * alarms, Time currtime ) {
         if( !alarms->head ) return 0p;                                          // If no alarms return null
         if( alarms->head->alarm >= currtime ) return 0p;        // If alarms head not expired return null
+        if( ! & (*alarms)`first ) return 0p;                                            // If no alarms return null
+        if( (*alarms)`first.alarm >= currtime ) return 0p;      // If alarms head not expired return null
         return pop(alarms);                                                                     // Otherwise just pop head
+}
 // Tick one frame of the Discrete Event Simulation for alarms
 static void tick_preemption() {
+static void tick_preemption(void) {
         alarm_node_t * node = 0p;                                                       // Used in the while loop but cannot be declared in the while condition
         alarm_list_t * alarms = &event_kernel->alarms;          // Local copy for ease of reading
 …
         while( node = get_expired( alarms, currtime ) ) {
                 // __cfaabi_dbg_print_buffer_decl( " KERNEL: preemption tick.\n" );
+                Duration period = node->period;
+                if( period == 0) {
+                        node->set = false;                  // Node is one-shot, just mark it as not pending
+                }
                 // Check if this is a kernel
                 if( node->kernel_alarm ) {
+                if( node->type == Kernel ) {
                         preempt( node->proc );
+                }
+                else if( node->type == User ) {
+                        timeout( node->thrd );
+                }
                 else {
                         timeout( node->thrd );
+                        node->callback(*node);
+                }
                 // Check if this is a periodic alarm
-                Duration period = node->period;
                 if( period > 0 ) {
                         // __cfaabi_dbg_print_buffer_local( " KERNEL: alarm period is %lu.\n", period.tv );
 …
                         insert( alarms, node );             // Reinsert the node for the next time it triggers
+                }
-                else {
-                        node->set = false;                  // Node is one-shot, just mark it as not pending
+                }
+        }
         // If there are still alarms pending, reset the timer
         if( alarms->head ) {
                 __cfaabi_dbg_print_buffer_decl( " KERNEL: @%ju(%ju) resetting alarm to %ju.\n", currtime.tv, __kernel_get_time().tv, (alarms->head->alarm - currtime).tv);
                 Duration delta = alarms->head->alarm - currtime;
                 Duration caped = max(delta, 50`us);
+        if( & (*alarms)`first ) {
+                __cfadbg_print_buffer_decl(preemption, " KERNEL: @%ju(%ju) resetting alarm to %ju.\n", currtime.tv, __kernel_get_time().tv, (alarms->head->alarm - currtime).tv);
+                Duration delta = (*alarms)`first.alarm - currtime;
+                Duration capped = max(delta, 50`us);
                 // itimerval tim  = { caped };
                 // __cfaabi_dbg_print_buffer_local( "    Values are %lu, %lu, %lu %lu.\n", delta.tv, caped.tv, tim.it_value.tv_sec, tim.it_value.tv_usec);
                 __kernel_set_timer( caped );
+                __kernel_set_timer( capped );
+        }
+}
 …
 // Kernel Signal Tools
 //=============================================================================================
+__cfaabi_dbg_debug_do( static thread_local void * last_interrupt = 0; )
+// In a user-level threading system, there are a handful of thread-local variables where this problem occurs on the ARM.
+//
+// For each kernel thread running user-level threads, there is a flag variable to indicate if interrupts are
+// enabled/disabled for that kernel thread. Therefore, this variable is made thread local.
+//
+// For example, this code fragment sets the state of the "interrupt" variable in thread-local memory.
+//
+// _Thread_local volatile int interrupts;
+// int main() {
+//     interrupts = 0; // disable interrupts
+// }
+//
+// which generates the following code on the ARM
+//
+// (gdb) disassemble main
+// Dump of assembler code for function main:
+//    0x0000000000000610 <+0>:  mrs     x1, tpidr_el0
+//    0x0000000000000614 <+4>:  mov     w0, #0x0                        // #0
+//    0x0000000000000618 <+8>:  add     x1, x1, #0x0, lsl #12
+//    0x000000000000061c <+12>: add     x1, x1, #0x10
+//    0x0000000000000620 <+16>: str     wzr, [x1]
+//    0x0000000000000624 <+20>: ret
+//
+// The mrs moves a pointer from coprocessor register tpidr_el0 into register x1.  Register w0 is set to 0. The two adds
+// increase the TLS pointer with the displacement (offset) 0x10, which is the location in the TLS of variable
+// "interrupts".  Finally, 0 is stored into "interrupts" through the pointer in register x1 that points into the
+// TLS. Now once x1 has the pointer to the location of the TLS for kernel thread N, the user thread can be preempted
+// at user-level and put on the user-level ready-queue. When the preempted thread gets to the front of
+// the user-level ready-queue it is run on kernel thread M. It now stores 0 into "interrupts" back on kernel thread N,
+// turning off interrupts on the wrong kernel thread.
+//
+// On the x86, the following code is generated for the same code fragment.
+//
+// (gdb) disassemble main
+// Dump of assembler code for function main:
+//    0x0000000000400420 <+0>:  movl   $0x0,%fs:0xfffffffffffffffc
+//    0x000000000040042c <+12>: xor    %eax,%eax
+//    0x000000000040042e <+14>: retq
+//
+// and there is base-displacement addressing used to atomically reset variable "interrupts" off of the TLS pointer in
+// register "fs".
+//
+// Hence, the ARM has base-displacement address for the general purpose registers, BUT not to the coprocessor
+// registers. As a result, generating the address for the write into variable "interrupts" is no longer atomic.
+//
+// Note this problem does NOT occur when just using multiple kernel threads because the preemption ALWAYS restarts the
+// thread on the same kernel thread.
+//
+// The obvious question is why does ARM use a coprocessor register to store the TLS pointer given that coprocessor
+// registers are second-class registers with respect to the instruction set. One possible answer is that they did not
+// want to dedicate one of the general registers to hold the TLS pointer and there was a free coprocessor register
+// available.
+//-----------------------------------------------------------------------------
+// Some assembly required
+#define __cfaasm_label(label, when) when: asm volatile goto(".global __cfaasm_" #label "_" #when "\n" "__cfaasm_" #label "_" #when ":":::"memory":when)
+//----------
+// special case for preemption since used often
+// Returns the current value of __cfaabi_tls.preemption_state.enabled.
+// The TLS read is bracketed by the global assembler labels __cfaasm_check_before and
+// __cfaasm_check_after (emitted by __cfaasm_label); these delimit the "check" region
+// that preemption_ready() compares the interrupted instruction pointer against.
+bool __preemption_enabled() {
+        // create an assembler label before
+        // marked as clobber all to avoid movement
+        __cfaasm_label(check, before);
+        // access tls as normal
+        bool enabled = __cfaabi_tls.preemption_state.enabled;
+        // create an assembler label after
+        // marked as clobber all to avoid movement
+        __cfaasm_label(check, after);
+        return enabled;
+}
+// Pair of code addresses delimiting a range of instructions.
+// __cfaasm_in() treats both bounds as part of the range.
+struct asm_region {
+        void * before;  // lower bound of the region
+        void * after;   // upper bound of the region
+};
+// Returns true when ip lies within [region.before, region.after], both bounds included.
+static inline bool __cfaasm_in( void * ip, struct asm_region & region ) {
+        return ip >= region.before && ip <= region.after;
+}
+//----------
+// Get data from the TLS block
+// Reads one uintptr_t located `offset` bytes past the address of __cfaabi_tls and returns it.
+// The read is bracketed by the global assembler labels __cfaasm_get_before and
+// __cfaasm_get_after, which delimit the "get" region checked by preemption_ready().
+// struct asm_region __cfaasm_get;
+uintptr_t __cfatls_get( unsigned long int offset ) __attribute__((__noinline__)); //no inline to avoid problems
+uintptr_t __cfatls_get( unsigned long int offset ) {
+        // create an assembler label before
+        // marked as clobber all to avoid movement
+        __cfaasm_label(get, before);
+        // access tls as normal (except for pointer arithmetic)
+        uintptr_t val = *(uintptr_t*)((uintptr_t)&__cfaabi_tls + offset);
+        // create an assembler label after
+        // marked as clobber all to avoid movement
+        __cfaasm_label(get, after);
+        return val;
+}
 extern "C" {
         // Disable interrupts by incrementing the counter
         void disable_interrupts() {
+                with( kernelTLS.preemption_state ) {
+                // create a assembler label before
+                // marked as clobber all to avoid movement
+                __cfaasm_label(dsable, before);
+                with( __cfaabi_tls.preemption_state ) {
                         #if GCC_VERSION > 50000
                         static_assert(__atomic_always_lock_free(sizeof(enabled), &enabled), "Must be lock-free");
 …
                         verify( new_val < 65_000u );              // If this triggers someone is disabling interrupts without enabling them
+                }
+                // create a assembler label after
+                // marked as clobber all to avoid movement
+                __cfaasm_label(dsable, after);
+        }
 …
         // If counter reaches 0, execute any pending __cfactx_switch
         void enable_interrupts( __cfaabi_dbg_ctx_param ) {
+                processor   * proc = kernelTLS.this_processor; // Cache the processor now since interrupts can start happening after the atomic store
+                with( kernelTLS.preemption_state ){
+                // Cache the processor now since interrupts can start happening after the atomic store
+                processor   * proc = __cfaabi_tls.this_processor;
+                /* paranoid */ verify( proc );
+                with( __cfaabi_tls.preemption_state ){
                         unsigned short prev = disable_count;
                         disable_count -= 1;
+                        verify( prev != 0u );                     // If this triggers someone is enabled already enabled interruptsverify( prev != 0u );
+                        // If this triggers, someone is enabling interrupts that are already enabled
+                        /* paranoid */ verify( prev != 0u );
                         // Check if we need to prempt the thread because an interrupt was missed
                         if( prev == 1 ) {
                                 #if GCC_VERSION > 50000
                                 static_assert(__atomic_always_lock_free(sizeof(enabled), &enabled), "Must be lock-free");
+                                        static_assert(__atomic_always_lock_free(sizeof(enabled), &enabled), "Must be lock-free");
                                 #endif
 …
         // Don't execute any pending __cfactx_switch even if counter reaches 0
         void enable_interrupts_noPoll() {
+                unsigned short prev = kernelTLS.preemption_state.disable_count;
+                kernelTLS.preemption_state.disable_count -= 1;
+                verifyf( prev != 0u, "Incremented from %u\n", prev );                     // If this triggers someone is enabled already enabled interrupts
+                unsigned short prev = __cfaabi_tls.preemption_state.disable_count;
+                __cfaabi_tls.preemption_state.disable_count -= 1;
+                // If this triggers, someone is enabling interrupts that are already enabled
+                /* paranoid */ verifyf( prev != 0u, "Incremented from %u\n", prev );
                 if( prev == 1 ) {
                         #if GCC_VERSION > 50000
                         static_assert(__atomic_always_lock_free(sizeof(kernelTLS.preemption_state.enabled), &kernelTLS.preemption_state.enabled), "Must be lock-free");
+                                static_assert(__atomic_always_lock_free(sizeof(__cfaabi_tls.preemption_state.enabled), &__cfaabi_tls.preemption_state.enabled), "Must be lock-free");
                         #endif
                         // Set enabled flag to true
                         // should be atomic to avoid preemption in the middle of the operation.
                         // use memory order RELAXED since there is no inter-thread on this variable requirements
                         __atomic_store_n(&kernelTLS.preemption_state.enabled, true, __ATOMIC_RELAXED);
+                        __atomic_store_n(&__cfaabi_tls.preemption_state.enabled, true, __ATOMIC_RELAXED);
                         // Signal the compiler that a fence is needed but only for signal handlers
 …
+        }
+}
+//-----------------------------------------------------------------------------
+// Kernel Signal Debug
+// Debug check of the preemption state and signal mask of the calling thread.
+// Aborts unless __preemption_enabled() is true and the current signal mask leaves
+// SIGUSR1 and SIGTERM unblocked and SIGALRM blocked.
+void __cfaabi_check_preemption() {
+        bool ready = __preemption_enabled();
+        if(!ready) { abort("Preemption should be ready"); }
+        // the code between the two labels forms the "debug" region checked by preemption_ready()
+        __cfaasm_label(debug, before);
+                sigset_t oldset;
+                int ret;
+                // query only: the new-set argument is null, so the mask is read into oldset and not modified
+                ret = pthread_sigmask(0, ( const sigset_t * ) 0p, &oldset);  // workaround trac#208: cast should be unnecessary
+                // NOTE(review): the message below names sigprocmask although the call above is pthread_sigmask
+                if(ret != 0) { abort("ERROR sigprocmask returned %d", ret); }
+                // SIGUSR1 must not be a member of the mask, i.e. it must be deliverable
+                ret = sigismember(&oldset, SIGUSR1);
+                if(ret <  0) { abort("ERROR sigismember returned %d", ret); }
+                if(ret == 1) { abort("ERROR SIGUSR1 is disabled"); }
+                // SIGALRM must be a member of the mask, i.e. it must be blocked
+                ret = sigismember(&oldset, SIGALRM);
+                if(ret <  0) { abort("ERROR sigismember returned %d", ret); }
+                if(ret == 0) { abort("ERROR SIGALRM is enabled"); }
+                // SIGTERM must not be a member of the mask, i.e. it must be deliverable
+                ret = sigismember(&oldset, SIGTERM);
+                if(ret <  0) { abort("ERROR sigismember returned %d", ret); }
+                if(ret == 1) { abort("ERROR SIGTERM is disabled"); }
+        __cfaasm_label(debug, after);
+}
+#ifdef __CFA_WITH_VERIFY__
+// Only built when __CFA_WITH_VERIFY__ is defined.
+// Returns true exactly when __preemption_enabled() returns false.
+bool __cfaabi_dbg_in_kernel() {
+        return !__preemption_enabled();
+}
+#endif
+#undef __cfaasm_label
+//-----------------------------------------------------------------------------
+// Signal handling
 // sigprocmask wrapper : unblock a single signal
 …
         if ( pthread_sigmask( SIG_BLOCK, &mask, 0p ) == -1 ) {
             abort( "internal error, pthread_sigmask" );
+                abort( "internal error, pthread_sigmask" );
+        }
+}
 …
 // reserved for future use
 static void timeout( $thread * this ) {
+        //TODO : implement waking threads
+}
+        unpark( this );
+}
+//-----------------------------------------------------------------------------
+// Some assembly required
+#if defined( __i386 )
+        // i386: __cfaasm_label( label ) declares a local `struct asm_region label` holding the
+        // addresses of the global symbols __cfaasm_<label>_before and __cfaasm_<label>_after.
+        //
+        // With __PIC__ the symbols are reached through the GOT: RELOC_PRELUDE computes the GOT
+        // address into %eax (call/pop idiom) and each symbol is then loaded via sym@GOT(%eax).
+        // Without __PIC__ the symbol addresses are used as immediates ("$sym") and no prelude is
+        // needed, so RELOC_PRELUDE expands to nothing; it must still be defined because the asm
+        // template below uses it unconditionally.
+        #ifdef __PIC__
+                #define RELOC_PRELUDE( label ) \
+                        "calll   .Lcfaasm_prelude_" #label "$pb\n\t" \
+                        ".Lcfaasm_prelude_" #label "$pb:\n\t" \
+                        "popl    %%eax\n\t" \
+                        ".Lcfaasm_prelude_" #label "_end:\n\t" \
+                        "addl    $_GLOBAL_OFFSET_TABLE_+(.Lcfaasm_prelude_" #label "_end-.Lcfaasm_prelude_" #label "$pb), %%eax\n\t"
+                #define RELOC_PREFIX ""
+                #define RELOC_SUFFIX "@GOT(%%eax)"
+        #else
+                #define RELOC_PRELUDE( label )
+                #define RELOC_PREFIX "$"
+                #define RELOC_SUFFIX ""
+        #endif
+        // %eax is listed as clobbered because the PIC prelude overwrites it; this also keeps the
+        // compiler from allocating either output operand to %eax, which the second load still needs.
+        #define __cfaasm_label( label ) struct asm_region label = \
+                ({ \
+                        struct asm_region region; \
+                        asm( \
+                                RELOC_PRELUDE( label ) \
+                                "movl " RELOC_PREFIX "__cfaasm_" #label "_before" RELOC_SUFFIX ", %[vb]\n\t" \
+                                "movl " RELOC_PREFIX "__cfaasm_" #label "_after"  RELOC_SUFFIX ", %[va]\n\t" \
+                                 : [vb]"=r"(region.before), [va]"=r"(region.after) \
+                                 : \
+                                 : "eax" \
+                        ); \
+                        region; \
+                });
+#elif defined( __x86_64 )
+        // x86-64: __cfaasm_label( label ) declares a local `struct asm_region label` holding the
+        // addresses of the global symbols __cfaasm_<label>_before and __cfaasm_<label>_after.
+        // With __PIC__ each symbol operand gets the "@GOTPCREL(%rip)" suffix;
+        // otherwise it gets the "$" prefix, i.e. the symbol address is used as an immediate.
+        #ifdef __PIC__
+                #define RELOC_PREFIX ""
+                #define RELOC_SUFFIX "@GOTPCREL(%%rip)"
+        #else
+                #define RELOC_PREFIX "$"
+                #define RELOC_SUFFIX ""
+        #endif
+        #define __cfaasm_label( label ) struct asm_region label = \
+                ({ \
+                        struct asm_region region; \
+                        asm( \
+                                "movq " RELOC_PREFIX "__cfaasm_" #label "_before" RELOC_SUFFIX ", %[vb]\n\t" \
+                                "movq " RELOC_PREFIX "__cfaasm_" #label "_after"  RELOC_SUFFIX ", %[va]\n\t" \
+                                 : [vb]"=r"(region.before), [va]"=r"(region.after) \
+                        ); \
+                        region; \
+                });
+#elif defined( __aarch64__ )
+        // AArch64: __cfaasm_label( label ) declares a local `struct asm_region label` filled from the
+        // symbols __cfaasm_<label>_before and __cfaasm_<label>_after. Only the __PIC__ variant is
+        // provided; a non-PIC build stops at the #error below.
+        #ifdef __PIC__
+                // Note that this works only for gcc
+                // For each symbol: adrp takes the page of _GLOBAL_OFFSET_TABLE_, then ldr loads through
+                // the symbol's :gotpage_lo15: relocation relative to that page.
+                #define __cfaasm_label( label ) struct asm_region label = \
+                ({ \
+                        struct asm_region region; \
+                        asm( \
+                                "adrp %[vb], _GLOBAL_OFFSET_TABLE_"                              "\n\t" \
+                                "ldr  %[vb], [%[vb], #:gotpage_lo15:__cfaasm_" #label "_before]" "\n\t" \
+                                "adrp %[va], _GLOBAL_OFFSET_TABLE_"                              "\n\t" \
+                                "ldr  %[va], [%[va], #:gotpage_lo15:__cfaasm_" #label "_after]"  "\n\t" \
+                                 : [vb]"=r"(region.before), [va]"=r"(region.after) \
+                        ); \
+                        region; \
+                });
+        #else
+                // The non-PIC variant is disabled: the candidate implementation is kept commented out below.
+                #error this is not the right thing to do
+                /*
+                #define __cfaasm_label( label ) struct asm_region label = \
+                ({ \
+                        struct asm_region region; \
+                        asm( \
+                                "adrp %[vb], __cfaasm_" #label "_before"              "\n\t" \
+                                "add  %[vb], %[vb], :lo12:__cfaasm_" #label "_before" "\n\t" \
+                                "adrp %[va], :got:__cfaasm_" #label "_after"          "\n\t" \
+                                "add  %[va], %[va], :lo12:__cfaasm_" #label "_after"  "\n\t" \
+                                 : [vb]"=r"(region.before), [va]"=r"(region.after) \
+                        ); \
+                        region; \
+                });
+                */
+        #endif
+#else
+        #error unknown hardware architecture
+#endif
 // KERNEL ONLY
 …
 // If true  : preemption is safe
 // If false : preemption is unsafe and marked as pending
+static inline bool preemption_ready() {
+static inline bool preemption_ready( void * ip ) {
+        // Get all the region for which it is not safe to preempt
+        __cfaasm_label( get    );
+        __cfaasm_label( check  );
+        __cfaasm_label( dsable );
+        __cfaasm_label( debug  );
         // Check if preemption is safe
+        bool ready = kernelTLS.preemption_state.enabled && ! kernelTLS.preemption_state.in_progress;
+        bool ready = true;
+        if( __cfaasm_in( ip, get    ) ) { ready = false; goto EXIT; };
+        if( __cfaasm_in( ip, check  ) ) { ready = false; goto EXIT; };
+        if( __cfaasm_in( ip, dsable ) ) { ready = false; goto EXIT; };
+        if( __cfaasm_in( ip, debug  ) ) { ready = false; goto EXIT; };
+        if( !__cfaabi_tls.preemption_state.enabled) { ready = false; goto EXIT; };
+        if( __cfaabi_tls.preemption_state.in_progress ) { ready = false; goto EXIT; };
+EXIT:
         // Adjust the pending flag accordingly
         kernelTLS.this_processor->pending_preemption = !ready;
+        __cfaabi_tls.this_processor->pending_preemption = !ready;
         return ready;
+}
 …
 // Startup routine to activate preemption
 // Called from kernel_startup
 void kernel_start_preemption() {
+void __kernel_alarm_startup() {
         __cfaabi_dbg_print_safe( "Kernel : Starting preemption\n" );
         // Start with preemption disabled until ready
         kernelTLS.preemption_state.enabled = false;
         kernelTLS.preemption_state.disable_count = 1;
+        __cfaabi_tls.preemption_state.enabled = false;
+        __cfaabi_tls.preemption_state.disable_count = 1;
         // Initialize the event kernel
 …
         // Setup proper signal handlers
         __cfaabi_sigaction( SIGUSR1, sigHandler_ctxSwitch, SA_SIGINFO | SA_RESTART ); // __cfactx_switch handler
+        __cfaabi_sigaction( SIGALRM, sigHandler_alarm    , SA_SIGINFO | SA_RESTART ); // debug handler
         signal_block( SIGALRM );
 …
 // Shutdown routine to deactivate preemption
 // Called from kernel_shutdown
 void kernel_stop_preemption() {
+void __kernel_alarm_shutdown() {
         __cfaabi_dbg_print_safe( "Kernel : Preemption stopping\n" );
 …
         // Wait for the preemption thread to finish
+        pthread_join( alarm_thread, 0p );
+        free( alarm_stack );
+        __destroy_pthread( alarm_thread, alarm_stack, 0p );
         // Preemption is now fully stopped
 …
 // Kernel Signal Handlers
 //=============================================================================================
+__cfaabi_dbg_debug_do( static thread_local void * last_interrupt = 0; )
 // Context switch signal handler
 // Receives SIGUSR1 signal and causes the current thread to yield
 static void sigHandler_ctxSwitch( __CFA_SIGPARMS__ ) {
+        __cfaabi_dbg_debug_do( last_interrupt = (void *)(cxt->uc_mcontext.CFA_REG_IP); )
+        void * ip = (void *)(cxt->uc_mcontext.CFA_REG_IP);
+        __cfaabi_dbg_debug_do( last_interrupt = ip; )
         // SKULLDUGGERY: if a thread creates a processor and then immediately deletes it,
         // the interrupt that is supposed to force the kernel thread to preempt might arrive
         // before the kernel thread has even started running. When that happens an iterrupt
         // we a null 'this_processor' will be caught, just ignore it.
         if(! kernelTLS.this_processor ) return;
+        // before the kernel thread has even started running. When that happens, an interrupt
+        // with a null 'this_processor' will be caught, just ignore it.
+        if(! __cfaabi_tls.this_processor ) return;
         choose(sfp->si_value.sival_int) {
                 case PREEMPT_NORMAL   : ;// Normal case, nothing to do here
                 case PREEMPT_TERMINATE: verify( __atomic_load_n( &kernelTLS.this_processor->do_terminate, __ATOMIC_SEQ_CST ) );
+                case PREEMPT_TERMINATE: verify( __atomic_load_n( &__cfaabi_tls.this_processor->do_terminate, __ATOMIC_SEQ_CST ) );
                 default:
                         abort( "internal error, signal value is %d", sfp->si_value.sival_int );
 …
         // Check if it is safe to preempt here
         if( !preemption_ready() ) { return; }
         __cfaabi_dbg_print_buffer_decl( " KERNEL: preempting core %p (%p @ %p).\n", kernelTLS.this_processor, kernelTLS.this_thread, (void *)(cxt->uc_mcontext.CFA_REG_IP) );
+        if( !preemption_ready( ip ) ) { return; }
+        __cfaabi_dbg_print_buffer_decl( " KERNEL: preempting core %p (%p @ %p).\n", __cfaabi_tls.this_processor, __cfaabi_tls.this_thread, (void *)(cxt->uc_mcontext.CFA_REG_IP) );
         // Sync flag : prevent recursive calls to the signal handler
         kernelTLS.preemption_state.in_progress = true;
+        __cfaabi_tls.preemption_state.in_progress = true;
         // Clear sighandler mask before context switching.
 …
+        }
-        // TODO: this should go in finish action
         // Clear the in progress flag
         kernelTLS.preemption_state.in_progress = false;
+        __cfaabi_tls.preemption_state.in_progress = false;
         // Preemption can occur here
 …
         force_yield( __ALARM_PREEMPTION ); // Do the actual __cfactx_switch
+}
+// SIGALRM signal handler: aborts unconditionally with a diagnostic message.
+// NOTE(review): presumably SIGALRM is expected to be blocked and consumed elsewhere, so that
+// running this handler indicates a bug — confirm against the alarm thread.
+static void sigHandler_alarm( __CFA_SIGPARMS__ ) {
+        abort("SIGALRM should never reach the signal handler");
+}
+#if !defined(__CFA_NO_STATISTICS__)
+        int __print_alarm_stats = 0;
+#endif
 // Main of the alarm thread
 // Waits on SIGALRM and sends SIGUSR1 to whoever needs it
 static void * alarm_loop( __attribute__((unused)) void * args ) {
+        __processor_id_t id;
+        id.full_proc = false;
+        id.id = doregister(&id);
+        __cfaabi_tls.this_proc_id = &id;
+        #if !defined(__CFA_NO_STATISTICS__)
+                struct __stats_t local_stats;
+                __cfaabi_tls.this_stats = &local_stats;
+                __init_stats( &local_stats );
+        #endif
         // Block sigalrms to control when they arrive
         sigset_t mask;
 …
 EXIT:
         __cfaabi_dbg_print_safe( "Kernel : Preemption thread stopping\n" );
+        unregister(&id);
+        #if !defined(__CFA_NO_STATISTICS__)
+                if( 0 != __print_alarm_stats ) {
+                        __print_stats( &local_stats, __print_alarm_stats, "Alarm", "Thread", 0p );
+                }
+        #endif
         return 0p;
+}
-//=============================================================================================
-// Kernel Signal Debug
-//=============================================================================================
-void __cfaabi_check_preemption() {
-        bool ready = kernelTLS.preemption_state.enabled;
-        if(!ready) { abort("Preemption should be ready"); }
-        sigset_t oldset;
-        int ret;
-        ret = pthread_sigmask(0, 0p, &oldset);
-        if(ret != 0) { abort("ERROR sigprocmask returned %d", ret); }
-        ret = sigismember(&oldset, SIGUSR1);
-        if(ret <  0) { abort("ERROR sigismember returned %d", ret); }
-        if(ret == 1) { abort("ERROR SIGUSR1 is disabled"); }
-        ret = sigismember(&oldset, SIGALRM);
-        if(ret <  0) { abort("ERROR sigismember returned %d", ret); }
-        if(ret == 0) { abort("ERROR SIGALRM is enabled"); }
-        ret = sigismember(&oldset, SIGTERM);
-        if(ret <  0) { abort("ERROR sigismember returned %d", ret); }
-        if(ret == 1) { abort("ERROR SIGTERM is disabled"); }
+}
-#ifdef __CFA_WITH_VERIFY__
-bool __cfaabi_dbg_in_kernel() {
-        return !kernelTLS.preemption_state.enabled;
+}
-#endif
 // Local Variables: //

libcfa/src/concurrency/preemption.hfa

-              r3c64c668
+              r58fe85a
 #pragma once
+#include "bits/locks.hfa"
 #include "alarm.hfa"
-#include "kernel_private.hfa"
+void kernel_start_preemption();
+void kernel_stop_preemption();
+// Bundles the alarm list with a spinlock.
+// NOTE(review): presumably `lock` guards access to `alarms` — confirm against the users of event_kernel.
+struct event_kernel_t {
+        alarm_list_t alarms;
+        __spinlock_t lock;
+};
+extern event_kernel_t * event_kernel;
 void update_preemption( processor * this, Duration duration );

libcfa/src/concurrency/thread.cfa

-              r3c64c668
+              r58fe85a
 #include "kernel_private.hfa"
+#include "exception.hfa"
 #define __CFA_INVOKE_PRIVATE__
 …
         context{ 0p, 0p };
         self_cor{ name, storage, storageSize };
+        ticket = TICKET_RUNNING;
         state = Start;
         preempted = __NO_PREEMPTION;
 …
         self_mon_p = &self_mon;
         curr_cluster = &cl;
+        next = 0p;
+        link.next = 0p;
+        link.prev = 0p;
+        link.preferred = -1;
+        #if defined( __CFA_WITH_VERIFY__ )
+                canary = 0x0D15EA5E0D15EA5Ep;
+        #endif
+        seqable.next = 0p;
+        seqable.back = 0p;
         node.next = 0p;
 …
 void ^?{}($thread& this) with( this ) {
+        #if defined( __CFA_WITH_VERIFY__ )
+                canary = 0xDEADDEADDEADDEADp;
+        #endif
         unregister(curr_cluster, this);
         ^self_cor{};
+}
+FORALL_DATA_INSTANCE(ThreadCancelled, (dtype thread_t), (thread_t))
+// Copy a ThreadCancelled exception from src into dst, field by field:
+// the virtual table pointer, the thread pointer and the wrapped exception pointer.
+// Only the pointers are copied; nothing they refer to is duplicated.
+forall(dtype T)
+void copy(ThreadCancelled(T) * dst, ThreadCancelled(T) * src) {
+        dst->virtual_table = src->virtual_table;
+        dst->the_thread = src->the_thread;
+        dst->the_exception = src->the_exception;
+}
+// Message for a ThreadCancelled exception: always the fixed string "ThreadCancelled".
+// The exception argument is not inspected.
+forall(dtype T)
+const char * msg(ThreadCancelled(T) *) {
+        return "ThreadCancelled";
+}
+// Fallback handler for a ThreadCancelled exception: aborts the program with a fixed message.
+// The exception argument is not inspected.
+forall(dtype T)
+static void default_thread_cancel_handler(ThreadCancelled(T) & ) {
+        abort( "Unhandled thread cancellation.\n" );
+}
+// Constructor for thread_dtor_guard_t.
+// Sets up the embedded monitor guard for thrd's monitor and destructor, then checks whether the
+// thread was cancelled; if so, raises a ThreadCancelled resumption exception describing it and
+// releases the cancellation data afterwards.
+//   this                     - guard being constructed
+//   thrd                     - thread object being guarded
+//   defaultResumptionHandler - handler for ThreadCancelled; a null pointer selects the non-join
+//                              path, which falls back to default_thread_cancel_handler
+forall(dtype T | is_thread(T) | IS_EXCEPTION(ThreadCancelled, (T)))
+void ?{}( thread_dtor_guard_t & this,
+                T & thrd, void(*defaultResumptionHandler)(ThreadCancelled(T) &)) {
+        $monitor * m = get_monitor(thrd);
+        $thread * desc = get_thread(thrd);
+        // Setup the monitor guard
+        void (*dtor)(T& mutex this) = ^?{};
+        // A non-null handler marks this as a join; the flag is forwarded to the monitor guard.
+        bool join = defaultResumptionHandler != (void(*)(ThreadCancelled(T)&))0;
+        (this.mg){&m, (void(*)())dtor, join};
+        /* paranoid */ verifyf( Halted == desc->state || Cancelled == desc->state, "Expected thread to be Halted or Cancelled, was %d\n", (int)desc->state );
+        // After the guard set-up and any wait, check for cancellation.
+        struct _Unwind_Exception * cancellation = desc->self_cor.cancellation;
+        if ( likely( 0p == cancellation ) ) {
+                // Common case: no cancellation recorded, nothing to report.
+                return;
+        } else if ( Cancelled == desc->state ) {
+                // State is already Cancelled: do not raise the exception again.
+                return;
+        }
+        desc->state = Cancelled;
+        if (!join) {
+                // No handler was supplied: use the aborting default.
+                defaultResumptionHandler = default_thread_cancel_handler;
+        }
+        ThreadCancelled(T) except;
+        // TODO: Remove explicit vtable set once trac#186 is fixed.
+        except.virtual_table = &get_exception_vtable(&except);
+        except.the_thread = &thrd;
+        except.the_exception = __cfaehm_cancellation_exception( cancellation );
+        // NOTE(review): presumably falls back to defaultResumptionHandler when no handler is installed — confirm.
+        throwResume except;
+        // Control returned here, so release the wrapped exception and the unwind record.
+        except.the_exception->virtual_table->free( except.the_exception );
+        free( cancellation );
+        desc->self_cor.cancellation = 0p;
+}
+// Destructor for thread_dtor_guard_t: destroys the embedded monitor guard.
+void ^?{}( thread_dtor_guard_t & this ) {
+        ^(this.mg){};
+}
 …
         this_thrd->context.[SP, FP] = this_thrd->self_cor.context.[SP, FP];
         verify( this_thrd->context.SP );
+        /* paranoid */ verify( this_thrd->context.SP );
         __schedule_thread(this_thrd);
+        __schedule_thread( this_thrd );
         enable_interrupts( __cfaabi_dbg_ctx );
+}
 …
+}
+//-----------------------------------------------------------------------------
+// Join with a thread: constructs a thread_dtor_guard_t for `this`, passing defaultResumptionHandler,
+// and then returns the same thread reference.
+// NOTE(review): presumably the guard is destroyed when the function returns, releasing the
+// monitor — confirm.
+forall(dtype T | is_thread(T) | IS_RESUMPTION_EXCEPTION(ThreadCancelled, (T)))
+T & join( T & this ) {
+        thread_dtor_guard_t guard = { this, defaultResumptionHandler };
+        return this;
+}
+// Returns the value produced by __tls_rand(), with interrupts disabled for the duration of the call.
+// NOTE(review): presumably __tls_rand() touches per-kernel-thread state that must not be
+// preempted mid-update — confirm.
+uint64_t thread_rand() {
+        disable_interrupts();
+        uint64_t ret = __tls_rand();
+        enable_interrupts( __cfaabi_dbg_ctx );
+        return ret;
+}
 // Local Variables: //
 // mode: c //

libcfa/src/concurrency/thread.hfa

-              r3c64c668
+              r58fe85a
 #include "kernel.hfa"
 #include "monitor.hfa"
+#include "exception.hfa"
 //-----------------------------------------------------------------------------
 // thread trait
 trait is_thread(dtype T) {
       void ^?{}(T& mutex this);
       void main(T& this);
       $thread* get_thread(T& this);
+        void ^?{}(T& mutex this);
+        void main(T& this);
+        $thread* get_thread(T& this);
 };
+// Declares the ThreadCancelled exception, parameterized on the thread type.
+// It carries a pointer to the thread and a pointer to an exception_t.
+// NOTE(review): presumably the_exception is the exception that cancelled the thread — confirm.
+FORALL_DATA_EXCEPTION(ThreadCancelled, (dtype thread_t), (thread_t)) (
+        thread_t * the_thread;
+        exception_t * the_exception;
+);
+forall(dtype T)
+void copy(ThreadCancelled(T) * dst, ThreadCancelled(T) * src);
+forall(dtype T)
+const char * msg(ThreadCancelled(T) *);
 // define that satisfies the trait without using the thread keyword
 …
 static inline void ?{}($thread & this, const char * const name, struct cluster & cl, size_t stackSize ) { this{ name, cl, 0p, stackSize }; }
+// Guard object for a thread; it wraps a single monitor_dtor_guard_t.
+struct thread_dtor_guard_t {
+        monitor_dtor_guard_t mg;
+};
+forall( dtype T | is_thread(T) | IS_EXCEPTION(ThreadCancelled, (T)) )
+void ?{}( thread_dtor_guard_t & this, T & thrd, void(*)(ThreadCancelled(T) &) );
+void ^?{}( thread_dtor_guard_t & this );
 //-----------------------------------------------------------------------------
 // thread runner
 …
 forall( dtype T | sized(T) | is_thread(T) )
 void ^?{}( scoped(T)& this );
-//-----------------------------------------------------------------------------
-// Thread getters
-static inline struct $thread * active_thread () { return TL_GET( this_thread ); }
 //-----------------------------------------------------------------------------
 …
 bool force_yield( enum __Preemption_Reason );
+// Yield: calls force_yield with __MANUAL_PREEMPTION as the reason; the boolean result is discarded.
+static inline void yield() {
+        force_yield(__MANUAL_PREEMPTION);
+}
+//----------
+// sleep: force thread to block and be rescheduled after Duration duration
+void sleep( Duration duration );
+// Yield: call yield() `times` times in a row
+static inline void yield( unsigned times ) {
+        for( times ) {
+                yield();
+        }
+}
+//----------
+// join
+forall( dtype T | is_thread(T) | IS_RESUMPTION_EXCEPTION(ThreadCancelled, (T)) )
+T & join( T & this );
 // Local Variables: //

libcfa/src/containers/vector.hfa

-              r3c64c668
+              r58fe85a
 // Created On       : Tue Jul  5 18:00:07 2016
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Sat Jul 22 10:01:18 2017
 // Update Count     : 3
+// Last Modified On : Wed Jun 17 11:02:46 2020
+// Update Count     : 4
 //
 #pragma once
-extern "C" {
 #include <stdbool.h>
+}
 //------------------------------------------------------------------------------

libcfa/src/exception.c

-              r3c64c668
+              r58fe85a
 // Author           : Andrew Beach
 // Created On       : Mon Jun 26 15:13:00 2017
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Thu Feb 22 18:17:34 2018
 // Update Count     : 11
+// Last Modified By : Andrew Beach
+// Last Modified On : Tue Oct 27 16:27:00 2020
+// Update Count     : 35
 //
+// Normally we would get this from the CFA prelude.
 #include <stddef.h> // for size_t
+#include <unwind.h> // for struct _Unwind_Exception {...};
 #include "exception.h"
-// Implementation of the secret header.
 #include <stdlib.h>
 #include <stdio.h>
-#include <unwind.h>
 #include <bits/debug.hfa>
+// FIX ME: temporary hack to keep ARM build working
+#include "concurrency/invoke.h"
+#include "stdhdr/assert.h"
+#if defined( __ARM_ARCH )
+#warning FIX ME: temporary hack to keep ARM build working
 #ifndef _URC_FATAL_PHASE1_ERROR
 #define _URC_FATAL_PHASE1_ERROR 2
+#define _URC_FATAL_PHASE1_ERROR 3
 #endif // ! _URC_FATAL_PHASE1_ERROR
 #ifndef _URC_FATAL_PHASE2_ERROR
 #define _URC_FATAL_PHASE2_ERROR 2
 #endif // ! _URC_FATAL_PHASE2_ERROR
+#endif // __ARM_ARCH
 #include "lsda.h"
+/* The exception class for our exceptions. Because of the vendor component
+ * its value would not be standard.
+ * Vendor: UWPL
+ * Language: CFA\0
+ */
+const _Unwind_Exception_Class __cfaehm_exception_class = 0x4c50575500414643;
 // Base exception vtable is abstract, you should not have base exceptions.
 struct __cfaabi_ehm__base_exception_t_vtable
                 ___cfaabi_ehm__base_exception_t_vtable_instance = {
+struct __cfaehm_base_exception_t_vtable
+                ___cfaehm_base_exception_t_vtable_instance = {
         .parent = NULL,
         .size = 0,
 …
-// Temperary global exception context. Does not work with concurency.
-struct exception_context_t {
-    struct __cfaabi_ehm__try_resume_node * top_resume;
-    struct __cfaabi_ehm__try_resume_node * current_resume;
-    exception_t * current_exception;
-    int current_handler_index;
-} shared_stack = {NULL, NULL, 0, 0};
 // Get the current exception context.
 // There can be a single global until multithreading occurs, then each stack
+// needs its own. It will have to be updated to handle that.
+struct exception_context_t * this_exception_context() {
+// needs its own. We get this from libcfathreads (no weak attribute).
+__attribute__((weak)) struct exception_context_t * this_exception_context() {
+        static struct exception_context_t shared_stack = {NULL, NULL};
         return &shared_stack;
+}
-//#define SAVE_EXCEPTION_CONTEXT(to_name)
-//struct exception_context_t * to_name = this_exception_context();
-//exception * this_exception() {
-//    return this_exception_context()->current_exception;
-//}
-// This macro should be the only thing that needs to change across machines.
-// Used in the personality function, way down in termination.
-// struct _Unwind_Context * -> _Unwind_Reason_Code(*)(exception_t *)
-#define MATCHER_FROM_CONTEXT(ptr_to_context) \
-        (*(_Unwind_Reason_Code(**)(exception_t *))(_Unwind_GetCFA(ptr_to_context) + 8))
 // RESUMPTION ================================================================
+void __cfaabi_ehm__throw_resume(exception_t * except) {
+        __cfaabi_dbg_print_safe("Throwing resumption exception\n");
+        struct __cfaabi_ehm__try_resume_node * original_head = shared_stack.current_resume;
+        struct __cfaabi_ehm__try_resume_node * current =
+                (original_head) ? original_head->next : shared_stack.top_resume;
+        for ( ; current ; current = current->next) {
+                shared_stack.current_resume = current;
+                if (current->handler(except)) {
+                        shared_stack.current_resume = original_head;
+                        return;
+// Cleanup helper: sets the current exception context's top_resume to the pointer saved in *store.
+// Its signature (pointer to the guarded variable) fits use with __attribute__((cleanup(...))).
+static void reset_top_resume(struct __cfaehm_try_resume_node ** store) {
+        this_exception_context()->top_resume = *store;
+}
+void __cfaehm_throw_resume(exception_t * except, void (*defaultHandler)(exception_t *)) {
+        struct exception_context_t * context = this_exception_context();
+        __cfadbg_print_safe(exception, "Throwing resumption exception\n");
+        {
+                __attribute__((cleanup(reset_top_resume)))
+                struct __cfaehm_try_resume_node * original_head = context->top_resume;
+                struct __cfaehm_try_resume_node * current = context->top_resume;
+                for ( ; current ; current = current->next) {
+                        context->top_resume = current->next;
+                        if (current->handler(except)) {
+                                return;
+                        }
+                }
+        }
+        __cfaabi_dbg_print_safe("Unhandled exception\n");
+        shared_stack.current_resume = original_head;
+        // Fall back to termination:
+        __cfaabi_ehm__throw_terminate(except);
+        // TODO: Default handler for resumption.
+        } // End the search and return to the top of the stack.
+        // No handler found, fall back to the default operation.
+        __cfadbg_print_safe(exception, "Unhandled exception\n");
+        defaultHandler(except);
+}
 …
 // be added after the node is built but before it is made the top node.
 void __cfaabi_ehm__try_resume_setup(struct __cfaabi_ehm__try_resume_node * node,
+void __cfaehm_try_resume_setup(struct __cfaehm_try_resume_node * node,
                         _Bool (*handler)(exception_t * except)) {
+        node->next = shared_stack.top_resume;
+        struct exception_context_t * context = this_exception_context();
+        node->next = context->top_resume;
         node->handler = handler;
+        shared_stack.top_resume = node;
+}
+void __cfaabi_ehm__try_resume_cleanup(struct __cfaabi_ehm__try_resume_node * node) {
+        shared_stack.top_resume = node->next;
+}
+// TERMINATION ===============================================================
+// MEMORY MANAGEMENT (still for integers)
+// May have to move to cfa for constructors and destructors (references).
+struct __cfaabi_ehm__node {
+        struct __cfaabi_ehm__node * next;
+};
+        context->top_resume = node;
+}
+void __cfaehm_try_resume_cleanup(struct __cfaehm_try_resume_node * node) {
+        struct exception_context_t * context = this_exception_context();
+        context->top_resume = node->next;
+}
+// MEMORY MANAGEMENT =========================================================
 #define NODE_TO_EXCEPT(node) ((exception_t *)(1 + (node)))
+#define EXCEPT_TO_NODE(except) ((struct __cfaabi_ehm__node *)(except) - 1)
+#define EXCEPT_TO_NODE(except) ((struct __cfaehm_node *)(except) - 1)
+#define UNWIND_TO_NODE(unwind) ((struct __cfaehm_node *)(unwind))
+#define NULL_MAP(map, ptr) ((ptr) ? (map(ptr)) : NULL)
+// How to clean up an exception in various situations.
+// Every reason currently falls through to abort(); the `exception` argument is not used.
+static void __cfaehm_exception_cleanup(
+                _Unwind_Reason_Code reason,
+                struct _Unwind_Exception * exception) {
+        switch (reason) {
+        case _URC_FOREIGN_EXCEPTION_CAUGHT:
+                // This one we could clean-up to allow cross-language exceptions.
+        case _URC_FATAL_PHASE1_ERROR:
+        case _URC_FATAL_PHASE2_ERROR:
+        default:
+                // Intentional fall-through from all cases above.
+                abort();
+        }
+}
 // Creates a copy of the indicated exception and sets current_exception to it.
 static void __cfaabi_ehm__allocate_exception( exception_t * except ) {
+static void __cfaehm_allocate_exception( exception_t * except ) {
         struct exception_context_t * context = this_exception_context();
         // Allocate memory for the exception.
         struct __cfaabi_ehm__node * store = malloc(
                 sizeof( struct __cfaabi_ehm__node ) + except->virtual_table->size );
+        struct __cfaehm_node * store = malloc(
+                sizeof( struct __cfaehm_node ) + except->virtual_table->size );
         if ( ! store ) {
 …
+        }
+        // Initialize the node:
+        exception_t * except_store = NODE_TO_EXCEPT(store);
+        store->unwind_exception.exception_class = __cfaehm_exception_class;
+        store->unwind_exception.exception_cleanup = __cfaehm_exception_cleanup;
+        store->handler_index = 0;
+        except->virtual_table->copy( except_store, except );
         // Add the node to the list:
+        store->next = EXCEPT_TO_NODE(context->current_exception);
+        context->current_exception = NODE_TO_EXCEPT(store);
+        // Copy the exception to storage.
+        except->virtual_table->copy( context->current_exception, except );
+        store->next = NULL_MAP(EXCEPT_TO_NODE, context->current_exception);
+        context->current_exception = except_store;
+}
 // Delete the provided exception, unsetting current_exception if relevant.
 static void __cfaabi_ehm__delete_exception( exception_t * except ) {
         struct exception_context_t * context = this_exception_context();
         __cfaabi_dbg_print_safe("Deleting Exception\n");
+static void __cfaehm_delete_exception( exception_t * except ) {
+        struct exception_context_t * context = this_exception_context();
+        __cfadbg_print_safe(exception, "Deleting Exception\n");
         // Remove the exception from the list.
         struct __cfaabi_ehm__node * to_free = EXCEPT_TO_NODE(except);
         struct __cfaabi_ehm__node * node;
+        struct __cfaehm_node * to_free = EXCEPT_TO_NODE(except);
+        struct __cfaehm_node * node;
         if ( context->current_exception == except ) {
                 node = to_free->next;
                 context->current_exception = (node) ? NODE_TO_EXCEPT(node) : 0;
+                context->current_exception = NULL_MAP(NODE_TO_EXCEPT, node);
         } else {
                 node = EXCEPT_TO_NODE(context->current_exception);
                 // It may always be in the first or second position.
                 while( to_free != node->next ) {
+                while ( to_free != node->next ) {
                         node = node->next;
+                }
 …
+}
+// If this isn't a rethrow (*except==0), delete the provided exception.
+void __cfaabi_ehm__cleanup_terminate( void * except ) {
+        if ( *(void**)except ) __cfaabi_ehm__delete_exception( *(exception_t **)except );
+}
+// We need a piece of storage to raise the exception
+struct _Unwind_Exception this_exception_storage;
+// CANCELLATION ==============================================================
 // Function needed by force unwind
 …
                 int version,
                 _Unwind_Action actions,
                 _Unwind_Exception_Class exceptionClass,
+                _Unwind_Exception_Class exception_class,
                 struct _Unwind_Exception * unwind_exception,
+                struct _Unwind_Context * context,
+                void * some_param) {
+        if( actions & _UA_END_OF_STACK  ) exit(1);
+        if( actions & _UA_CLEANUP_PHASE ) return _URC_NO_REASON;
+        return _URC_FATAL_PHASE2_ERROR;
+                struct _Unwind_Context * unwind_context,
+                void * stop_param) {
+        // Verify actions follow the rules we expect.
+        verify(actions & _UA_CLEANUP_PHASE);
+        verify(actions & _UA_FORCE_UNWIND);
+        verify(!(actions & _UA_SEARCH_PHASE));
+        verify(!(actions & _UA_HANDLER_FRAME));
+        if ( actions & _UA_END_OF_STACK ) {
+                abort();
+        } else {
+                return _URC_NO_REASON;
+        }
+}
+// Default cancellation unwind: force-unwinds with _Stop_Fn as the stop function and 0x22 as its
+// opaque argument, returning whatever _Unwind_ForcedUnwind returns.
+// Declared weak, so another definition can take its place at link time.
+__attribute__((weak)) _Unwind_Reason_Code
+__cfaehm_cancellation_unwind( struct _Unwind_Exception * exception ) {
+        return _Unwind_ForcedUnwind( exception, _Stop_Fn, (void*)0x22 );
+}
+// Cancel the current stack, performing appropriate clean-up and messaging.
+// The given exception is stored as the current exception, every other exception still chained
+// behind it is released, and a cancellation unwind is started from the new node.
+void __cfaehm_cancel_stack( exception_t * exception ) {
+        __cfaehm_allocate_exception( exception );
+        struct exception_context_t * context = this_exception_context();
+        struct __cfaehm_node * head = EXCEPT_TO_NODE(context->current_exception);
+        // Perform clean-up of any extra active exceptions: unlink and release each node after the head.
+        for ( struct __cfaehm_node * stale = head->next ; stale ; stale = head->next ) {
+                head->next = stale->next;
+                exception_t * stale_except = NODE_TO_EXCEPT( stale );
+                stale_except->virtual_table->free( stale_except );
+                free( stale );
+        }
+        // If the unwind call returns, report its result code and abort.
+        _Unwind_Reason_Code ret = __cfaehm_cancellation_unwind( &head->unwind_exception );
+        printf("UNWIND ERROR %d after force unwind\n", ret);
+        abort();
+}
+// TERMINATION ===============================================================
+// If this isn't a rethrow (*except==0), delete the provided exception.
+// `except` is the address of an exception_t pointer; a null pointer stored there means nothing is deleted.
+void __cfaehm_cleanup_terminate( void * except ) {
+        if ( *(void**)except ) __cfaehm_delete_exception( *(exception_t **)except );
+}
+// Cleanup helper: deletes the exception *except points to, then nulls the pointer.
+// Its signature (pointer to the guarded variable) fits use with __attribute__((cleanup(...))).
+static void __cfaehm_cleanup_default( exception_t ** except ) {
+        __cfaehm_delete_exception( *except );
+        *except = NULL;
+}
 // The exception that is being thrown must already be stored.
+__attribute__((noreturn)) void __cfaabi_ehm__begin_unwind(void) {
+        if ( ! this_exception_context()->current_exception ) {
+static void __cfaehm_begin_unwind(void(*defaultHandler)(exception_t *)) {
+        struct exception_context_t * context = this_exception_context();
+        if ( NULL == context->current_exception ) {
                 printf("UNWIND ERROR missing exception in begin unwind\n");
                 abort();
+        }
+        struct _Unwind_Exception * storage =
+                &EXCEPT_TO_NODE(context->current_exception)->unwind_exception;
         // Call stdlibc to raise the exception
+        _Unwind_Reason_Code ret = _Unwind_RaiseException( &this_exception_storage );
+        __cfadbg_print_safe(exception, "Begin unwinding (storage %p, context %p)\n", storage, context);
+        _Unwind_Reason_Code ret = _Unwind_RaiseException( storage );
         // If we reach here it means something happened. For resumption to work we need to find a way
 …
         // the whole stack.
+        if( ret == _URC_END_OF_STACK ) {
+                // No proper handler was found. This can be handled in many ways, C++ calls std::terminate.
+                // Here we force unwind the stack, basically raising a cancellation.
+                printf("Uncaught exception %p\n", &this_exception_storage);
+                ret = _Unwind_ForcedUnwind( &this_exception_storage, _Stop_Fn, (void*)0x22 );
+                printf("UNWIND ERROR %d after force unwind\n", ret);
+        // We did not simply reach the end of the stack without finding a handler. This is an error.
+        if ( ret != _URC_END_OF_STACK ) {
+                printf("UNWIND ERROR %d after raise exception\n", ret);
                 abort();
+        }
+        // We did not simply reach the end of the stack without finding a handler. This is an error.
+        printf("UNWIND ERROR %d after raise exception\n", ret);
+        // No handler found, go to the default operation.
+        __cfadbg_print_safe(exception, "Uncaught exception %p\n", storage);
+        __attribute__((cleanup(__cfaehm_cleanup_default)))
+        exception_t * exception = context->current_exception;
+        defaultHandler( exception );
+}
+// Throw a termination exception.
+// Stores `val` through __cfaehm_allocate_exception, then starts unwinding, handing
+// `defaultHandler` to __cfaehm_begin_unwind.
+void __cfaehm_throw_terminate( exception_t * val, void (*defaultHandler)(exception_t *) ) {
+        __cfadbg_print_safe(exception, "Throwing termination exception\n");
+        __cfaehm_allocate_exception( val );
+        __cfaehm_begin_unwind( defaultHandler );
+}
+// Default handler passed to __cfaehm_begin_unwind on rethrow: ignores the exception and aborts.
+static __attribute__((noreturn)) void __cfaehm_rethrow_adapter( exception_t * except ) {
+        // TODO: Print some error message.
+        (void)except;
+        abort();
+}
+void __cfaabi_ehm__throw_terminate( exception_t * val ) {
+        __cfaabi_dbg_print_safe("Throwing termination exception\n");
+        __cfaabi_ehm__allocate_exception( val );
+        __cfaabi_ehm__begin_unwind();
+}
+void __cfaabi_ehm__rethrow_terminate(void) {
+        __cfaabi_dbg_print_safe("Rethrowing termination exception\n");
+        __cfaabi_ehm__begin_unwind();
+}
+#pragma GCC push_options
+#pragma GCC optimize("O0")
+// Rethrow a termination exception: starts unwinding again with __cfaehm_rethrow_adapter as the
+// default handler. No new exception is allocated here. Aborts if the unwind call returns.
+void __cfaehm_rethrow_terminate(void) {
+        __cfadbg_print_safe(exception, "Rethrowing termination exception\n");
+        __cfaehm_begin_unwind( __cfaehm_rethrow_adapter );
+        abort();
+}
+#if defined( __x86_64 ) || defined( __i386 )
 // This is our personality routine. For every stack frame annotated with
 // ".cfi_personality 0x3,__gcfa_personality_v0" this function will be called twice when unwinding.
 //  Once in the search phase and once in the cleanup phase.
+_Unwind_Reason_Code __gcfa_personality_v0 (
+                int version, _Unwind_Action actions, unsigned long long exceptionClass,
+                struct _Unwind_Exception* unwind_exception,
+                struct _Unwind_Context* context)
+_Unwind_Reason_Code __gcfa_personality_v0(
+                int version,
+                _Unwind_Action actions,
+                unsigned long long exception_class,
+                struct _Unwind_Exception * unwind_exception,
+                struct _Unwind_Context * unwind_context)
+{
+        //__cfaabi_dbg_print_safe("CFA: 0x%lx\n", _Unwind_GetCFA(context));
+        __cfaabi_dbg_print_safe("Personality function (%d, %x, %llu, %p, %p):",
+                        version, actions, exceptionClass, unwind_exception, context);
+        // If we've reached the end of the stack then there is nothing much we can do...
+        if( actions & _UA_END_OF_STACK ) return _URC_END_OF_STACK;
+        //__cfadbg_print_safe(exception, "CFA: 0x%lx\n", _Unwind_GetCFA(context));
+        __cfadbg_print_safe(exception, "Personality function (%d, %x, %llu, %p, %p):",
+                        version, actions, exception_class, unwind_exception, unwind_context);
+        // Verify that actions follow the rules we expect.
+        // This function should never be called at the end of the stack.
+        verify(!(actions & _UA_END_OF_STACK));
+        // Either only the search phase flag is set or...
         if (actions & _UA_SEARCH_PHASE) {
+                __cfaabi_dbg_print_safe(" lookup phase");
+        }
+        else if (actions & _UA_CLEANUP_PHASE) {
+                __cfaabi_dbg_print_safe(" cleanup phase");
+        }
+        // Just in case, probably can't actually happen
+        else {
+                printf(" error\n");
+                return _URC_FATAL_PHASE1_ERROR;
+                verify(actions == _UA_SEARCH_PHASE);
+                __cfadbg_print_safe(exception, " lookup phase");
+        // ... we are in clean-up phase.
+        } else {
+                verify(actions & _UA_CLEANUP_PHASE);
+                __cfadbg_print_safe(exception, " cleanup phase");
+                // We shouldn't be the handler frame during forced unwind.
+                if (actions & _UA_HANDLER_FRAME) {
+                        verify(!(actions & _UA_FORCE_UNWIND));
+                        __cfadbg_print_safe(exception, " (handler frame)");
+                } else if (actions & _UA_FORCE_UNWIND) {
+                        __cfadbg_print_safe(exception, " (force unwind)");
+                }
+        }
         // Get a pointer to the language specific data from which we will read what we need
         const unsigned char * lsd = (const unsigned char*) _Unwind_GetLanguageSpecificData( context );
         if( !lsd ) {    //Nothing to do, keep unwinding
+        const unsigned char * lsd = _Unwind_GetLanguageSpecificData( unwind_context );
+        if ( !lsd ) {   //Nothing to do, keep unwinding
                 printf(" no LSD");
                 goto UNWIND;
 …
         // Get the instruction pointer and a reading pointer into the exception table
         lsda_header_info lsd_info;
+        const unsigned char * cur_ptr = parse_lsda_header(context, lsd, &lsd_info);
+        _Unwind_Ptr instruction_ptr = _Unwind_GetIP( context );
+        const unsigned char * cur_ptr = parse_lsda_header(unwind_context, lsd, &lsd_info);
+        _Unwind_Ptr instruction_ptr = _Unwind_GetIP(unwind_context);
+        struct exception_context_t * context = this_exception_context();
         // Linearly search the table for stuff to do
         while( cur_ptr < lsd_info.action_table ) {
+        while ( cur_ptr < lsd_info.action_table ) {
                 _Unwind_Ptr callsite_start;
                 _Unwind_Ptr callsite_len;
 …
                 // Have we reached the correct frame info yet?
                 if( lsd_info.Start + callsite_start + callsite_len < instruction_ptr ) {
+                if ( lsd_info.Start + callsite_start + callsite_len < instruction_ptr ) {
 #ifdef __CFA_DEBUG_PRINT__
                         void * ls = (void*)lsd_info.Start;
 …
                         void * ep = (void*)lsd_info.Start + callsite_start + callsite_len;
                         void * ip = (void*)instruction_ptr;
                         __cfaabi_dbg_print_safe("\nfound %p - %p (%p, %p, %p), looking for %p\n",
+                        __cfadbg_print_safe(exception, "\nfound %p - %p (%p, %p, %p), looking for %p\n",
                                         bp, ep, ls, cs, cl, ip);
 #endif // __CFA_DEBUG_PRINT__
 …
                 // Have we gone too far?
                 if( lsd_info.Start + callsite_start > instruction_ptr ) {
+                if ( lsd_info.Start + callsite_start > instruction_ptr ) {
                         printf(" gone too far");
                         break;
+                }
+                // Something to do?
+                if( callsite_landing_pad ) {
+                        // Which phase are we in
+                        if (actions & _UA_SEARCH_PHASE) {
+                                // In search phase, these means we found a potential handler we must check.
+                                // We have arbitrarily decided that 0 means nothing to do and 1 means there is
+                                // a potential handler. This doesn't seem to conflict the gcc default behavior.
+                                if (callsite_action != 0) {
+                                        // Now we want to run some code to see if the handler matches
+                                        // This is the tricky part where we want to the power to run arbitrary code
+                                        // However, generating a new exception table entry and try routine every time
+                                        // is way more expansive than we might like
+                                        // The information we have is :
+                                        //  - The GR (Series of registers)
+                                        //    GR1=GP Global Pointer of frame ref by context
+                                        //  - The instruction pointer
+                                        //  - The instruction pointer info (???)
+                                        //  - The CFA (Canonical Frame Address)
+                                        //  - The BSP (Probably the base stack pointer)
+                                        // The current apprach uses one exception table entry per try block
+                                        _uleb128_t imatcher;
+                                        // Get the relative offset to the {...}?
+                                        cur_ptr = read_uleb128(cur_ptr, &imatcher);
+                                        _Unwind_Reason_Code (*matcher)(exception_t *) =
+                                                MATCHER_FROM_CONTEXT(context);
+                                        int index = matcher(shared_stack.current_exception);
+                                        _Unwind_Reason_Code ret = (0 == index)
+                                                ? _URC_CONTINUE_UNWIND : _URC_HANDLER_FOUND;
+                                        shared_stack.current_handler_index = index;
+                                        // Based on the return value, check if we matched the exception
+                                        if( ret == _URC_HANDLER_FOUND) {
+                                                __cfaabi_dbg_print_safe(" handler found\n");
+                                        } else {
+                                                __cfaabi_dbg_print_safe(" no handler\n");
+                                        }
+                                        return ret;
+                // Check for what we must do:
+                if ( 0 == callsite_landing_pad ) {
+                        // Nothing to do, move along
+                        __cfadbg_print_safe(exception, " no landing pad");
+                } else if (actions & _UA_SEARCH_PHASE) {
+                        // In search phase, this means we found a potential handler we must check.
+                        // We have arbitrarily decided that 0 means nothing to do and 1 means there is
+                        // a potential handler. This doesn't seem to conflict with the gcc default behavior.
+                        if (callsite_action != 0) {
+                                // Now we want to run some code to see if the handler matches
+                                // This is the tricky part where we want the power to run arbitrary code
+                                // However, generating a new exception table entry and try routine every time
+                                // is way more expensive than we might like
+                                // The information we have is :
+                                //  - The GR (Series of registers)
+                                //    GR1=GP Global Pointer of frame ref by context
+                                //  - The instruction pointer
+                                //  - The instruction pointer info (???)
+                                //  - The CFA (Canonical Frame Address)
+                                //  - The BSP (Probably the base stack pointer)
+                                // The current approach uses one exception table entry per try block
+                                _uleb128_t imatcher;
+                                // Get the relative offset to the {...}?
+                                cur_ptr = read_uleb128(cur_ptr, &imatcher);
+                                _Unwind_Word match_pos =
+#                               if defined( __x86_64 )
+                                    _Unwind_GetCFA(unwind_context) + 8;
+#                               elif defined( __i386 )
+                                    _Unwind_GetCFA(unwind_context) + 24;
+#                               elif defined( __ARM_ARCH )
+#                                   warning FIX ME: check if anything needed for ARM
+;
+#                               endif
+                                int (*matcher)(exception_t *) = *(int(**)(exception_t *))match_pos;
+                                int index = matcher(context->current_exception);
+                                _Unwind_Reason_Code ret = (0 == index)
+                                        ? _URC_CONTINUE_UNWIND : _URC_HANDLER_FOUND;
+                                UNWIND_TO_NODE(unwind_exception)->handler_index = index;
+                                // Based on the return value, check if we matched the exception
+                                if (ret == _URC_HANDLER_FOUND) {
+                                        __cfadbg_print_safe(exception, " handler found\n");
+                                } else {
+                                        // TODO: Continue the search if there is more in the table.
+                                        __cfadbg_print_safe(exception, " no handler\n");
+                                }
+                                // This is only a cleanup handler, ignore it
+                                __cfaabi_dbg_print_safe(" no action");
+                                return ret;
+                        }
+                        else if (actions & _UA_CLEANUP_PHASE) {
+                                if( (callsite_action != 0) && !(actions & _UA_HANDLER_FRAME) ){
+                                        // If this is a potential exception handler
+                                        // but not the one that matched the exception in the seach phase,
+                                        // just ignore it
+                                        goto UNWIND;
+                                }
+                                // We need to run some clean-up or a handler
+                                // These statment do the right thing but I don't know any specifics at all
+                                _Unwind_SetGR( context, __builtin_eh_return_data_regno(0), (_Unwind_Ptr) unwind_exception );
+                                _Unwind_SetGR( context, __builtin_eh_return_data_regno(1), 0 );
+                                // I assume this sets the instruction pointer to the adress of the landing pad
+                                // It doesn't actually set it, it only state the value that needs to be set once we return _URC_INSTALL_CONTEXT
+                                _Unwind_SetIP( context, ((lsd_info.LPStart) + (callsite_landing_pad)) );
+                                __cfaabi_dbg_print_safe(" action\n");
+                                // Return have some action to run
+                                return _URC_INSTALL_CONTEXT;
+                        // This is only a cleanup handler, ignore it
+                        __cfadbg_print_safe(exception, " no action");
+                } else {
+                        // In clean-up phase, no destructors here but this could be the handler.
+                        if ( (callsite_action != 0) && !(actions & _UA_HANDLER_FRAME) ){
+                                // If this is a potential exception handler
+                                // but not the one that matched the exception in the search phase,
+                                // just ignore it
+                                goto UNWIND;
+                        }
+                        // We need to run some clean-up or a handler
+                        // These statements do the right thing but I don't know any specifics at all
+                        _Unwind_SetGR( unwind_context, __builtin_eh_return_data_regno(0),
+                                (_Unwind_Ptr)unwind_exception );
+                        _Unwind_SetGR( unwind_context, __builtin_eh_return_data_regno(1), 0 );
+                        // I assume this sets the instruction pointer to the address of the landing pad
+                        // It doesn't actually set it, it only states the value that needs to be set once we
+                        // return _URC_INSTALL_CONTEXT
+                        _Unwind_SetIP( unwind_context, ((lsd_info.LPStart) + (callsite_landing_pad)) );
+                        __cfadbg_print_safe(exception, " action\n");
+                        // Return: we have some action to run
+                        return _URC_INSTALL_CONTEXT;
+                }
-                // Nothing to do, move along
-                __cfaabi_dbg_print_safe(" no landing pad");
+        }
         // No handling found
         __cfaabi_dbg_print_safe(" table end reached\n");
+        __cfadbg_print_safe(exception, " table end reached");
         UNWIND:
         __cfaabi_dbg_print_safe(" unwind\n");
+        __cfadbg_print_safe(exception, " unwind\n");
         // Keep unwinding the stack
         return _URC_CONTINUE_UNWIND;
+}
+#pragma GCC push_options
+#pragma GCC optimize(0)
 // Try statements are hoisted out; see comments for details. This could probably be unique
 // and simply linked from libcfa, but there is one problem left; see the exception table for details.
 __attribute__((noinline))
 void __cfaabi_ehm__try_terminate(void (*try_block)(),
+void __cfaehm_try_terminate(void (*try_block)(),
                 void (*catch_block)(int index, exception_t * except),
                 __attribute__((unused)) int (*match_block)(exception_t * except)) {
 …
         //! printf("%p %p %p %p\n", &try_block, &catch_block, &match_block, &xy);
-        // Setup statments: These 2 statments won't actually result in any code, they only setup global tables.
-        // However, they clobber gcc cancellation support from gcc.  We can replace the personality routine but
-        // replacing the exception table gcc generates is not really doable, it generates labels based on how the
-        // assembly works.
         // Setup the personality routine and exception table.
+        // Unfortunately these clobber gcc cancellation support which means we can't get access to
+        // the attribute cleanup tables at the same time. We would have to inspect the assembly to
+        // create a new set ourselves.
 #ifdef __PIC__
         asm volatile (".cfi_personality 0x9b,CFA.ref.__gcfa_personality_v0");
 …
         // Label which defines the end of the area for which the handler is setup.
         asm volatile (".TRYEND:");
         // Label which defines the start of the exception landing pad.  Basically what is called when the exception is
         // caught.  Note, if multiple handlers are given, the multiplexing should be done by the generated code, not the
         // exception runtime.
+        // Label which defines the start of the exception landing pad. Basically what is called when
+        // the exception is caught. Note, if multiple handlers are given, the multiplexing should be
+        // done by the generated code, not the exception runtime.
         asm volatile (".CATCH:");
         // Exception handler
+        catch_block( shared_stack.current_handler_index,
+                     shared_stack.current_exception );
+        // Note: Saving the exception context on the stack breaks termination exceptions.
+        catch_block( EXCEPT_TO_NODE( this_exception_context()->current_exception )->handler_index,
+                     this_exception_context()->current_exception );
+}
 …
 #ifdef __PIC__
-#if defined( __i386 ) || defined( __x86_64 )
 asm (
         // HEADER
 …
         // handler landing pad offset and 1 (action code, gcc seems to use 0).
         ".LLSDACSBCFA2:\n"
         "       .uleb128 .TRYSTART-__cfaabi_ehm__try_terminate\n"
+        "       .uleb128 .TRYSTART-__cfaehm_try_terminate\n"
         "       .uleb128 .TRYEND-.TRYSTART\n"
         "       .uleb128 .CATCH-__cfaabi_ehm__try_terminate\n"
+        "       .uleb128 .CATCH-__cfaehm_try_terminate\n"
         "       .uleb128 1\n"
         ".LLSDACSECFA2:\n"
         // TABLE FOOTER
         "       .text\n"
         "       .size   __cfaabi_ehm__try_terminate, .-__cfaabi_ehm__try_terminate\n"
+        "       .size   __cfaehm_try_terminate, .-__cfaehm_try_terminate\n"
 );
 …
         "       .quad __gcfa_personality_v0\n"
 #else // then __i386
         "   .long __gcfa_personality_v0\n"
+        "       .long __gcfa_personality_v0\n"
 #endif
 );
-#else
-#error Exception Handling: unknown architecture for position independent code.
-#endif // __i386 || __x86_64
 #else // __PIC__
-#if defined( __i386 ) || defined( __x86_64 )
 asm (
         // HEADER
 …
         ".LLSDACSBCFA2:\n"
         //      Handled area start (relative to start of function)
         "       .uleb128 .TRYSTART-__cfaabi_ehm__try_terminate\n"
+        "       .uleb128 .TRYSTART-__cfaehm_try_terminate\n"
         //      Handled area length
         "       .uleb128 .TRYEND-.TRYSTART\n"
         //      Handler landing pad address (relative to start of function)
         "       .uleb128 .CATCH-__cfaabi_ehm__try_terminate\n"
+        "       .uleb128 .CATCH-__cfaehm_try_terminate\n"
         //      Action code, gcc seems to always use 0.
         "       .uleb128 1\n"
 …
         ".LLSDACSECFA2:\n"
         "       .text\n"
         "       .size   __cfaabi_ehm__try_terminate, .-__cfaabi_ehm__try_terminate\n"
+        "       .size   __cfaehm_try_terminate, .-__cfaehm_try_terminate\n"
         "       .ident  \"GCC: (Ubuntu 6.2.0-3ubuntu11~16.04) 6.2.0 20160901\"\n"
         "       .section        .note.GNU-stack,\"x\",@progbits\n"
 );
+#endif // __PIC__
+#pragma GCC pop_options
+#elif defined( __ARM_ARCH )
+_Unwind_Reason_Code __gcfa_personality_v0( // ARM placeholder personality routine: none of the arguments are inspected.
+                int version,
+                _Unwind_Action actions,
+                unsigned long long exception_class,
+                struct _Unwind_Exception * unwind_exception,
+                struct _Unwind_Context * unwind_context) {
+        return _URC_CONTINUE_UNWIND; // never selects a handler or installs a landing pad; always tells the unwinder to keep going
+}
+__attribute__((noinline))
+void __cfaehm_try_terminate(void (*try_block)(),
+                void (*catch_block)(int index, exception_t * except),
+                __attribute__((unused)) int (*match_block)(exception_t * except)) {
+} // ARM placeholder: the body is empty, so neither try_block nor catch_block is ever called here
 #else
+#error Exception Handling: unknown architecture for position dependent code.
+#endif // __i386 || __x86_64
+#endif // __PIC__
+#pragma GCC pop_options
+        #error unsupported hardware architecture
+#endif // __x86_64 || __i386

libcfa/src/exception.h

-              r3c64c668
+              r58fe85a
 // file "LICENCE" distributed with Cforall.
 //
 // exception.h -- Builtins for exception handling.
+// exception.h -- Internal exception handling definitions.
 //
 // Author           : Andrew Beach
 // Created On       : Mon Jun 26 15:11:00 2017
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Thu Feb 22 18:11:15 2018
 // Update Count     : 8
+// Last Modified By : Andrew Beach
+// Last Modified On : Tue Oct 27 14:45:00 2020
+// Update Count     : 11
 //
 #pragma once
+// This could be considered several headers. All are internal to the exception
+// system, but which ones are needed depends on whether the including code is
+// C or Cforall and whether or not it is part of the builtins.
 #ifdef __cforall
 …
 #endif
+struct __cfaabi_ehm__base_exception_t;
+typedef struct __cfaabi_ehm__base_exception_t exception_t;
+struct __cfaabi_ehm__base_exception_t_vtable {
+        const struct __cfaabi_ehm__base_exception_t_vtable * parent;
+// Included in C code or the built-ins.
+#if !defined(__cforall) || defined(__cforall_builtins__)
+struct __cfaehm_base_exception_t;
+typedef struct __cfaehm_base_exception_t exception_t;
+struct __cfaehm_base_exception_t_vtable {
+        const struct __cfaehm_base_exception_t_vtable * parent;
         size_t size;
         void (*copy)(struct __cfaabi_ehm__base_exception_t *this,
                      struct __cfaabi_ehm__base_exception_t * other);
         void (*free)(struct __cfaabi_ehm__base_exception_t *this);
         const char * (*msg)(struct __cfaabi_ehm__base_exception_t *this);
+        void (*copy)(struct __cfaehm_base_exception_t *this,
+                     struct __cfaehm_base_exception_t * other);
+        void (*free)(struct __cfaehm_base_exception_t *this);
+        const char * (*msg)(struct __cfaehm_base_exception_t *this);
 };
 struct __cfaabi_ehm__base_exception_t {
         struct __cfaabi_ehm__base_exception_t_vtable const * virtual_table;
+struct __cfaehm_base_exception_t {
+        struct __cfaehm_base_exception_t_vtable const * virtual_table;
 };
 extern struct __cfaabi_ehm__base_exception_t_vtable
         ___cfaabi_ehm__base_exception_t_vtable_instance;
+extern struct __cfaehm_base_exception_t_vtable
+        ___cfaehm_base_exception_t_vtable_instance;
+void __cfaehm_cancel_stack(exception_t * except) __attribute__((noreturn));
 // Used in throw statement translation.
 void __cfaabi_ehm__throw_terminate(exception_t * except) __attribute__((noreturn));
 void __cfaabi_ehm__rethrow_terminate() __attribute__((noreturn));
 void __cfaabi_ehm__throw_resume(exception_t * except);
+void __cfaehm_throw_terminate(exception_t * except, void (*)(exception_t *));
+void __cfaehm_rethrow_terminate() __attribute__((noreturn));
+void __cfaehm_throw_resume(exception_t * except, void (*)(exception_t *));
 // Function catches termination exceptions.
 void __cfaabi_ehm__try_terminate(
     void (*try_block)(),
     void (*catch_block)(int index, exception_t * except),
     int (*match_block)(exception_t * except));
+void __cfaehm_try_terminate(
+        void (*try_block)(),
+        void (*catch_block)(int index, exception_t * except),
+        int (*match_block)(exception_t * except));
 // Clean-up the exception in catch blocks.
 void __cfaabi_ehm__cleanup_terminate(void * except);
+void __cfaehm_cleanup_terminate(void * except);
 // Data structure creates a list of resume handlers.
 struct __cfaabi_ehm__try_resume_node {
     struct __cfaabi_ehm__try_resume_node * next;
     _Bool (*handler)(exception_t * except);
+struct __cfaehm_try_resume_node {
+        struct __cfaehm_try_resume_node * next;
+        _Bool (*handler)(exception_t * except);
 };
 // These act as constructor and destructor for the resume node.
 void __cfaabi_ehm__try_resume_setup(
     struct __cfaabi_ehm__try_resume_node * node,
     _Bool (*handler)(exception_t * except));
 void __cfaabi_ehm__try_resume_cleanup(
     struct __cfaabi_ehm__try_resume_node * node);
+void __cfaehm_try_resume_setup(
+        struct __cfaehm_try_resume_node * node,
+        _Bool (*handler)(exception_t * except));
+void __cfaehm_try_resume_cleanup(
+        struct __cfaehm_try_resume_node * node);
 // Check for a standard way to call fake deconstructors.
+struct __cfaabi_ehm__cleanup_hook {};
+struct __cfaehm_cleanup_hook {};
+#endif
+// Included in C code and the library.
+#if !defined(__cforall) || !defined(__cforall_builtins__)
+struct __cfaehm_node { // Bookkeeping record kept per exception; it begins with the unwinder's own header.
+        struct _Unwind_Exception unwind_exception; // first member, so a pointer to this header and a pointer to the node share an address
+        struct __cfaehm_node * next; // link to another node; how the list is maintained is not visible in this header
+        int handler_index; // index of the handler chosen for this exception — presumably the matcher's result; confirm in exception.c
+};
+static inline exception_t * __cfaehm_cancellation_exception( // Maps an unwinder header pointer to an exception pointer.
+                struct _Unwind_Exception * unwind_exception ) {
+        return (exception_t *)(1 + (struct __cfaehm_node *)unwind_exception); // address one whole node past the header; assumes the exception object is stored directly after its node — TODO confirm against the allocation code
+}
+#endif
 #ifdef __cforall
+}
+// Built-ins not visible in C.
+#if defined(__cforall_builtins__)
+// Not all the built-ins can be expressed in C. These can't be
+// implemented in the .c file either so they all have to be inline.
+trait is_exception(dtype exceptT, dtype virtualT) {
+        /* The first field must be a pointer to a virtual table.
+         * That virtual table must be a descendant of the base exception virtual table.
+         */
+        virtualT const & get_exception_vtable(exceptT *);
+        // Always returns the virtual table for this type (associated types hack).
+};
+trait is_termination_exception(dtype exceptT, dtype virtualT | is_exception(exceptT, virtualT)) {
+        void defaultTerminationHandler(exceptT &);
+};
+trait is_resumption_exception(dtype exceptT, dtype virtualT | is_exception(exceptT, virtualT)) {
+        void defaultResumptionHandler(exceptT &);
+};
+forall(dtype exceptT, dtype virtualT | is_termination_exception(exceptT, virtualT))
+static inline void $throw(exceptT & except) { // Typed front end that forwards to __cfaehm_throw_terminate.
+        __cfaehm_throw_terminate(
+                (exception_t *)&except, // type-erased pointer to the caller's exception object
+                (void(*)(exception_t *))defaultTerminationHandler // the handler resolved for exceptT, cast to the type-erased signature
+        );
+}
+forall(dtype exceptT, dtype virtualT | is_resumption_exception(exceptT, virtualT))
+static inline void $throwResume(exceptT & except) { // Typed front end that forwards to __cfaehm_throw_resume.
+        __cfaehm_throw_resume(
+                (exception_t *)&except, // type-erased pointer to the caller's exception object
+                (void(*)(exception_t *))defaultResumptionHandler // the handler resolved for exceptT, cast to the type-erased signature
+        );
+}
+forall(dtype exceptT, dtype virtualT | is_exception(exceptT, virtualT))
+static inline void cancel_stack(exceptT & except) __attribute__((noreturn)) { // Typed front end for __cfaehm_cancel_stack; does not return.
+        __cfaehm_cancel_stack( (exception_t *)&except ); // pass the exception as a type-erased pointer
+}
+forall(dtype exceptT, dtype virtualT | is_exception(exceptT, virtualT))
+static inline void defaultTerminationHandler(exceptT & except) { // Generic fallback available to any type satisfying is_exception.
+        return cancel_stack( except ); // cancel the stack with this exception; cancel_stack is declared noreturn
+}
+forall(dtype exceptT, dtype virtualT | is_exception(exceptT, virtualT))
+static inline void defaultResumptionHandler(exceptT & except) { // Generic fallback available to any type satisfying is_exception.
+        throw except; // raise the same exception again using a (termination) throw
+}
 #endif
+#endif

libcfa/src/executor.cfa

-              r3c64c668
+              r58fe85a
 // buffer.
-#include <bits/containers.hfa>
 #include <thread.hfa>
 #include <stdio.h>
+#include <containers/list.hfa>
+forall( dtype T )
+monitor Buffer {                                        // unbounded buffer
+    __queue_t( T ) queue;                               // unbounded list of work requests
+    condition delay;
+}; // Buffer
+forall( dtype T | is_node(T) ) {
+    void insert( Buffer( T ) & mutex buf, T * elem ) with(buf) {
+        append( queue, elem );                          // insert element into buffer
+        signal( delay );                                // restart
+    } // insert
+forall( dtype T | $dlistable(T, T) ) {
+        monitor Buffer {                                                                        // unbounded buffer
+                dlist( T, T ) queue;                                                    // unbounded list of work requests
+                condition delay;
+        }; // Buffer
     T * remove( Buffer( T ) & mutex buf ) with(buf) {
         if ( queue.head != 0 ) wait( delay );                   // no request to process ? => wait
+//      return pop_head( queue );
+    } // remove
+} // distribution
+        void insert( Buffer(T) & mutex buf, T * elem ) with(buf) {
+                dlist( T, T ) * qptr = &queue;                                  // workaround https://cforall.uwaterloo.ca/trac/ticket/166
+                insert_last( *qptr, *elem );                                    // insert element into buffer
+                signal( delay );                                                                // restart
+        } // insert
+struct WRequest {                                       // client request, no return
+    void (* action)( void );
+    WRequest * next;                                    // intrusive queue field
+        T * remove( Buffer(T) & mutex buf ) with(buf) { // take the first queued element, or 0p when the queue is empty
+                dlist( T, T ) * qptr = &queue;                                  // workaround https://cforall.uwaterloo.ca/trac/ticket/166
+                // if ( (*qptr)`is_empty ) wait( delay );                       // no request to process ? => wait (disabled blocking variant)
+          if ( (*qptr)`is_empty ) return 0p;                            // no request to process ? => return null pointer rather than wait
+                return &pop_first( *qptr );
+        } // remove
+} // forall
+struct WRequest {                                                                               // client request, no return
+        void (* action)( void );
+        DLISTED_MGD_IMPL_IN(WRequest)
 }; // WRequest
+DLISTED_MGD_IMPL_OUT(WRequest)
+WRequest *& get_next( WRequest & this ) { return this.next; }
+void ?{}( WRequest & req ) with(req) { action = 0; next = 0; }
+void ?{}( WRequest & req, void (* action)( void ) ) with(req) { req.action = action; next = 0; }
+void ?{}( WRequest & req ) with(req) { action = 0; }
+void ?{}( WRequest & req, void (* action)( void ) ) with(req) { req.action = action; }
 bool stop( WRequest & req ) { return req.action == 0; }
 void doit( WRequest & req ) { req.action(); }
 // Each worker has its own work buffer to reduce contention between client and server. Hence, work requests arrive and
 // are distributed into buffers in a roughly round-robin order.
+// Each worker has its own set (when requests buffers > workers) of work buffers to reduce contention between client
+// and server, where work requests arrive and are distributed into buffers in a roughly round-robin order.
 thread Worker {
+    Buffer( WRequest ) * requests;
+    unsigned int start, range;
+        Buffer(WRequest) * requests;
+        WRequest * request;
+        unsigned int start, range;
 }; // Worker
 void main( Worker & w ) with(w) {
     for ( int i = 0;; i = (i + 1) % range ) {
         WRequest * request = remove( requests[i + start] );
       if ( ! request ) { yield(); continue; }
       if ( stop( *request ) ) break;
         doit( *request );
         delete( request );
     } // for
+        for ( int i = 0;; i = (i + 1) % range ) {
+                request = remove( requests[i + start] );
+          if ( ! request ) { yield(); continue; }
+          if ( stop( *request ) ) break;
+                doit( *request );
+                delete( request );
+        } // for
 } // Worker::main
 void ?{}( Worker & worker, cluster * wc, Buffer( WRequest ) * requests, unsigned int start, unsigned int range ) {
+    (*get_thread(worker)){ *wc };                       // create on given cluster
     worker.[requests, start, range] = [requests, start, range];
+void ?{}( Worker & worker, cluster * wc, Buffer(WRequest) * requests, unsigned int start, unsigned int range ) {
+        ((thread &)worker){ *wc };
+        worker.[requests, request, start, range] = [requests, 0p, start, range];
 } // ?{}
+WRequest * current_request( Worker & worker ) { return worker.request; }
 struct Executor {
+    cluster * cluster;                                  // if workers execute on separate cluster
+    processor ** processors;                            // array of virtual processors adding parallelism for workers
+    Buffer( WRequest ) * requests;                      // list of work requests
+    Worker ** workers;                                  // array of workers executing work requests
+    unsigned int nprocessors, nworkers, nmailboxes;     // number of mailboxes/workers/processor tasks
+    bool sepClus;                                       // use same or separate cluster for executor
+        cluster * cluster;                                                                      // if workers execute on separate cluster
+        processor ** processors;                                                        // array of virtual processors adding parallelism for workers
+        Buffer(WRequest) * requests;                                            // list of work requests
+        Worker ** workers;                                                                      // array of workers executing work requests
+        unsigned int nprocessors, nworkers, nrqueues;           // number of processors/threads/request queues
+        bool sepClus;                                                                           // use same or separate cluster for executor
+        unsigned int next;                                                                      // demultiplexed across worker buffers
 }; // Executor
-static thread_local unsigned int next;                  // demultiplexed across worker buffers
 unsigned int tickets( Executor & ex ) with(ex) {
     //return uFetchAdd( next, 1 ) % nmailboxes;
     return next++ % nmailboxes;                         // no locking, interference randomizes
+        //return uFetchAdd( next, 1 ) % nrqueues;
+        return next++ % nrqueues;                                                       // no locking, interference randomizes
 } // tickets
 void ?{}( Executor & ex, unsigned int np, unsigned int nw, unsigned int nm, bool sc = false ) with(ex) {
     [nprocessors, nworkers, nmailboxes, sepClus] = [np, nw, nm, sc];
     assert( nmailboxes >= nworkers );
     cluster = sepClus ? new( "Executor" ) : active_cluster();
     processors = (processor **)anew( nprocessors );
     requests = anew( nmailboxes );
     workers = (Worker **)anew( nworkers );
+void ?{}( Executor & ex, unsigned int np, unsigned int nw, unsigned int nr, bool sc = false ) with(ex) { // np processors, nw workers, nr request queues, sc => separate cluster
+        [nprocessors, nworkers, nrqueues, sepClus] = [np, nw, nr, sc];
+        assert( nrqueues >= nworkers );                 // need at least one request queue per worker
+        cluster = sepClus ? new( "Executor" ) : active_cluster(); // own cluster on request, otherwise the active cluster
+        processors = aalloc( nprocessors );
+        requests = anew( nrqueues );
+        workers = aalloc( nworkers );
     for ( i; nprocessors ) {
         processors[ i ] = new( *cluster );
     } // for
+        for ( i; nprocessors ) {
+                processors[i] = new( *cluster );        // each processor is created on the executor's cluster
+        } // for
+    unsigned int reqPerWorker = nmailboxes / nworkers, extras = nmailboxes % nworkers;
+    for ( unsigned int i = 0, step = 0; i < nworkers; i += 1, step += reqPerWorker + ( i < extras ? 1 : 0 ) ) {
+        workers[ i ] = new( cluster, requests, step, reqPerWorker + ( i < extras ? 1 : 0 ) );
+    } // for
+        unsigned int reqPerWorker = nrqueues / nworkers, extras = nrqueues % nworkers; // base queues per worker; the first "extras" workers get one more
+//      for ( unsigned int i = 0, start = 0, range; i < nworkers; i += 1, start += range ) {
+    for ( i; nworkers : start; 0u ~ @ ~ range : range; ) {
+            range = reqPerWorker + ( i < extras ? 1 : 0 ); // number of queues handed to worker i
+                workers[i] = new( cluster, requests, start, range ); // NOTE(review): presumably worker i serves requests[start .. start+range) -- Worker constructor not shown here
+        } // for
 } // ?{}
 void ?{}( Executor & ex, unsigned int nprocessors, unsigned int nworkers, bool sepClus = false ) {
     ex{ nprocessors, nworkers, nworkers, sepClus };
+        ex{ nprocessors, nworkers, nworkers, sepClus }; // delegate with one request queue per worker
+}
 void ?{}( Executor & ex, unsigned int nprocessors, bool sepClus = false ) {
     ex{ nprocessors, nprocessors, nprocessors, sepClus };
+        ex{ nprocessors, nprocessors, nprocessors, sepClus }; // delegate with one worker and one request queue per processor
+}
 void ?{}( Executor & ex ) {                             // special for current cluster
     ex{ 0, active_cluster()->nprocessors, false };
+void ?{}( Executor & ex ) {                                                             // special for current cluster, no processors added
+        ex{ 0, active_cluster()->nprocessors, false }; // 0 new processors, one worker per processor of the active cluster, no separate cluster
+}
 void ^?{}( Executor & ex ) with(ex) {
     // Add one sentinel per worker to stop them. Since in destructor, no new work should be queued.  Cannot combine next
+    // two loops and only have a single sentinel because workers arrive in arbitrary order, so worker1 may take the
     // single sentinel while waiting for worker 0 to end.
+        // Add one sentinel per worker to stop them. Since in destructor, no new external work should be queued.  Cannot
+        // combine next two loops and only have a single sentinel because workers arrive in arbitrary order, so worker1 may
+        // take the single sentinel while waiting for worker 0 to end.
     WRequest sentinel[nworkers];
     unsigned int reqPerWorker = nmailboxes / nworkers;
     for ( unsigned int i = 0, step = 0; i < nworkers; i += 1, step += reqPerWorker ) {
         insert( requests[step], &sentinel[i] );         // force eventually termination
     } // for
     for ( i; nworkers ) {
         delete( workers[ i ] );
     } // for
     for ( i; nprocessors ) {
         delete( processors[ i ] );
     } // for
+        WRequest sentinel[nworkers];
+        // The sentinel for worker i must go into a queue inside worker i's range. The constructor hands the first
+        // "extras" workers one additional queue, so the queue index must advance by that same per-worker range; a fixed
+        // stride of reqPerWorker drops sentinels into the wrong worker's queues whenever nrqueues % nworkers != 0,
+        // leaving some workers without a sentinel.
+        unsigned int reqPerWorker = nrqueues / nworkers, extras = nrqueues % nworkers;
+        for ( unsigned int i = 0, start = 0; i < nworkers; i += 1 ) {
+                insert( requests[start], &sentinel[i] );                // force eventual termination
+                start += reqPerWorker + ( i < extras ? 1 : 0 );         // skip the queues given to worker i in the constructor
+        } // for
+        for ( i; nworkers ) {
+                delete( workers[i] );
+        } // for
+        for ( i; nprocessors ) {
+                delete( processors[i] );
+        } // for
+    delete( workers );
+    delete( requests );
+    delete( processors );
+    if ( sepClus ) { delete( cluster ); }
+        free( workers );
+//      adelete( nrqueues, requests );
+        for ( i; nrqueues ) ^?{}( requests[i] );                        // FIX ME: problem with resolver
+        free( requests );
+        free( processors );
+        if ( sepClus ) { delete( cluster ); }                   // the cluster is only owned when it was created by the constructor
 } // ^?{}
 void send( Executor & ex, void (* action)( void ) ) {   // asynchronous call, no return value
     WRequest * node = new( action );
     insert( ex.requests[tickets( ex )], node );
+        WRequest * node = new( action );                // heap-allocated request wrapping the action
+        insert( ex.requests[tickets( ex )], node );     // queue it on the request queue selected by tickets()
 } // send
 int counter = 0;
 void workie( void ) {
     __atomic_add_fetch( &counter, 1, __ATOMIC_SEQ_CST );
 //    fprintf( stderr, "workie\n" );
+void work( void ) {                                     // unit of work sent to the executor: atomically count one execution
+        __atomic_add_fetch( &counter, 1, __ATOMIC_SEQ_CST );
+        // fprintf( stderr, "workie\n" );
+}
+int main() {
+    {
+        Executor exector;
+        for ( i; 3000 ) {
+            send( exector, workie );
+            if ( i % 100 ) yield();                     // NOTE: true for every i that is NOT a multiple of 100
+        } // for
+    }
+    printf( "%d\n", counter );
+int main( int argc, char * argv[] ) {
+        int times = 1_000_000;                          // default number of requests
+        if ( argc == 2 ) times = atoi( argv[1] );       // optional single argument overrides the request count
+        processor p[7];
+        {
+                Executor exector;
+                for ( i; times ) {
+                        send( exector, work );
+                        if ( i % 100 == 0 ) yield();    // yield once every 100 sends
+                } // for
+        }                                               // executor goes out of scope here, before the counter is printed
+        printf( "%d\n", counter );
+}
 // Local Variables: //
+// tab-width: 4 //
 // compile-command: "cfa executor.cfa" //
 // End: //

libcfa/src/fstream.cfa

-              r3c64c668
+              r58fe85a
 // Created On       : Wed May 27 17:56:53 2015
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Fri Feb  7 19:01:01 2020
 // Update Count     : 363
+// Last Modified On : Fri Jun 19 16:24:54 2020
+// Update Count     : 384
 //
 …
 //*********************************** ofstream ***********************************
+// *********************************** ofstream ***********************************
 …
         #ifdef __CFA_DEBUG__
         if ( file == 0p ) {
+                abort | IO_MSG "open output file \"" | name | "\"" | nl | strerror( errno );
+                throw (Open_Failure){ os };
+                // abort | IO_MSG "open output file \"" | name | "\"" | nl | strerror( errno );
         } // if
         #endif // __CFA_DEBUG__
 …
 void close( ofstream & os ) {                           // close the output stream's file unless it is unopened or a standard stream
+        if ( (FILE *)(os.$file) == stdout || (FILE *)(os.$file) == stderr ) return;
+  if ( (FILE *)(os.$file) == 0p ) return;               // no file attached (never opened or already closed) => nothing to do
+  if ( (FILE *)(os.$file) == (FILE *)stdout || (FILE *)(os.$file) == (FILE *)stderr ) return; // never close the standard output streams
         if ( fclose( (FILE *)(os.$file) ) == EOF ) {
                 abort | IO_MSG "close output" | nl | strerror( errno );
         } // if
+        os.$file = 0p;                                  // mark as closed so a later close returns early
 } // close
 …
 //*********************************** ifstream ***********************************
+// *********************************** ifstream ***********************************
 …
         #ifdef __CFA_DEBUG__
         if ( file == 0p ) {
+                abort | IO_MSG "open input file \"" | name | "\"" | nl | strerror( errno );
+                throw (Open_Failure){ is };
+                // abort | IO_MSG "open input file \"" | name | "\"" | nl | strerror( errno );
         } // if
         #endif // __CFA_DEBUG__
 …
 void close( ifstream & is ) {                           // close the input stream's file unless it is unopened or standard input
+        if ( (FILE *)(is.$file) == stdin ) return;
+  if ( (FILE *)(is.$file) == 0p ) return;               // no file attached (never opened or already closed) => nothing to do
+  if ( (FILE *)(is.$file) == (FILE *)stdin ) return;    // never close standard input
         if ( fclose( (FILE *)(is.$file) ) == EOF ) {
                 abort | IO_MSG "close input" | nl | strerror( errno );
         } // if
+        is.$file = 0p;                                  // mark as closed so a later close returns early
 } // close
 …
 ifstream & sin = sinFile, & stdin = sinFile;
+// *********************************** exceptions ***********************************
+void ?{}( Open_Failure & this, ofstream & ostream ) {   // build an Open_Failure that refers to an output stream
+        VTABLE_INIT(this, Open_Failure);
+        this.ostream = &ostream;                        // remember the stream (address only, no copy)
+        this.tag = 1;                                   // 1 => the ostream member of the union is the valid one
+}
+void ?{}( Open_Failure & this, ifstream & istream ) {   // build an Open_Failure that refers to an input stream
+        VTABLE_INIT(this, Open_Failure);
+        this.istream = &istream;                        // remember the stream (address only, no copy)
+        this.tag = 0;                                   // 0 => the istream member of the union is the valid one
+}
+const char * Open_Failure_msg(Open_Failure * this) {    // message routine for Open_Failure; the exception object itself is not inspected
+        return "Open_Failure";                          // fixed text, independent of which stream failed
+}
+VTABLE_INSTANCE(Open_Failure)(Open_Failure_msg);        // vtable instance for Open_Failure, given the message routine above
+// Raise an Open_Failure exception that refers to the given output stream.
+void throwOpen_Failure( ofstream & ostream ) {
+        throw (Open_Failure){ ostream };                // previously the exception was constructed and then discarded without being thrown
+}
+// Raise an Open_Failure exception that refers to the given input stream.
+void throwOpen_Failure( ifstream & istream ) {
+        throw (Open_Failure){ istream };                // previously the exception was constructed and then discarded without being thrown
+}
 // Local Variables: //
 // tab-width: 4 //

libcfa/src/fstream.hfa

-              r3c64c668
+              r58fe85a
 // Created On       : Wed May 27 17:56:53 2015
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Mon Feb 17 08:29:23 2020
 // Update Count     : 175
+// Last Modified On : Fri Jun 19 16:29:17 2020
+// Update Count     : 189
 //
 …
 #include "iostream.hfa"
+#include <exception.hfa>
 //*********************************** ofstream ***********************************
+// *********************************** ofstream ***********************************
 …
 //*********************************** ifstream ***********************************
+// *********************************** ifstream ***********************************
 …
 extern ifstream & sin, & stdin;                                                 // aliases
+// *********************************** exceptions ***********************************
+DATA_EXCEPTION(Open_Failure)(                           // exception raised when opening a file stream fails
+        union {                                         // exactly one member is meaningful, selected by tag
+                ofstream * ostream;                     // stream that failed to open, when tag == 1
+                ifstream * istream;                     // stream that failed to open, when tag == 0
+        };
+        // TEMPORARY: need polymorphic exceptions
+        int tag;                                                                                        // 1 => ostream; 0 => istream
+);
+void ?{}( Open_Failure & this, ofstream & ostream );
+void ?{}( Open_Failure & this, ifstream & istream );
 // Local Variables: //
 // mode: c //

libcfa/src/heap.cfa

-              r3c64c668
+              r58fe85a
 // file "LICENCE" distributed with Cforall.
 //
 // heap.c --
+// heap.cfa --
 //
 // Author           : Peter A. Buhr
 // Created On       : Tue Dec 19 21:58:35 2017
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Tue Feb  4 10:04:51 2020
 // Update Count     : 648
+// Last Modified On : Wed Dec 16 12:28:25 2020
+// Update Count     : 1023
 //
 #include <unistd.h>                                                                             // sbrk, sysconf
+#include <stdlib.h>                                                                             // EXIT_FAILURE
 #include <stdbool.h>                                                                    // true, false
 #include <stdio.h>                                                                              // snprintf, fileno
 #include <errno.h>                                                                              // errno
 #include <string.h>                                                                             // memset, memcpy
+extern "C" {
+#include <limits.h>                                                                             // ULONG_MAX
+#include <malloc.h>                                                                             // memalign, malloc_usable_size
 #include <sys/mman.h>                                                                   // mmap, munmap
+} // extern "C"
+// #comment TD : Many of these should be merged into math I believe
+#include "bits/align.hfa"                                                               // libPow2
+#include "bits/align.hfa"                                                               // libAlign
 #include "bits/defs.hfa"                                                                // likely, unlikely
 #include "bits/locks.hfa"                                                               // __spinlock_t
 #include "startup.hfa"                                                                  // STARTUP_PRIORITY_MEMORY
+//#include "stdlib.hfa"                                                                 // bsearchl
+#include "malloc.h"
+#define MIN(x, y) (y > x ? x : y)
+#include "math.hfa"                                                                             // ceiling
+#include "bitmanip.hfa"                                                                 // is_pow2, ceiling2
 static bool traceHeap = false;
 …
         // Define the default extension heap amount in units of bytes. When the uC++ supplied heap reaches the brk address,
         // the brk address is extended by the extension amount.
         __CFA_DEFAULT_HEAP_EXPANSION__ = (1 * 1024 * 1024),
+        __CFA_DEFAULT_HEAP_EXPANSION__ = (10 * 1024 * 1024),
         // Define the mmap crossover point during allocation. Allocations less than this amount are allocated from buckets;
 …
 #ifdef __CFA_DEBUG__
 static unsigned int allocFree;                                                  // running total of allocations minus frees
+static size_t allocUnfreed;                                                             // running total of allocations minus frees
 static void prtUnfreed() {
         if ( allocFree != 0 ) {
+        if ( allocUnfreed != 0 ) {
                 // DO NOT USE STREAMS AS THEY MAY BE UNAVAILABLE AT THIS POINT.
                 char helpText[512];
                 int len = snprintf( helpText, sizeof(helpText), "CFA warning (UNIX pid:%ld) : program terminating with %u(0x%x) bytes of storage allocated but not freed.\n"
+                int len = snprintf( helpText, sizeof(helpText), "CFA warning (UNIX pid:%ld) : program terminating with %zu(0x%zx) bytes of storage allocated but not freed.\n"
                                                         "Possible cause is unfreed storage allocated by the program or system/library routines called from the program.\n",
                                                         (long int)getpid(), allocFree, allocFree ); // always print the UNIX pid
+                                                        (long int)getpid(), allocUnfreed, allocUnfreed ); // always print the UNIX pid
                 __cfaabi_bits_write( STDERR_FILENO, helpText, len ); // print debug/nodebug
         } // if
 …
 extern "C" {
         void heapAppStart() {                                                           // called by __cfaabi_appready_startup
                 allocFree = 0;
+                allocUnfreed = 0;
         } // heapAppStart
 …
 // statically allocated variables => zero filled.
+static size_t pageSize;                                                                 // architecture pagesize
+size_t __page_size;                                                                             // architecture pagesize
+int __map_prot;                                                                                 // common mmap/mprotect protection
 static size_t heapExpand;                                                               // sbrk advance
 static size_t mmapStart;                                                                // cross over point for mmap
 …
 #define LOCKFREE 1
 #define BUCKETLOCK SPINLOCK
+#if BUCKETLOCK == LOCKFREE
+#include <uStackLF.h>
+#if BUCKETLOCK == SPINLOCK
+#elif BUCKETLOCK == LOCKFREE
+#include <stackLockFree.hfa>
+#else
+        #error undefined lock type for bucket lock
 #endif // LOCKFREE
 …
 struct HeapManager {
-//      struct FreeHeader;                                                                      // forward declaration
         struct Storage {
                 struct Header {                                                                 // header
 …
                                                 struct {                                                // 4-byte word => 8-byte header, 8-byte word => 16-byte header
                                                         #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ && __SIZEOF_POINTER__ == 4
                                                         uint32_t padding;                       // unused, force home/blocksize to overlay alignment in fake header
+                                                        uint64_t padding;                       // unused, force home/blocksize to overlay alignment in fake header
                                                         #endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ && __SIZEOF_POINTER__ == 4
                                                         union {
+//                                                              FreeHeader * home;              // allocated block points back to home locations (must overlay alignment)
+                                                                // FreeHeader * home;           // allocated block points back to home locations (must overlay alignment)
+                                                                // 2nd low-order bit => zero filled
                                                                 void * home;                    // allocated block points back to home locations (must overlay alignment)
                                                                 size_t blockSize;               // size for munmap (must overlay alignment)
                                                                 #if BUCKLOCK == SPINLOCK
+                                                                #if BUCKETLOCK == SPINLOCK
                                                                 Storage * next;                 // freed block points next freed block of same size
                                                                 #endif // SPINLOCK
                                                         };
+                                                        size_t size;                            // allocation size in bytes
                                                         #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ && __SIZEOF_POINTER__ == 4
                                                         uint32_t padding;                       // unused, force home/blocksize to overlay alignment in fake header
+                                                        uint64_t padding;                       // unused, force home/blocksize to overlay alignment in fake header
                                                         #endif // __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ && __SIZEOF_POINTER__ == 4
                                                 };
+                                                // future code
+                                                #if BUCKLOCK == LOCKFREE
+                                                Stack<Storage>::Link next;              // freed block points next freed block of same size (double-wide)
+                                                #if BUCKETLOCK == LOCKFREE
+                                                Link(Storage) next;                             // freed block points next freed block of same size (double-wide)
                                                 #endif // LOCKFREE
                                         };
                                 } real; // RealHeader
                                 struct FakeHeader {
                                         #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
                                         uint32_t alignment;                                     // low-order bits of home/blockSize used for tricks
+                                        uint32_t alignment;                                     // 1st low-order bit => fake header & alignment
                                         #endif // __ORDER_LITTLE_ENDIAN__
 …
         struct FreeHeader {
                 #if BUCKLOCK == SPINLOCK
+                #if BUCKETLOCK == SPINLOCK
                 __spinlock_t lock;                                                              // must be first field for alignment
                 Storage * freeList;
-                #elif BUCKLOCK == LOCKFREE
-                // future code
-                StackLF<Storage> freeList;
                 #else
                         #error undefined lock type for bucket lock
                 #endif // SPINLOCK
+                StackLF(Storage) freeList;
+                #endif // BUCKETLOCK
                 size_t blockSize;                                                               // size of allocations on this list
         }; // FreeHeader
 …
 }; // HeapManager
+#if BUCKETLOCK == LOCKFREE
+static inline {
+        Link(HeapManager.Storage) * ?`next( HeapManager.Storage * this ) { return &this->header.kind.real.next; } // address of the link field stored in the block's real header
+        void ?{}( HeapManager.FreeHeader & ) {}         // empty constructor: performs no initialization
+        void ^?{}( HeapManager.FreeHeader & ) {}        // empty destructor: performs no cleanup
+} // distribution
+#endif // LOCKFREE
 static inline size_t getKey( const HeapManager.FreeHeader & freeheader ) { return freeheader.blockSize; }
 …
 #define __STATISTICS__
+// Bucket size must be multiple of 16.
+// Powers of 2 are common allocation sizes, so make powers of 2 generate the minimum required size.
+// Size of array must harmonize with NoBucketSizes and individual bucket sizes must be multiple of 16.
+// Smaller multiples of 16 and powers of 2 are common allocation sizes, so make them generate the minimum required bucket size.
+// malloc(0) returns 0p, so no bucket is necessary for 0 bytes returning an address that can be freed.
 static const unsigned int bucketSizes[] @= {                    // different bucket sizes
 , 32, 48, 64 + sizeof(HeapManager.Storage), // 4
 , 112, 128 + sizeof(HeapManager.Storage), // 3
+ + sizeof(HeapManager.Storage), 32 + sizeof(HeapManager.Storage), 48 + sizeof(HeapManager.Storage), 64 + sizeof(HeapManager.Storage), // 4
+ + sizeof(HeapManager.Storage), 112 + sizeof(HeapManager.Storage), 128 + sizeof(HeapManager.Storage), // 3
 , 192, 224, 256 + sizeof(HeapManager.Storage), // 4
 , 384, 448, 512 + sizeof(HeapManager.Storage), // 4
 …
 };
 static_assert( NoBucketSizes == sizeof(bucketSizes) / sizeof(bucketSizes[0]), "size of bucket array wrong" );
+static_assert( NoBucketSizes == sizeof(bucketSizes) / sizeof(bucketSizes[0] ), "size of bucket array wrong" );
 #ifdef FASTLOOKUP
 …
 #endif // FASTLOOKUP
 static int mmapFd = -1;                                                                 // fake or actual fd for anonymous file
+static const off_t mmapFd = -1;                                                 // fake or actual fd for anonymous file
 #ifdef __CFA_DEBUG__
 static bool heapBoot = 0;                                                               // detect recursion during boot
 #endif // __CFA_DEBUG__
+// The constructor for heapManager is called explicitly in memory_startup.
 static HeapManager heapManager __attribute__(( aligned (128) )) @= {}; // size of cache line to prevent false sharing
 …
 #ifdef __STATISTICS__
 // Heap statistics counters.
+static unsigned int malloc_calls;
+static unsigned long long int malloc_storage;
+static unsigned int aalloc_calls;
+static unsigned long long int aalloc_storage;
+static unsigned int calloc_calls;
+static unsigned long long int calloc_storage;
+static unsigned int memalign_calls;
+static unsigned long long int memalign_storage;
+static unsigned int amemalign_calls;
+static unsigned long long int amemalign_storage;
+static unsigned int cmemalign_calls;
+static unsigned long long int cmemalign_storage;
+static unsigned int resize_calls;
+static unsigned long long int resize_storage;
+static unsigned int realloc_calls;
+static unsigned long long int realloc_storage;
+static unsigned int free_calls;
+static unsigned long long int free_storage;
+static unsigned int mmap_calls;
 static unsigned long long int mmap_storage;
 static unsigned int mmap_calls;
+static unsigned int munmap_calls;
 static unsigned long long int munmap_storage;
 static unsigned int munmap_calls;
+static unsigned int sbrk_calls;
 static unsigned long long int sbrk_storage;
-static unsigned int sbrk_calls;
-static unsigned long long int malloc_storage;
-static unsigned int malloc_calls;
-static unsigned long long int free_storage;
-static unsigned int free_calls;
-static unsigned long long int calloc_storage;
-static unsigned int calloc_calls;
-static unsigned long long int memalign_storage;
-static unsigned int memalign_calls;
-static unsigned long long int cmemalign_storage;
-static unsigned int cmemalign_calls;
-static unsigned long long int realloc_storage;
-static unsigned int realloc_calls;
 // Statistics file descriptor (changed by malloc_stats_fd).
 static int statfd = STDERR_FILENO;                                              // default stderr
+static int stat_fd = STDERR_FILENO;                                             // default stderr
 // Use "write" because streams may be shutdown when calls are made.
 static void printStats() {
         char helpText[512];
+        char helpText[1024];
         __cfaabi_bits_print_buffer( STDERR_FILENO, helpText, sizeof(helpText),
                                                                         "\nHeap statistics:\n"
                                                                         "  malloc: calls %u / storage %llu\n"
+                                                                        "  aalloc: calls %u / storage %llu\n"
                                                                         "  calloc: calls %u / storage %llu\n"
                                                                         "  memalign: calls %u / storage %llu\n"
+                                                                        "  amemalign: calls %u / storage %llu\n"
                                                                         "  cmemalign: calls %u / storage %llu\n"
+                                                                        "  resize: calls %u / storage %llu\n"
                                                                         "  realloc: calls %u / storage %llu\n"
                                                                         "  free: calls %u / storage %llu\n"
 …
                                                                         "  sbrk: calls %u / storage %llu\n",
                                                                         malloc_calls, malloc_storage,
+                                                                        aalloc_calls, aalloc_storage,
                                                                         calloc_calls, calloc_storage,
                                                                         memalign_calls, memalign_storage,
+                                                                        amemalign_calls, amemalign_storage,
                                                                         cmemalign_calls, cmemalign_storage,
+                                                                        resize_calls, resize_storage,
                                                                         realloc_calls, realloc_storage,
                                                                         free_calls, free_storage,
 …
 static int printStatsXML( FILE * stream ) {                             // see malloc_info
         char helpText[512];
+        char helpText[1024];
         int len = snprintf( helpText, sizeof(helpText),
                                                 "<malloc version=\"1\">\n"
 …
                                                 "</sizes>\n"
                                                 "<total type=\"malloc\" count=\"%u\" size=\"%llu\"/>\n"
+                                                "<total type=\"aalloc\" count=\"%u\" size=\"%llu\"/>\n"
                                                 "<total type=\"calloc\" count=\"%u\" size=\"%llu\"/>\n"
                                                 "<total type=\"memalign\" count=\"%u\" size=\"%llu\"/>\n"
+                                                "<total type=\"amemalign\" count=\"%u\" size=\"%llu\"/>\n"
                                                 "<total type=\"cmemalign\" count=\"%u\" size=\"%llu\"/>\n"
+                                                "<total type=\"resize\" count=\"%u\" size=\"%llu\"/>\n"
                                                 "<total type=\"realloc\" count=\"%u\" size=\"%llu\"/>\n"
                                                 "<total type=\"free\" count=\"%u\" size=\"%llu\"/>\n"
 …
                                                 "</malloc>",
                                                 malloc_calls, malloc_storage,
+                                                aalloc_calls, aalloc_storage,
                                                 calloc_calls, calloc_storage,
                                                 memalign_calls, memalign_storage,
+                                                amemalign_calls, amemalign_storage,
                                                 cmemalign_calls, cmemalign_storage,
+                                                resize_calls, resize_storage,
                                                 realloc_calls, realloc_storage,
                                                 free_calls, free_storage,
 …
-// static inline void noMemory() {
-//      abort( "Heap memory exhausted at %zu bytes.\n"
-//                 "Possible cause is very large memory allocation and/or large amount of unfreed storage allocated by the program or system/library routines.",
-//                 ((char *)(sbrk( 0 )) - (char *)(heapManager.heapBegin)) );
-// } // noMemory
-static inline void checkAlign( size_t alignment ) {
-        if ( alignment < libAlign() || ! libPow2( alignment ) ) {
-                abort( "Alignment %zu for memory allocation is less than %d and/or not a power of 2.", alignment, libAlign() );
-        } // if
-} // checkAlign
-static inline bool setHeapExpand( size_t value ) {
-  if ( heapExpand < pageSize ) return true;
-        heapExpand = value;
-        return false;
-} // setHeapExpand
 // thunk problem
 size_t Bsearchl( unsigned int key, const unsigned int * vals, size_t dim ) {
 …
 static inline bool setMmapStart( size_t value ) {               // true => mmapped, false => sbrk
   if ( value < pageSize || bucketSizes[NoBucketSizes - 1] < value ) return true;
+  if ( value < __page_size || bucketSizes[NoBucketSizes - 1] < value ) return false;
         mmapStart = value;                                                                      // set global
 …
         assert( maxBucketsUsed < NoBucketSizes );                       // subscript failure ?
         assert( mmapStart <= bucketSizes[maxBucketsUsed] ); // search failure ?
         return false;
+        return true;
 } // setMmapStart
+// <-------+----------------------------------------------------> bsize (bucket size)
+// |header |addr
+//==================================================================================
+//                   align/offset |
+// <-----------------<------------+-----------------------------> bsize (bucket size)
+//                   |fake-header | addr
+#define headerAddr( addr ) ((HeapManager.Storage.Header *)( (char *)addr - sizeof(HeapManager.Storage) ))
+#define realHeader( header ) ((HeapManager.Storage.Header *)((char *)header - header->kind.fake.offset))
+// <-------<<--------------------- dsize ---------------------->> bsize (bucket size)
+// |header |addr
+//==================================================================================
+//                   align/offset |
+// <------------------------------<<---------- dsize --------->>> bsize (bucket size)
+//                   |fake-header |addr
+#define dataStorage( bsize, addr, header ) (bsize - ( (char *)addr - (char *)header ))
+static inline void checkAlign( size_t alignment ) {     // abort unless alignment is at least libAlign() and a power of 2
+        if ( alignment < libAlign() || ! is_pow2( alignment ) ) {
+                abort( "Alignment %zu for memory allocation is less than %d and/or not a power of 2.", alignment, libAlign() );
+        } // if
+} // checkAlign
 …
 static inline void fakeHeader( HeapManager.Storage.Header *& header, size_t & alignment ) { // report the block's alignment and, for a fake header, step header back to the real header
         if ( unlikely( (header->kind.fake.alignment & 1) == 1 ) ) { // fake header ? (low-order bit set)
-                size_t offset = header->kind.fake.offset;
                 alignment = header->kind.fake.alignment & -2;   // remove flag from value
                 #ifdef __CFA_DEBUG__
                 checkAlign( alignment );                                                // check alignment
                 #endif // __CFA_DEBUG__
+                header = (HeapManager.Storage.Header *)((char *)header - offset);
+                header = realHeader( header );                                  // backup from fake to real header
+        } else {
+                alignment = libAlign();                                                 // => no fake header, so report the default alignment
         } // if
 } // fakeHeader
+// <-------+----------------------------------------------------> bsize (bucket size)
+// |header |addr
+//==================================================================================
+//                                | alignment
+// <-----------------<------------+-----------------------------> bsize (bucket size)
+//                   |fake-header | addr
+#define headerAddr( addr ) ((HeapManager.Storage.Header *)( (char *)addr - sizeof(HeapManager.Storage) ))
+// <-------<<--------------------- dsize ---------------------->> bsize (bucket size)
+// |header |addr
+//==================================================================================
+//                                | alignment
+// <------------------------------<<---------- dsize --------->>> bsize (bucket size)
+//                   |fake-header |addr
+#define dataStorage( bsize, addr, header ) (bsize - ( (char *)addr - (char *)header ))
+static inline bool headers( const char name[] __attribute__(( unused )), void * addr, HeapManager.Storage.Header *& header, HeapManager.FreeHeader *& freeElem, size_t & size, size_t & alignment ) with ( heapManager ) {
+static inline bool headers( const char name[] __attribute__(( unused )), void * addr, HeapManager.Storage.Header *& header, HeapManager.FreeHeader *& freeElem,
+                                                        size_t & size, size_t & alignment ) with( heapManager ) {
         header = headerAddr( addr );
         if ( unlikely( heapEnd < addr ) ) {                                     // mmapped ?
+  if ( unlikely( addr < heapBegin || heapEnd < addr ) ) { // mmapped ?
                 fakeHeader( header, alignment );
                 size = header->kind.real.blockSize & -3;                // mmap size
 …
         #ifdef __CFA_DEBUG__
         checkHeader( addr < heapBegin || header < (HeapManager.Storage.Header *)heapBegin, name, addr ); // bad low address ?
+        checkHeader( header < (HeapManager.Storage.Header *)heapBegin, name, addr ); // bad low address ?
         #endif // __CFA_DEBUG__
 …
 } // headers
+static inline void * extend( size_t size ) with ( heapManager ) {
+#ifdef __CFA_DEBUG__
+#if __SIZEOF_POINTER__ == 4
+#define MASK 0xdeadbeef
+#else
+#define MASK 0xdeadbeefdeadbeef
+#endif
+#define STRIDE size_t
+static void * Memset( void * addr, STRIDE size ) {              // debug only
+        if ( size % sizeof(STRIDE) != 0 ) abort( "Memset() : internal error, size %zd not multiple of %zd.", size, sizeof(STRIDE) );
+        if ( (STRIDE)addr % sizeof(STRIDE) != 0 ) abort( "Memset() : internal error, addr %p not multiple of %zd.", addr, sizeof(STRIDE) );
+        STRIDE * end = (STRIDE *)addr + size / sizeof(STRIDE);
+        for ( STRIDE * p = (STRIDE *)addr; p < end; p += 1 ) *p = MASK;
+        return addr;
+} // Memset
+#endif // __CFA_DEBUG__
+#define NO_MEMORY_MSG "insufficient heap memory available for allocating %zd new bytes."
+static inline void * extend( size_t size ) with( heapManager ) {
         lock( extlock __cfaabi_dbg_ctx2 );
         ptrdiff_t rem = heapRemaining - size;
 …
                 // If the size requested is bigger than the current remaining storage, increase the size of the heap.
+                size_t increase = libCeiling( size > heapExpand ? size : heapExpand, libAlign() );
+                if ( sbrk( increase ) == (void *)-1 ) {
+                size_t increase = ceiling2( size > heapExpand ? size : heapExpand, __page_size );
+                // Do not call abort or strerror( errno ) as they may call malloc.
+                if ( sbrk( increase ) == (void *)-1 ) {                 // failed, no memory ?
                         unlock( extlock );
+                        errno = ENOMEM;
+                        return 0p;
+                        __cfaabi_bits_print_nolock( STDERR_FILENO, NO_MEMORY_MSG, size );
+                        _exit( EXIT_FAILURE );
+                } // if
+                if ( mprotect( (char *)heapEnd + heapRemaining, increase, __map_prot ) ) {
+                        unlock( extlock );
+                        __cfaabi_bits_print_nolock( STDERR_FILENO, "extend() : internal error, mprotect failure, heapEnd:%p size:%zd, errno:%d.\n", heapEnd, increase, errno );
+                        _exit( EXIT_FAILURE );
                 } // if
                 #ifdef __STATISTICS__
 …
                 #ifdef __CFA_DEBUG__
                 // Set new memory to garbage so subsequent uninitialized usages might fail.
+                memset( (char *)heapEnd + heapRemaining, '\377', increase );
+                memset( (char *)heapEnd + heapRemaining, '\xde', increase );
+                //Memset( (char *)heapEnd + heapRemaining, increase );
                 #endif // __CFA_DEBUG__
                 rem = heapRemaining + increase - size;
 …
 static inline void * doMalloc( size_t size ) with ( heapManager ) {
+static inline void * doMalloc( size_t size ) with( heapManager ) {
         HeapManager.Storage * block;                                            // pointer to new block of storage
 …
         // along with the block and is a multiple of the alignment size.
   if ( unlikely( size > ~0ul - sizeof(HeapManager.Storage) ) ) return 0p;
+  if ( unlikely( size > ULONG_MAX - sizeof(HeapManager.Storage) ) ) return 0p;
         size_t tsize = size + sizeof(HeapManager.Storage);
         if ( likely( tsize < mmapStart ) ) {                            // small size => sbrk
 …
                         posn = Bsearchl( (unsigned int)tsize, bucketSizes, (size_t)maxBucketsUsed );
                 HeapManager.FreeHeader * freeElem = &freeLists[posn];
+                // #ifdef FASTLOOKUP
+                // if ( tsize < LookupSizes )
+                //      freeElem = &freeLists[lookup[tsize]];
+                // else
+                // #endif // FASTLOOKUP
+                //      freeElem = bsearchl( tsize, freeLists, (size_t)maxBucketsUsed ); // binary search
+                // HeapManager.FreeHeader * freeElem =
+                //      #ifdef FASTLOOKUP
+                //      tsize < LookupSizes ? &freeLists[lookup[tsize]] :
+                //      #endif // FASTLOOKUP
+                //      bsearchl( tsize, freeLists, (size_t)maxBucketsUsed ); // binary search
+                assert( freeElem <= &freeLists[maxBucketsUsed] ); // subscripting error ?
+                assert( tsize <= freeElem->blockSize );                 // search failure ?
+                verify( freeElem <= &freeLists[maxBucketsUsed] ); // subscripting error ?
+                verify( tsize <= freeElem->blockSize );                 // search failure ?
                 tsize = freeElem->blockSize;                                    // total space needed for request
                 // Spin until the lock is acquired for this particular size of block.
                 #if defined( SPINLOCK )
+                #if BUCKETLOCK == SPINLOCK
                 lock( freeElem->lock __cfaabi_dbg_ctx2 );
                 block = freeElem->freeList;                                             // remove node from stack
                 #else
                 block = freeElem->freeList.pop();
                 #endif // SPINLOCK
+                block = pop( freeElem->freeList );
+                #endif // BUCKETLOCK
                 if ( unlikely( block == 0p ) ) {                                // no free block ?
                         #if defined( SPINLOCK )
+                        #if BUCKETLOCK == SPINLOCK
                         unlock( freeElem->lock );
                         #endif // SPINLOCK
+                        #endif // BUCKETLOCK
                         // Freelist for that size was empty, so carve it out of the heap if there's enough left, or get some more
 …
                         block = (HeapManager.Storage *)extend( tsize ); // mutual exclusion on call
+  if ( unlikely( block == 0p ) ) return 0p;
+                #if defined( SPINLOCK )
+                #if BUCKETLOCK == SPINLOCK
                 } else {
                         freeElem->freeList = block->header.kind.real.next;
                         unlock( freeElem->lock );
                 #endif // SPINLOCK
+                #endif // BUCKETLOCK
                 } // if
                 block->header.kind.real.home = freeElem;                // pointer back to free list of apropriate size
         } else {                                                                                        // large size => mmap
   if ( unlikely( size > ~0ul - pageSize ) ) return 0p;
                 tsize = libCeiling( tsize, pageSize );                  // must be multiple of page size
+  if ( unlikely( size > ULONG_MAX - __page_size ) ) return 0p;
+                tsize = ceiling2( tsize, __page_size );                 // must be multiple of page size
                 #ifdef __STATISTICS__
                 __atomic_add_fetch( &mmap_calls, 1, __ATOMIC_SEQ_CST );
                 __atomic_add_fetch( &mmap_storage, tsize, __ATOMIC_SEQ_CST );
                 #endif // __STATISTICS__
+                block = (HeapManager.Storage *)mmap( 0, tsize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, mmapFd, 0 );
+                if ( block == (HeapManager.Storage *)MAP_FAILED ) {
+                block = (HeapManager.Storage *)mmap( 0, tsize, __map_prot, MAP_PRIVATE | MAP_ANONYMOUS, mmapFd, 0 );
+                if ( block == (HeapManager.Storage *)MAP_FAILED ) { // failed ?
+                        if ( errno == ENOMEM ) abort( NO_MEMORY_MSG, tsize ); // no memory
                         // Do not call strerror( errno ) as it may call malloc.
                         abort( "(HeapManager &)0x%p.doMalloc() : internal error, mmap failure, size:%zu error:%d.", &heapManager, tsize, errno );
                 } // if
+                        abort( "(HeapManager &)0x%p.doMalloc() : internal error, mmap failure, size:%zu errno:%d.", &heapManager, tsize, errno );
+                } //if
                 #ifdef __CFA_DEBUG__
                 // Set new memory to garbage so subsequent uninitialized usages might fail.
+                memset( block, '\377', tsize );
+                memset( block, '\xde', tsize );
+                //Memset( block, tsize );
                 #endif // __CFA_DEBUG__
                 block->header.kind.real.blockSize = tsize;              // storage size for munmap
         } // if
+        block->header.kind.real.size = size;                            // store allocation size
         void * addr = &(block->data);                                           // adjust off header to user bytes
+        verify( ((uintptr_t)addr & (libAlign() - 1)) == 0 ); // minimum alignment ?
         #ifdef __CFA_DEBUG__
+        assert( ((uintptr_t)addr & (libAlign() - 1)) == 0 ); // minimum alignment ?
+        __atomic_add_fetch( &allocFree, tsize, __ATOMIC_SEQ_CST );
+        __atomic_add_fetch( &allocUnfreed, tsize, __ATOMIC_SEQ_CST );
         if ( traceHeap() ) {
                 enum { BufferSize = 64 };
                 char helpText[BufferSize];
                 int len = snprintf( helpText, BufferSize, "%p = Malloc( %zu ) (allocated %zu)\n", addr, size, tsize );
-                // int len = snprintf( helpText, BufferSize, "Malloc %p %zu\n", addr, size );
                 __cfaabi_bits_write( STDERR_FILENO, helpText, len ); // print debug/nodebug
         } // if
 …
 static inline void doFree( void * addr ) with ( heapManager ) {
+static inline void doFree( void * addr ) with( heapManager ) {
         #ifdef __CFA_DEBUG__
         if ( unlikely( heapManager.heapBegin == 0p ) ) {
 …
                 #endif // __STATISTICS__
                 if ( munmap( header, size ) == -1 ) {
-                        #ifdef __CFA_DEBUG__
                         abort( "Attempt to deallocate storage %p not allocated or with corrupt header.\n"
                                    "Possible cause is invalid pointer.",
                                    addr );
-                        #endif // __CFA_DEBUG__
                 } // if
         } else {
                 #ifdef __CFA_DEBUG__
                 // Set free memory to garbage so subsequent usages might fail.
+                memset( ((HeapManager.Storage *)header)->data, '\377', freeElem->blockSize - sizeof( HeapManager.Storage ) );
+                memset( ((HeapManager.Storage *)header)->data, '\xde', freeElem->blockSize - sizeof( HeapManager.Storage ) );
+                //Memset( ((HeapManager.Storage *)header)->data, freeElem->blockSize - sizeof( HeapManager.Storage ) );
                 #endif // __CFA_DEBUG__
 …
                 free_storage += size;
                 #endif // __STATISTICS__
                 #if defined( SPINLOCK )
+                #if BUCKETLOCK == SPINLOCK
                 lock( freeElem->lock __cfaabi_dbg_ctx2 );               // acquire spin lock
                 header->kind.real.next = freeElem->freeList;    // push on stack
 …
                 unlock( freeElem->lock );                                               // release spin lock
                 #else
                 freeElem->freeList.push( *(HeapManager.Storage *)header );
                 #endif // SPINLOCK
+                push( freeElem->freeList, *(HeapManager.Storage *)header );
+                #endif // BUCKETLOCK
         } // if
         #ifdef __CFA_DEBUG__
         __atomic_add_fetch( &allocFree, -size, __ATOMIC_SEQ_CST );
+        __atomic_add_fetch( &allocUnfreed, -size, __ATOMIC_SEQ_CST );
         if ( traceHeap() ) {
+                enum { BufferSize = 64 };
+                char helpText[BufferSize];
+                char helpText[64];
                 int len = snprintf( helpText, sizeof(helpText), "Free( %p ) size:%zu\n", addr, size );
                 __cfaabi_bits_write( STDERR_FILENO, helpText, len ); // print debug/nodebug
 …
 size_t prtFree( HeapManager & manager ) with ( manager ) {
+size_t prtFree( HeapManager & manager ) with( manager ) {
         size_t total = 0;
         #ifdef __STATISTICS__
 …
                 #endif // __STATISTICS__
                 #if defined( SPINLOCK )
+                #if BUCKETLOCK == SPINLOCK
                 for ( HeapManager.Storage * p = freeLists[i].freeList; p != 0p; p = p->header.kind.real.next ) {
                 #else
+                for ( HeapManager.Storage * p = freeLists[i].freeList.top(); p != 0p; p = p->header.kind.real.next.top ) {
+                #endif // SPINLOCK
+                        for(;;) {
+//              for ( HeapManager.Storage * p = top( freeLists[i].freeList ); p != 0p; p = (p)`next->top ) {
+//              for ( HeapManager.Storage * p = top( freeLists[i].freeList ); p != 0p; /* p = getNext( p )->top */) {
+//                      HeapManager.Storage * temp = p->header.kind.real.next.top; // FIX ME: direct assignent fails, initialization works`
+//                      typeof(p) temp = (( p )`next)->top;                     // FIX ME: direct assignent fails, initialization works`
+//                      p = temp;
+                #endif // BUCKETLOCK
                         total += size;
                         #ifdef __STATISTICS__
 …
+static void ?{}( HeapManager & manager ) with ( manager ) {
+        pageSize = sysconf( _SC_PAGESIZE );
+static void ?{}( HeapManager & manager ) with( manager ) {
+        __page_size = sysconf( _SC_PAGESIZE );
+        __map_prot = PROT_READ | PROT_WRITE | PROT_EXEC;
         for ( unsigned int i = 0; i < NoBucketSizes; i += 1 ) { // initialize the free lists
 …
         #endif // FASTLOOKUP
         if ( setMmapStart( default_mmap_start() ) ) {
+        if ( ! setMmapStart( default_mmap_start() ) ) {
                 abort( "HeapManager : internal error, mmap start initialization failure." );
         } // if
 …
         char * end = (char *)sbrk( 0 );
+        sbrk( (char *)libCeiling( (long unsigned int)end, libAlign() ) - end ); // move start of heap to multiple of alignment
+        heapBegin = heapEnd = sbrk( 0 );                                        // get new start point
+        heapBegin = heapEnd = sbrk( (char *)ceiling2( (long unsigned int)end, __page_size ) - end ); // move start of heap to multiple of alignment
 } // HeapManager
 …
         if ( traceHeapTerm() ) {
                 printStats();
                 // if ( prtfree() ) prtFree( heapManager, true );
+                // prtUnfreed() called in heapAppStop()
         } // if
         #endif // __STATISTICS__
 …
 void memory_startup( void ) {
         #ifdef __CFA_DEBUG__
+        if ( unlikely( heapBoot ) ) {                                           // check for recursion during system boot
+                // DO NOT USE STREAMS AS THEY MAY BE UNAVAILABLE AT THIS POINT.
+        if ( heapBoot ) {                                                                       // check for recursion during system boot
                 abort( "boot() : internal error, recursively invoked during system boot." );
         } // if
 …
         #endif // __CFA_DEBUG__
         //assert( heapManager.heapBegin != 0 );
+        //verify( heapManager.heapBegin != 0 );
         //heapManager{};
         if ( heapManager.heapBegin == 0p ) heapManager{};
+        if ( heapManager.heapBegin == 0p ) heapManager{};       // sanity check
 } // memory_startup
 …
 static inline void * mallocNoStats( size_t size ) {             // necessary for malloc statistics
+        //assert( heapManager.heapBegin != 0 );
+        if ( unlikely( heapManager.heapBegin == 0p ) ) heapManager{}; // called before memory_startup ?
+        void * addr = doMalloc( size );
+        if ( unlikely( addr == 0p ) ) errno = ENOMEM;           // POSIX
+        return addr;
+        verify( heapManager.heapBegin != 0p );                          // called before memory_startup ?
+  if ( unlikely( size ) == 0 ) return 0p;                               // 0 BYTE ALLOCATION RETURNS NULL POINTER
+#if __SIZEOF_POINTER__ == 8
+        verify( size < ((typeof(size_t))1 << 48) );
+#endif // __SIZEOF_POINTER__ == 8
+        return doMalloc( size );
 } // mallocNoStats
+static inline void * callocNoStats( size_t noOfElems, size_t elemSize ) {
+        size_t size = noOfElems * elemSize;
+static inline void * callocNoStats( size_t dim, size_t elemSize ) {
+        size_t size = dim * elemSize;
+  if ( unlikely( size ) == 0 ) return 0p;                               // 0 BYTE ALLOCATION RETURNS NULL POINTER
         char * addr = (char *)mallocNoStats( size );
-  if ( unlikely( addr == 0p ) ) return 0p;
         HeapManager.Storage.Header * header;
         HeapManager.FreeHeader * freeElem;
         size_t bsize, alignment;
-        bool mapped __attribute__(( unused )) = headers( "calloc", addr, header, freeElem, bsize, alignment );
         #ifndef __CFA_DEBUG__
+        bool mapped =
+        #endif // __CFA_DEBUG__
+                headers( "calloc", addr, header, freeElem, bsize, alignment );
+        #ifndef __CFA_DEBUG__
         // Mapped storage is zero filled, but in debug mode mapped memory is scrubbed in doMalloc, so it has to be reset to zero.
         if ( ! mapped )
         #endif // __CFA_DEBUG__
+                // Zero entire data space even when > than size => realloc without a new allocation and zero fill works.
+                // <-------00000000000000000000000000000000000000000000000000000> bsize (bucket size)
+                // <-------0000000000000000000000000000UUUUUUUUUUUUUUUUUUUUUUUUU> bsize (bucket size) U => undefined
                 // `-header`-addr                      `-size
                 memset( addr, '\0', bsize - sizeof(HeapManager.Storage) ); // set to zeros
+                memset( addr, '\0', size );                                             // set to zeros
         header->kind.real.blockSize |= 2;                                       // mark as zero filled
 …
+static inline void * memalignNoStats( size_t alignment, size_t size ) { // necessary for malloc statistics
+static inline void * memalignNoStats( size_t alignment, size_t size ) {
+  if ( unlikely( size ) == 0 ) return 0p;                               // 0 BYTE ALLOCATION RETURNS NULL POINTER
         #ifdef __CFA_DEBUG__
         checkAlign( alignment );                                                        // check alignment
 …
         // add sizeof(Storage) for fake header
         char * addr = (char *)mallocNoStats( size + alignment - libAlign() + sizeof(HeapManager.Storage) );
-  if ( unlikely( addr == 0p ) ) return addr;
         // address in the block of the "next" alignment address
         char * user = (char *)libCeiling( (uintptr_t)(addr + sizeof(HeapManager.Storage)), alignment );
+        char * user = (char *)ceiling2( (uintptr_t)(addr + sizeof(HeapManager.Storage)), alignment );
         // address of header from malloc
         HeapManager.Storage.Header * realHeader = headerAddr( addr );
+        realHeader->kind.real.size = size;                                      // correct size to eliminate above alignment offset
         // address of fake header * before* the alignment location
         HeapManager.Storage.Header * fakeHeader = headerAddr( user );
 …
+static inline void * cmemalignNoStats( size_t alignment, size_t noOfElems, size_t elemSize ) {
+        size_t size = noOfElems * elemSize;
+static inline void * cmemalignNoStats( size_t alignment, size_t dim, size_t elemSize ) {
+        size_t size = dim * elemSize;
+  if ( unlikely( size ) == 0 ) return 0p;                               // 0 BYTE ALLOCATION RETURNS NULL POINTER
         char * addr = (char *)memalignNoStats( alignment, size );
+  if ( unlikely( addr == 0p ) ) return 0p;
         HeapManager.Storage.Header * header;
         HeapManager.FreeHeader * freeElem;
         size_t bsize;
-        bool mapped __attribute__(( unused )) = headers( "cmemalign", addr, header, freeElem, bsize, alignment );
         #ifndef __CFA_DEBUG__
+        bool mapped =
+        #endif // __CFA_DEBUG__
+                headers( "cmemalign", addr, header, freeElem, bsize, alignment );
         // Mapped storage is zero filled, but in debug mode mapped memory is scrubbed in doMalloc, so it has to be reset to zero.
+        #ifndef __CFA_DEBUG__
         if ( ! mapped )
         #endif // __CFA_DEBUG__
+                memset( addr, '\0', dataStorage( bsize, addr, header ) ); // set to zeros
+        header->kind.real.blockSize |= 2;                               // mark as zero filled
+                // <-------0000000000000000000000000000UUUUUUUUUUUUUUUUUUUUUUUUU> bsize (bucket size) U => undefined
+                // `-header`-addr                      `-size
+                memset( addr, '\0', size );                                             // set to zeros
+        header->kind.real.blockSize |= 2;                                       // mark as zero filled
         return addr;
 } // cmemalignNoStats
-// supported mallopt options
-#ifndef M_MMAP_THRESHOLD
-#define M_MMAP_THRESHOLD (-1)
-#endif // M_TOP_PAD
-#ifndef M_TOP_PAD
-#define M_TOP_PAD (-2)
-#endif // M_TOP_PAD
 extern "C" {
+        // The malloc() function allocates size bytes and returns a pointer to the allocated memory. The memory is not
+        // initialized. If size is 0, then malloc() returns either 0p, or a unique pointer value that can later be
+        // successfully passed to free().
+        // Allocates size bytes and returns a pointer to the allocated memory.  The contents are undefined. If size is 0,
+        // then malloc() returns a unique pointer value that can later be successfully passed to free().
         void * malloc( size_t size ) {
                 #ifdef __STATISTICS__
 …
         } // malloc
+        // The calloc() function allocates memory for an array of nmemb elements of size bytes each and returns a pointer to
+        // the allocated memory. The memory is set to zero. If nmemb or size is 0, then calloc() returns either 0p, or a
+        // unique pointer value that can later be successfully passed to free().
+        void * calloc( size_t noOfElems, size_t elemSize ) {
+        // Same as malloc() except size bytes is an array of dim elements each of elemSize bytes.
+        void * aalloc( size_t dim, size_t elemSize ) {
+                size_t size = dim * elemSize;
+                #ifdef __STATISTICS__
+                __atomic_add_fetch( &aalloc_calls, 1, __ATOMIC_SEQ_CST );
+                __atomic_add_fetch( &aalloc_storage, size, __ATOMIC_SEQ_CST );
+                #endif // __STATISTICS__
+                return mallocNoStats( size );
+        } // aalloc
+        // Same as aalloc() with memory set to zero.
+        void * calloc( size_t dim, size_t elemSize ) {
                 #ifdef __STATISTICS__
                 __atomic_add_fetch( &calloc_calls, 1, __ATOMIC_SEQ_CST );
                 __atomic_add_fetch( &calloc_storage, noOfElems * elemSize, __ATOMIC_SEQ_CST );
                 #endif // __STATISTICS__
                 return callocNoStats( noOfElems, elemSize );
+                __atomic_add_fetch( &calloc_storage, dim * elemSize, __ATOMIC_SEQ_CST );
+                #endif // __STATISTICS__
+                return callocNoStats( dim, elemSize );
         } // calloc
+        // The realloc() function changes the size of the memory block pointed to by ptr to size bytes. The contents will be
+        // unchanged in the range from the start of the region up to the minimum of the old and new sizes. If the new size
+        // is larger than the old size, the added memory will not be initialized.  If ptr is 0p, then the call is
+        // equivalent to malloc(size), for all values of size; if size is equal to zero, and ptr is not 0p, then the call
+        // is equivalent to free(ptr). Unless ptr is 0p, it must have been returned by an earlier call to malloc(),
+        // calloc() or realloc(). If the area pointed to was moved, a free(ptr) is done.
+        void * realloc( void * oaddr, size_t size ) {
+                #ifdef __STATISTICS__
+                __atomic_add_fetch( &realloc_calls, 1, __ATOMIC_SEQ_CST );
+        // Change the size of the memory block pointed to by oaddr to size bytes. The contents are undefined.  If oaddr is
+        // 0p, then the call is equivalent to malloc(size), for all values of size; if size is equal to zero, and oaddr is
+        // not 0p, then the call is equivalent to free(oaddr). Unless oaddr is 0p, it must have been returned by an earlier
+        // call to malloc(), alloc(), calloc() or realloc(). If the area pointed to was moved, a free(oaddr) is done.
+        void * resize( void * oaddr, size_t size ) {
+                #ifdef __STATISTICS__
+                __atomic_add_fetch( &resize_calls, 1, __ATOMIC_SEQ_CST );
                 #endif // __STATISTICS__
                 // If size is equal to 0, either NULL or a pointer suitable to be passed to free() is returned.
+          if ( unlikely( size == 0 ) ) { free( oaddr ); return mallocNoStats( size ); } // special cases
+          if ( unlikely( oaddr == 0p ) ) return mallocNoStats( size );
+          if ( unlikely( size == 0 ) ) { free( oaddr ); return 0p; } // special cases
+          if ( unlikely( oaddr == 0p ) ) {
+                        #ifdef __STATISTICS__
+                        __atomic_add_fetch( &resize_storage, size, __ATOMIC_SEQ_CST );
+                        #endif // __STATISTICS__
+                        return mallocNoStats( size );
+                } // if
                 HeapManager.Storage.Header * header;
                 HeapManager.FreeHeader * freeElem;
+                size_t bsize, oalign = 0;
+                size_t bsize, oalign;
+                headers( "resize", oaddr, header, freeElem, bsize, oalign );
+                size_t odsize = dataStorage( bsize, oaddr, header ); // data storage available in bucket
+                // same size, DO NOT preserve STICKY PROPERTIES.
+                if ( oalign == libAlign() && size <= odsize && odsize <= size * 2 ) { // allow 50% wasted storage for smaller size
+                        header->kind.real.blockSize &= -2;                      // no alignment and turn off 0 fill
+                        header->kind.real.size = size;                          // reset allocation size
+                        return oaddr;
+                } // if
+                #ifdef __STATISTICS__
+                __atomic_add_fetch( &resize_storage, size, __ATOMIC_SEQ_CST );
+                #endif // __STATISTICS__
+                // change size, DO NOT preserve STICKY PROPERTIES.
+                free( oaddr );
+                return mallocNoStats( size );                                   // create new area
+        } // resize
+        // Same as resize() but the contents are unchanged in the range from the start of the region up to the minimum of
+        // the old and new sizes.
+        void * realloc( void * oaddr, size_t size ) {
+                #ifdef __STATISTICS__
+                __atomic_add_fetch( &realloc_calls, 1, __ATOMIC_SEQ_CST );
+                #endif // __STATISTICS__
+                // If size is equal to 0, either NULL or a pointer suitable to be passed to free() is returned.
+          if ( unlikely( size == 0 ) ) { free( oaddr ); return 0p; } // special cases
+          if ( unlikely( oaddr == 0p ) ) {
+                        #ifdef __STATISTICS__
+                        __atomic_add_fetch( &realloc_storage, size, __ATOMIC_SEQ_CST );
+                        #endif // __STATISTICS__
+                        return mallocNoStats( size );
+                } // if
+                HeapManager.Storage.Header * header;
+                HeapManager.FreeHeader * freeElem;
+                size_t bsize, oalign;
                 headers( "realloc", oaddr, header, freeElem, bsize, oalign );
                 size_t odsize = dataStorage( bsize, oaddr, header ); // data storage available in bucket
+          if ( size <= odsize && odsize <= size * 2 ) { // allow up to 50% wasted storage in smaller size
+                        // Do not know size of original allocation => cannot do 0 fill for any additional space because do not know
+                        // where to start filling, i.e., do not overwrite existing values in space.
+                        //
+                        // This case does not result in a new profiler entry because the previous one still exists and it must match with
+                        // the free for this memory.  Hence, this realloc does not appear in the profiler output.
+                size_t osize = header->kind.real.size;                  // old allocation size
+                bool ozfill = (header->kind.real.blockSize & 2); // old allocation zero filled
+          if ( unlikely( size <= odsize ) && odsize <= size * 2 ) { // allow up to 50% wasted storage
+                        header->kind.real.size = size;                          // reset allocation size
+                        if ( unlikely( ozfill ) && size > osize ) {     // previous request zero fill and larger ?
+                                memset( (char *)oaddr + osize, '\0', size - osize ); // initialize added storage
+                        } // if
                         return oaddr;
                 } // if
                 #ifdef __STATISTICS__
                 __atomic_add_fetch( &realloc_storage, size, __ATOMIC_SEQ_CST );
+                __atomic_add_fetch( &realloc_storage, size, __ATOMIC_SEQ_CST );
                 #endif // __STATISTICS__
 …
                 void * naddr;
+                if ( unlikely( oalign != 0 ) ) {                                // previous request memalign?
+                        if ( unlikely( header->kind.real.blockSize & 2 ) ) { // previous request zero fill
+                                naddr = cmemalignNoStats( oalign, 1, size ); // create new aligned area
+                        } else {
+                                naddr = memalignNoStats( oalign, size ); // create new aligned area
+                if ( likely( oalign == libAlign() ) ) {                 // previous request not aligned ?
+                        naddr = mallocNoStats( size );                          // create new area
+                } else {
+                        naddr = memalignNoStats( oalign, size );        // create new aligned area
+                } // if
+                headers( "realloc", naddr, header, freeElem, bsize, oalign );
+                memcpy( naddr, oaddr, min( osize, size ) );             // copy bytes
+                free( oaddr );
+                if ( unlikely( ozfill ) ) {                                             // previous request zero fill ?
+                        header->kind.real.blockSize |= 2;                       // mark new request as zero filled
+                        if ( size > osize ) {                                           // previous request larger ?
+                                memset( (char *)naddr + osize, '\0', size - osize ); // initialize added storage
                         } // if
+                } else {
+                        if ( unlikely( header->kind.real.blockSize & 2 ) ) { // previous request zero fill
+                                naddr = callocNoStats( 1, size );               // create new area
+                        } else {
+                                naddr = mallocNoStats( size );                  // create new area
+                        } // if
+                } // if
+          if ( unlikely( naddr == 0p ) ) return 0p;
+                headers( "realloc", naddr, header, freeElem, bsize, oalign );
+                size_t ndsize = dataStorage( bsize, naddr, header ); // data storage avilable in bucket
+                // To preserve prior fill, the entire bucket must be copied versus the size.
+                memcpy( naddr, oaddr, MIN( odsize, ndsize ) );  // copy bytes
+                free( oaddr );
+                } // if
                 return naddr;
         } // realloc
+        // The obsolete function memalign() allocates size bytes and returns a pointer to the allocated memory. The memory
         // address will be a multiple of alignment, which must be a power of two.
+        // Same as malloc() except the memory address is a multiple of alignment, which must be a power of two. (obsolete)
         void * memalign( size_t alignment, size_t size ) {
                 #ifdef __STATISTICS__
 …
+        // The cmemalign() function is the same as calloc() with memory alignment.
+        void * cmemalign( size_t alignment, size_t noOfElems, size_t elemSize ) {
+        // Same as aalloc() with memory alignment.
+        void * amemalign( size_t alignment, size_t dim, size_t elemSize ) {
+                size_t size = dim * elemSize;
                 #ifdef __STATISTICS__
                 __atomic_add_fetch( &cmemalign_calls, 1, __ATOMIC_SEQ_CST );
+                __atomic_add_fetch( &cmemalign_storage, noOfElems * elemSize, __ATOMIC_SEQ_CST );
+                #endif // __STATISTICS__
+                return cmemalignNoStats( alignment, noOfElems, elemSize );
+                __atomic_add_fetch( &cmemalign_storage, size, __ATOMIC_SEQ_CST );
+                #endif // __STATISTICS__
+                return memalignNoStats( alignment, size );
+        } // amemalign
+        // Same as calloc() with memory alignment.
+        void * cmemalign( size_t alignment, size_t dim, size_t elemSize ) {
+                #ifdef __STATISTICS__
+                __atomic_add_fetch( &cmemalign_calls, 1, __ATOMIC_SEQ_CST );
+                __atomic_add_fetch( &cmemalign_storage, dim * elemSize, __ATOMIC_SEQ_CST );
+                #endif // __STATISTICS__
+                return cmemalignNoStats( alignment, dim, elemSize );
         } // cmemalign
+        // The function aligned_alloc() is the same as memalign(), except for the added restriction that size should be a
+        // multiple of alignment.
+        // Same as memalign(), but ISO/IEC 2011 C11 Section 7.22.2 states: the value of size shall be an integral multiple
+    // of alignment. This requirement is universally ignored.
         void * aligned_alloc( size_t alignment, size_t size ) {
                 return memalign( alignment, size );
 …
         // The function posix_memalign() allocates size bytes and places the address of the allocated memory in *memptr. The
         // address of the allocated memory will be a multiple of alignment, which must be a power of two and a multiple of
         // sizeof(void *). If size is 0, then posix_memalign() returns either 0p, or a unique pointer value that can later
         // be successfully passed to free(3).
+        // Allocates size bytes and places the address of the allocated memory in *memptr. The address of the allocated
+        // memory shall be a multiple of alignment, which must be a power of two and a multiple of sizeof(void *). If size
+        // is 0, then posix_memalign() returns either 0p, or a unique pointer value that can later be successfully passed to
+        // free(3).
         int posix_memalign( void ** memptr, size_t alignment, size_t size ) {
           if ( alignment < sizeof(void *) || ! libPow2( alignment ) ) return EINVAL; // check alignment
+          if ( alignment < libAlign() || ! is_pow2( alignment ) ) return EINVAL; // check alignment
                 * memptr = memalign( alignment, size );
-          if ( unlikely( * memptr == 0p ) ) return ENOMEM;
                 return 0;
         } // posix_memalign
+        // The obsolete function valloc() allocates size bytes and returns a pointer to the allocated memory. The memory
+        // address will be a multiple of the page size.  It is equivalent to memalign(sysconf(_SC_PAGESIZE),size).
+        // Allocates size bytes and returns a pointer to the allocated memory. The memory address shall be a multiple of the
+        // page size.  It is equivalent to memalign(sysconf(_SC_PAGESIZE),size).
         void * valloc( size_t size ) {
                 return memalign( pageSize, size );
+                return memalign( __page_size, size );
         } // valloc
+        // The free() function frees the memory space pointed to by ptr, which must have been returned by a previous call to
+        // malloc(), calloc() or realloc().  Otherwise, or if free(ptr) has already been called before, undefined behavior
+        // occurs. If ptr is 0p, no operation is performed.
+        // Same as valloc() except that the requested size is first rounded with ceiling2() to a multiple of __page_size.
+        void * pvalloc( size_t size ) {
+                size_t rounded = ceiling2( size, __page_size ); // request rounded to a multiple of the page size
+                return memalign( __page_size, rounded );
+        } // pvalloc
+        // Frees the memory space pointed to by ptr, which must have been returned by a previous call to malloc(), calloc()
+        // or realloc().  Otherwise, or if free(ptr) has already been called before, undefined behaviour occurs. If ptr is
+        // 0p, no operation is performed.
         void free( void * addr ) {
                 #ifdef __STATISTICS__
 …
         // The malloc_alignment() function returns the alignment of the allocation.
+        // Returns the alignment of an allocation.
         size_t malloc_alignment( void * addr ) {
           if ( unlikely( addr == 0p ) ) return libAlign();      // minimum alignment
 …
                         return header->kind.fake.alignment & -2;        // remove flag from value
                 } else {
                         return libAlign ();                                                     // minimum alignment
+                        return libAlign();                                                      // minimum alignment
                 } // if
         } // malloc_alignment
+        // The malloc_zero_fill() function returns true if the allocation is zero filled, i.e., initially allocated by calloc().
+        // Set the alignment recorded for an allocation and return the previously recorded alignment.  Only an allocation
+        // with a fake header (low bit of the alignment field set) carries an alignment: for any other allocation nothing is
+        // changed and 0 is returned.  A null address returns libAlign().
+        size_t $malloc_alignment_set( void * addr, size_t alignment ) {
+          if ( unlikely( addr == 0p ) ) return libAlign();      // minimum alignment
+                HeapManager.Storage.Header * hdr = headerAddr( addr );
+          if ( (hdr->kind.fake.alignment & 1) == 0 ) return 0;  // no fake header => no alignment to change
+                size_t previous = hdr->kind.fake.alignment & -2; // old value with the flag bit removed
+                hdr->kind.fake.alignment = alignment | 1;       // new value with the flag bit added
+                return previous;
+        } // $malloc_alignment_set
+        // Returns true if the allocation is zero filled, e.g., allocated by calloc().
         bool malloc_zero_fill( void * addr ) {
           if ( unlikely( addr == 0p ) ) return false;           // null allocation is not zero fill
                 HeapManager.Storage.Header * header = headerAddr( addr );
                 if ( (header->kind.fake.alignment & 1) == 1 ) { // fake header ?
                         header = (HeapManager.Storage.Header *)((char *)header - header->kind.fake.offset);
                 } // if
                 return (header->kind.real.blockSize & 2) != 0;  // zero filled (calloc/cmemalign) ?
+                        header = realHeader( header );                          // backup from fake to real header
+                } // if
+                return (header->kind.real.blockSize & 2) != 0;  // zero filled ?
         } // malloc_zero_fill
+        // The malloc_usable_size() function returns the number of usable bytes in the block pointed to by ptr, a pointer to
+        // a block of memory allocated by malloc(3) or a related function.
+        // Mark an allocation as zero filled and return whether it was already marked zero filled.  A null address is never
+        // zero filled, so false is returned and nothing is changed.
+        bool $malloc_zero_fill_set( void * addr ) {
+          if ( unlikely( addr == 0p ) ) return false;           // null allocation is not zero fill
+                HeapManager.Storage.Header * hdr = headerAddr( addr );
+                if ( (hdr->kind.fake.alignment & 1) != 0 ) hdr = realHeader( hdr ); // fake header => backup to real header
+                bool wasZeroFilled = (hdr->kind.real.blockSize & 2) != 0; // state before this call
+                hdr->kind.real.blockSize |= 2;                  // set the zero-fill bit
+                return wasZeroFilled;
+        } // $malloc_zero_fill_set
+        // Returns original total allocation size (not bucket size) => array size is dimension * sizeof(T).  A null address
+        // has size 0.
+        size_t malloc_size( void * addr ) {
+          if ( unlikely( addr == 0p ) ) return 0;                       // null allocation has zero size
+                HeapManager.Storage.Header * hdr = headerAddr( addr );
+                if ( (hdr->kind.fake.alignment & 1) != 0 ) hdr = realHeader( hdr ); // fake header => backup to real header
+                return hdr->kind.real.size;
+        } // malloc_size
+        // Overwrite the allocation size recorded in the real header and return the size recorded before the call.  A null
+        // address returns 0 and nothing is changed.
+        size_t $malloc_size_set( void * addr, size_t size ) {
+          if ( unlikely( addr == 0p ) ) return 0;                       // null allocation has 0 size
+                HeapManager.Storage.Header * hdr = headerAddr( addr );
+                if ( (hdr->kind.fake.alignment & 1) != 0 ) hdr = realHeader( hdr ); // fake header => backup to real header
+                size_t previous = hdr->kind.real.size;          // size before the update
+                hdr->kind.real.size = size;
+                return previous;
+        } // $malloc_size_set
+        // Returns the number of usable bytes in the block pointed to by ptr, a pointer to a block of memory allocated by
+        // malloc or a related function.
         size_t malloc_usable_size( void * addr ) {
           if ( unlikely( addr == 0p ) ) return 0;                       // null allocation has 0 size
 …
                 headers( "malloc_usable_size", addr, header, freeElem, bsize, alignment );
                 return dataStorage( bsize, addr, header );      // data storage in bucket
+                return dataStorage( bsize, addr, header );              // data storage in bucket
         } // malloc_usable_size
+        // The malloc_stats() function prints (on default standard error) statistics about memory allocated by malloc(3) and
+        // related functions.
+        // Prints (on default standard error) statistics about memory allocated by malloc and related functions.
         void malloc_stats( void ) {
                 #ifdef __STATISTICS__
 …
         } // malloc_stats
+        // The malloc_stats_fd() function changes the file descriptor where malloc_stats() writes the statistics.
+        // Changes the file descriptor where malloc_stats() writes statistics.
         int malloc_stats_fd( int fd __attribute__(( unused )) ) {
                 #ifdef __STATISTICS__
                 int temp = statfd;
                 statfd = fd;
+                int temp = stat_fd;
+                stat_fd = fd;
                 return temp;
                 #else
 …
+        // The mallopt() function adjusts parameters that control the behavior of the memory-allocation functions (see
+        // malloc(3)). The param argument specifies the parameter to be modified, and value specifies the new value for that
+        // parameter.
+        // Adjusts parameters that control the behaviour of the memory-allocation functions (see malloc). The param argument
+        // specifies the parameter to be modified, and value specifies the new value for that parameter.
         int mallopt( int option, int value ) {
                 choose( option ) {
                   case M_TOP_PAD:
                         if ( setHeapExpand( value ) ) return 1;
+                        heapExpand = ceiling2( value, __page_size ); return 1;
                   case M_MMAP_THRESHOLD:
                         if ( setMmapStart( value ) ) return 1;
+                        break;
                 } // switch
                 return 0;                                                                               // error, unsupported
         } // mallopt
+        // The malloc_trim() function attempts to release free memory at the top of the heap (by calling sbrk(2) with a
         // suitable argument).
+        // Attempt to release free memory at the top of the heap (by calling sbrk with a suitable argument).
         int malloc_trim( size_t ) {
                 return 0;                                                                               // => impossible to release memory
 …
         // The malloc_info() function exports an XML string that describes the current state of the memory-allocation
         // implementation in the caller.  The string is printed on the file stream stream.  The exported string includes
         // information about all arenas (see malloc(3)).
+        // Exports an XML string that describes the current state of the memory-allocation implementation in the caller.
+        // The string is printed on the file stream stream.  The exported string includes information about all arenas (see
+        // malloc).
         int malloc_info( int options, FILE * stream ) {
+                if ( options != 0 ) { errno = EINVAL; return -1; }
+          if ( options != 0 ) { errno = EINVAL; return -1; }
+                #ifdef __STATISTICS__
                 return printStatsXML( stream );
+                #else
+                return 0;                                                                               // unsupported
+                #endif // __STATISTICS__
         } // malloc_info
         // The malloc_get_state() function records the current state of all malloc(3) internal bookkeeping variables (but
         // not the actual contents of the heap or the state of malloc_hook(3) functions pointers).  The state is recorded in
         // a system-dependent opaque data structure dynamically allocated via malloc(3), and a pointer to that data
         // structure is returned as the function result.  (It is the caller's responsibility to free(3) this memory.)
+        // Records the current state of all malloc internal bookkeeping variables (but not the actual contents of the heap
+        // or the state of malloc_hook functions pointers).  The state is recorded in a system-dependent opaque data
+        // structure dynamically allocated via malloc, and a pointer to that data structure is returned as the function
+        // result.  (The caller must free this memory.)
         void * malloc_get_state( void ) {
                 return 0p;                                                                              // unsupported
 …
         // The malloc_set_state() function restores the state of all malloc(3) internal bookkeeping variables to the values
         // recorded in the opaque data structure pointed to by state.
         int malloc_set_state( void * ptr ) {
+        // Restores the state of all malloc internal bookkeeping variables to the values recorded in the opaque data
+        // structure pointed to by state.
+        int malloc_set_state( void * ) {
                 return 0;                                                                               // unsupported
         } // malloc_set_state
 …
 // Must have CFA linkage to overload with C linkage realloc.
 void * realloc( void * oaddr, size_t nalign, size_t size ) {
+void * resize( void * oaddr, size_t nalign, size_t size ) {
         #ifdef __STATISTICS__
         __atomic_add_fetch( &realloc_calls, 1, __ATOMIC_SEQ_CST );
+        __atomic_add_fetch( &resize_calls, 1, __ATOMIC_SEQ_CST );
         #endif // __STATISTICS__
+        // If size is equal to 0, either NULL or a pointer suitable to be passed to free() is returned.
+  if ( unlikely( size == 0 ) ) { free( oaddr ); return mallocNoStats( size ); } // special cases
+  if ( unlikely( oaddr == 0p ) ) return mallocNoStats( size );
+        if ( unlikely( nalign == 0 ) ) nalign = libAlign();     // reset alignment to minimum
+        if ( unlikely( nalign < libAlign() ) ) nalign = libAlign(); // reset alignment to minimum
         #ifdef __CFA_DEBUG__
         else
 …
         #endif // __CFA_DEBUG__
+        HeapManager.Storage.Header * header;
+        HeapManager.FreeHeader * freeElem;
+        size_t bsize, oalign = 0;
+        headers( "realloc", oaddr, header, freeElem, bsize, oalign );
+        size_t odsize = dataStorage( bsize, oaddr, header ); // data storage available in bucket
+  if ( oalign != 0 && (uintptr_t)oaddr % nalign == 0 ) { // has alignment and just happens to work out
+                headerAddr( oaddr )->kind.fake.alignment = nalign | 1; // update alignment (could be the same)
+                return realloc( oaddr, size );
+        // If size is equal to 0, either NULL or a pointer suitable to be passed to free() is returned.
+  if ( unlikely( size == 0 ) ) { free( oaddr ); return 0p; } // special cases
+  if ( unlikely( oaddr == 0p ) ) {
+                #ifdef __STATISTICS__
+                __atomic_add_fetch( &resize_storage, size, __ATOMIC_SEQ_CST );
+                #endif // __STATISTICS__
+                return memalignNoStats( nalign, size );
+        } // if
+        // Attempt to reuse existing alignment.
+        HeapManager.Storage.Header * header = headerAddr( oaddr );
+        bool isFakeHeader = header->kind.fake.alignment & 1; // old fake header ?
+        size_t oalign;
+        if ( isFakeHeader ) {
+                oalign = header->kind.fake.alignment & -2;              // old alignment
+                if ( (uintptr_t)oaddr % nalign == 0                             // lucky match ?
+                         && ( oalign <= nalign                                          // going down
+                                  || (oalign >= nalign && oalign <= 256) ) // little alignment storage wasted ?
+                        ) {
+                        headerAddr( oaddr )->kind.fake.alignment = nalign | 1; // update alignment (could be the same)
+                        HeapManager.FreeHeader * freeElem;
+                        size_t bsize, oalign;
+                        headers( "resize", oaddr, header, freeElem, bsize, oalign );
+                        size_t odsize = dataStorage( bsize, oaddr, header ); // data storage available in bucket
+                        if ( size <= odsize && odsize <= size * 2 ) { // allow 50% wasted data storage
+                                headerAddr( oaddr )->kind.fake.alignment = nalign | 1; // update alignment (could be the same)
+                                header->kind.real.blockSize &= -2;              // turn off 0 fill
+                                header->kind.real.size = size;                  // reset allocation size
+                                return oaddr;
+                        } // if
+                } // if
+        } else if ( ! isFakeHeader                                                      // old real header (aligned on libAlign) ?
+                                && nalign == libAlign() ) {                             // new alignment also on libAlign => no fake header needed
+                return resize( oaddr, size );                                   // duplicate special case checks
         } // if
         #ifdef __STATISTICS__
+        __atomic_add_fetch( &resize_storage, size, __ATOMIC_SEQ_CST );
+        #endif // __STATISTICS__
+        // change size, DO NOT preserve STICKY PROPERTIES.
+        free( oaddr );
+        return memalignNoStats( nalign, size );                         // create new aligned area
+} // resize
+void * realloc( void * oaddr, size_t nalign, size_t size ) {
+        if ( unlikely( nalign < libAlign() ) ) nalign = libAlign(); // reset alignment to minimum
+        #ifdef __CFA_DEBUG__
+        else
+                checkAlign( nalign );                                                   // check alignment
+        #endif // __CFA_DEBUG__
+        // If size is equal to 0, either NULL or a pointer suitable to be passed to free() is returned.
+  if ( unlikely( size == 0 ) ) { free( oaddr ); return 0p; } // special cases
+  if ( unlikely( oaddr == 0p ) ) {
+                #ifdef __STATISTICS__
+                __atomic_add_fetch( &realloc_calls, 1, __ATOMIC_SEQ_CST );
+                __atomic_add_fetch( &realloc_storage, size, __ATOMIC_SEQ_CST );
+                #endif // __STATISTICS__
+                return memalignNoStats( nalign, size );
+        } // if
+        // Attempt to reuse existing alignment.
+        HeapManager.Storage.Header * header = headerAddr( oaddr );
+        bool isFakeHeader = header->kind.fake.alignment & 1; // old fake header ?
+        size_t oalign;
+        if ( isFakeHeader ) {
+                oalign = header->kind.fake.alignment & -2;              // old alignment
+                if ( (uintptr_t)oaddr % nalign == 0                             // lucky match ?
+                         && ( oalign <= nalign                                          // going down
+                                  || (oalign >= nalign && oalign <= 256) ) // little alignment storage wasted ?
+                        ) {
+                        headerAddr( oaddr )->kind.fake.alignment = nalign | 1; // update alignment (could be the same)
+                        return realloc( oaddr, size );                          // duplicate alignment and special case checks
+                } // if
+        } else if ( ! isFakeHeader                                                      // old real header (aligned on libAlign) ?
+                                && nalign == libAlign() )                               // new alignment also on libAlign => no fake header needed
+                return realloc( oaddr, size );                                  // duplicate alignment and special case checks
+        #ifdef __STATISTICS__
+        __atomic_add_fetch( &realloc_calls, 1, __ATOMIC_SEQ_CST );
         __atomic_add_fetch( &realloc_storage, size, __ATOMIC_SEQ_CST );
         #endif // __STATISTICS__
+        HeapManager.FreeHeader * freeElem;
+        size_t bsize;
+        headers( "realloc", oaddr, header, freeElem, bsize, oalign );
         // change size and copy old content to new storage
+        void * naddr;
+        if ( unlikely( header->kind.real.blockSize & 2 ) ) { // previous request zero fill
+                naddr = cmemalignNoStats( nalign, 1, size );    // create new aligned area
+        } else {
+                naddr = memalignNoStats( nalign, size );                // create new aligned area
+        } // if
+        size_t osize = header->kind.real.size;                          // old allocation size
+        bool ozfill = (header->kind.real.blockSize & 2);        // old allocation zero filled
+        void * naddr = memalignNoStats( nalign, size );         // create new aligned area
         headers( "realloc", naddr, header, freeElem, bsize, oalign );
+        size_t ndsize = dataStorage( bsize, naddr, header ); // data storage avilable in bucket
+        // To preserve prior fill, the entire bucket must be copied versus the size.
+        memcpy( naddr, oaddr, MIN( odsize, ndsize ) );          // copy bytes
+        memcpy( naddr, oaddr, min( osize, size ) );                     // copy bytes
         free( oaddr );
+        if ( unlikely( ozfill ) ) {                                                     // previous request zero fill ?
+                header->kind.real.blockSize |= 2;                               // mark new request as zero filled
+                if ( size > osize ) {                                                   // previous request larger ?
+                        memset( (char *)naddr + osize, '\0', size - osize ); // initialize added storage
+                } // if
+        } // if
         return naddr;
 } // realloc

libcfa/src/interpose.cfa

-              r3c64c668
+              r58fe85a
 // Created On       : Wed Mar 29 16:10:31 2017
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Mon Feb 17 10:18:53 2020
 // Update Count     : 166
+// Last Modified On : Fri Mar 13 17:35:37 2020
+// Update Count     : 178
 //
 #include <stdarg.h>                                                                             // va_start, va_end
+#include <stdio.h>
 #include <string.h>                                                                             // strlen
 #include <unistd.h>                                                                             // _exit, getpid
 …
 void abort( const char fmt[], ... ) __attribute__(( format(printf, 1, 2), __nothrow__, __leaf__, __noreturn__ ));
 void abort( bool signalAbort, const char fmt[], ... ) __attribute__(( format(printf, 2, 3), __nothrow__, __leaf__, __noreturn__ ));
+void __abort( bool signalAbort, const char fmt[], va_list args ) __attribute__(( __nothrow__, __leaf__, __noreturn__ ));
 extern "C" {
 …
                 va_list argp;
                 va_start( argp, fmt );
                 abort( false, fmt, argp );
+                __abort( false, fmt, argp );
                 va_end( argp );
+        }
 …
+}
+void abort( bool signalAbort, const char fmt[], ... ) {
+        void * kernel_data = kernel_abort();                            // must be done here to lock down kernel
+        int len;
+        signal( SIGABRT, SIG_DFL );                                                     // prevent final "real" abort from recursing to handler
+        len = snprintf( abort_text, abort_text_size, "Cforall Runtime error (UNIX pid:%ld) ", (long int)getpid() ); // use UNIX pid (versus getPid)
+        __cfaabi_bits_write( STDERR_FILENO, abort_text, len );
+        assert( fmt );
+        va_list args;
+        va_start( args, fmt );
+        len = vsnprintf( abort_text, abort_text_size, fmt, args );
+        va_end( args );
+        __cfaabi_bits_write( STDERR_FILENO, abort_text, len );
+        if ( fmt[strlen( fmt ) - 1] != '\n' ) {                         // add optional newline if missing at the end of the format text
+                __cfaabi_dbg_write( "\n", 1 );
+        } // if
+        kernel_abort_msg( kernel_data, abort_text, abort_text_size );
+        __cfaabi_backtrace( signalAbort ? 4 : 2 );
+        __cabi_libc.abort();                                                            // print stack trace in handler
+static volatile int __abort_stage = 0;
+// Shared implementation of the abort( ... ) overloads.  It takes a va_list because a variadic function cannot pass its
+// "..." arguments on to another variadic function.  The global __abort_stage counter is incremented on entry and again
+// at the start of each stage, so a stage runs only in a call that observes exactly that stage number; every call ends
+// in the C library abort, and this function never returns.
+//   signalAbort : chooses the value given to __cfaabi_backtrace (4 when true, 2 when false)
+//   fmt, args   : printf-style format (asserted non-null) and its argument list
+void __abort( bool signalAbort, const char fmt[], va_list args ) {
+        int stage = __atomic_add_fetch( &__abort_stage, 1, __ATOMIC_SEQ_CST );
+        // First stage: stop the cforall kernel and print
+        if ( stage == 1 ) {
+                // increment stage
+                stage = __atomic_add_fetch( &__abort_stage, 1, __ATOMIC_SEQ_CST );
+                // must be done here to lock down kernel
+                void * kernel_data = kernel_abort();
+                int len;
+                signal( SIGABRT, SIG_DFL );                                                     // prevent final "real" abort from recursing to handler
+                len = snprintf( abort_text, abort_text_size, "Cforall Runtime error (UNIX pid:%ld) ", (long int)getpid() ); // use UNIX pid (versus getPid)
+                // snprintf returns the untruncated length (negative on error), so clamp to what is actually in the buffer
+                if ( len < 0 ) len = 0; else if ( len >= (int)abort_text_size ) len = (int)abort_text_size - 1;
+                __cfaabi_bits_write( STDERR_FILENO, abort_text, len );
+                assert( fmt );
+                len = vsnprintf( abort_text, abort_text_size, fmt, args );
+                // same clamp: a long message is truncated in abort_text, so never write more than the buffer holds
+                if ( len < 0 ) len = 0; else if ( len >= (int)abort_text_size ) len = (int)abort_text_size - 1;
+                __cfaabi_bits_write( STDERR_FILENO, abort_text, len );
+                // add optional newline if missing at the end of the format text; an empty format has no last character
+                size_t fmtlen = strlen( fmt );
+                if ( fmtlen == 0 || fmt[fmtlen - 1] != '\n' ) {
+                        __cfaabi_bits_write( STDERR_FILENO, "\n", 1 );
+                } // if
+                kernel_abort_msg( kernel_data, abort_text, abort_text_size );
+        }
+        // Second stage: print the backtrace
+        if ( stage == 2 ) {
+                // increment stage
+                stage = __atomic_add_fetch( &__abort_stage, 1, __ATOMIC_SEQ_CST );
+                // print stack trace in handler
+                __cfaabi_backtrace( signalAbort ? 4 : 2 );
+        }
+        do {
+                // Finally call abort
+                __cabi_libc.abort();
+                // Loop so that we never return
+        } while ( true );
+}
 …
         va_list args;
         va_start( args, fmt );
+        abort( false, fmt, args );
+        __abort( false, fmt, args );
+    // CONTROL NEVER REACHES HERE!
         va_end( args );
+}
+// Variadic front end: collects the "..." arguments into a va_list and hands them, with signalAbort and fmt, to __abort.
+void abort( bool signalAbort, const char fmt[], ... ) {
+    va_list argp;
+    va_start( argp, fmt );
+    __abort( signalAbort, fmt, argp );
+    // CONTROL NEVER REACHES HERE!
+    va_end( argp );
+}

libcfa/src/iostream.cfa

-              r3c64c668
+              r58fe85a
 // Created On       : Wed May 27 17:56:53 2015
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Thu Feb 20 15:53:23 2020
 // Update Count     : 829
+// Last Modified On : Mon Aug 24 08:31:35 2020
+// Update Count     : 1130
 //
 #include "iostream.hfa"
-extern "C" {
 #include <stdio.h>
 #include <stdbool.h>                                                                    // true/false
 #include <stdint.h>                                                                             // UINT64_MAX
+//#include <string.h>                                                                   // strlen, strcmp
+#include <float.h>                                                                              // DBL_DIG, LDBL_DIG
+#include <complex.h>                                                                    // creal, cimag
+//#include <string.h>                                                                   // strlen, strcmp, memcpy
+extern "C" {
 extern size_t strlen (const char *__s) __attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1)));
 extern int strcmp (const char *__s1, const char *__s2) __attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1, 2)));
 extern char *strcpy (char *__restrict __dest, const char *__restrict __src) __attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1, 2)));
 extern void *memcpy (void *__restrict __dest, const void *__restrict __src, size_t __n) __attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1, 2)));
-#include <float.h>                                                                              // DBL_DIG, LDBL_DIG
-#include <math.h>                                                                               // isfinite
-#include <complex.h>                                                                    // creal, cimag
 } // extern "C"
+//*********************************** ostream ***********************************
+#include "math.hfa"                                                                             // isfinite, floor, ceiling_div
+#include "bitmanip.hfa"                                                                 // high1
+// *********************************** ostream ***********************************
 forall( dtype ostype | ostream( ostype ) ) {
-        ostype & ?|?( ostype & os, zero_t ) {
-                if ( $sepPrt( os ) ) fmt( os, "%s", $sepGetCur( os ) );
-                fmt( os, "%d", 0n );
-                return os;
-        } // ?|?
-        void ?|?( ostype & os, zero_t z ) {
-                (ostype &)(os | z); ends( os );
-        } // ?|?
-        ostype & ?|?( ostype & os, one_t ) {
-                if ( $sepPrt( os ) ) fmt( os, "%s", $sepGetCur( os ) );
-                fmt( os, "%d", 1n );
-                return os;
-        } // ?|?
-        void ?|?( ostype & os, one_t o ) {
-                (ostype &)(os | o); ends( os );
-        } // ?|?
         ostype & ?|?( ostype & os, bool b ) {
                 if ( $sepPrt( os ) ) fmt( os, "%s", $sepGetCur( os ) );
 …
         #define P10_UINT64 10_000_000_000_000_000_000_ULL       // 19 zeroes
+        static void base10_128( ostype & os, unsigned int128 val ) {
+                if ( val > UINT64_MAX ) {
+        static inline void base10_128( ostype & os, unsigned int128 val ) {
+#if defined(__GNUC__) && __GNUC_PREREQ(7,0)                             // gcc version >= 7
+                if ( val > P10_UINT64 ) {
+#else
+                if ( (uint64_t)(val >> 64) != 0 || (uint64_t)val > P10_UINT64 ) { // patch gcc 5 & 6 -O3 bug
+#endif // __GNUC_PREREQ(7,0)
                         base10_128( os, val / P10_UINT64 );                     // recursive
                         fmt( os, "%.19lu", (uint64_t)(val % P10_UINT64) );
 …
         } // base10_128
         static void base10_128( ostype & os, int128 val ) {
+        static inline void base10_128( ostype & os, int128 val ) {
                 if ( val < 0 ) {
                         fmt( os, "-" );                                                         // leading negative sign
 …
 } // distribution
 //*********************************** manipulators ***********************************
 //*********************************** integral ***********************************
+// *********************************** manipulators ***********************************
+// *********************************** integral ***********************************
 static const char * shortbin[] = { "0", "1", "10", "11", "100", "101", "110", "111", "1000", "1001", "1010", "1011", "1100", "1101", "1110", "1111" };
 …
 // Default prefix for non-decimal prints is 0b, 0, 0x.
 #define IntegralFMTImpl( T, CODE, IFMTNP, IFMTP ) \
+#define IntegralFMTImpl( T, IFMTNP, IFMTP ) \
 forall( dtype ostype | ostream( ostype ) ) { \
         ostype & ?|?( ostype & os, _Ostream_Manip(T) f ) { \
 …
+\
                 if ( f.base == 'b' || f.base == 'B' ) {                 /* bespoke binary format */ \
+                        int bits;                                                                                                       \
+                        if ( f.val == (T){0} ) bits = 1;                        /* force at least one bit to print */ \
+                        else bits = sizeof(long long int) * 8 - __builtin_clzll( f.val ); /* position of most significant bit */ \
+                        bits = bits > sizeof(f.val) * 8 ? sizeof(f.val) * 8 : bits; \
+                        int spaces = f.wd - bits;                                       /* can be negative */ \
+                        if ( ! f.flags.nobsdp ) { spaces -= 2; }        /* base prefix takes space */ \
+                        /* printf( "%d %d\n", bits, spaces ); */ \
+                        int bits = high1( f.val );                                      /* position of most significant bit */ \
+                        if ( bits == 0 ) bits = 1;                                      /* 0 value => force one bit to print */ \
+                        int spaces; \
                         if ( ! f.flags.left ) {                                         /* right justified ? */ \
                                 /* Note, base prefix then zero padding or spacing then prefix. */ \
+                                if ( f.flags.pad0 || f.flags.pc ) { \
+                                if ( f.flags.pc ) { \
+                                        spaces = f.wd - f.pc; \
+                                        if ( ! f.flags.nobsdp ) { spaces -= 2; } /* base prefix takes space */ \
+                                        if ( spaces > 0 ) fmt( os, "%*s", spaces, " " ); /* space pad */ \
                                         if ( ! f.flags.nobsdp ) { fmt( os, "0%c", f.base ); } \
                                         if ( f.flags.pc ) spaces = f.pc - bits; \
+                                        spaces = f.pc - bits; \
                                         if ( spaces > 0 ) fmt( os, "%0*d", spaces, 0 ); /* zero pad */ \
                                 } else { \
+                                        if ( spaces > 0 ) fmt( os, "%*s", spaces, " " ); /* space pad */ \
+                                        if ( ! f.flags.nobsdp ) { fmt( os, "0%c", f.base ); } \
+                                        spaces = f.wd - bits; \
+                                        if ( ! f.flags.nobsdp ) { spaces -= 2; } /* base prefix takes space */ \
+                                        if ( f.flags.pad0 ) { \
+                                                if ( ! f.flags.nobsdp ) { fmt( os, "0%c", f.base ); } \
+                                                if ( spaces > 0 ) fmt( os, "%0*d", spaces, 0 ); /* zero pad */ \
+                                        } else { \
+                                                if ( spaces > 0 ) fmt( os, "%*s", spaces, " " ); /* space pad */ \
+                                                if ( ! f.flags.nobsdp ) { fmt( os, "0%c", f.base ); } \
+                                        } /* if */ \
                                 } /* if */ \
+                        } else if ( ! f.flags.nobsdp ) { \
+                                fmt( os, "0%c", f.base ); \
+                        } else { \
+                                if ( ! f.flags.nobsdp ) fmt( os, "0%c", f.base ); \
+                                if ( f.flags.pc ) { \
+                                        spaces = f.pc - bits; \
+                                        if ( spaces > 0 ) fmt( os, "%0*d", spaces, 0 ); /* zero pad */ \
+                                        spaces = f.wd - f.pc; \
+                                } else { /* pad0 flag ignored with left flag */ \
+                                        spaces = f.wd - bits; \
+                                } /* if */ \
+                                if ( ! f.flags.nobsdp ) { spaces -= 2; } /* base prefix takes space */ \
                         } /* if */ \
                         int shift = (bits - 1) / 4 * 4; /* floor( bits - 1, 4 ) */ \
+                        int shift = floor( bits - 1, 4 ); \
                         typeof( f.val ) temp = f.val; \
                         fmt( os, "%s", shortbin[(temp >> shift) & 0xf] ); \
 …
                         if ( f.flags.left && spaces > 0 ) fmt( os, "%*s", spaces, " " ); \
                         return os; \
                 } /* if  */ \
+                } /* if */ \
+\
                 char fmtstr[sizeof(IFMTP)];                                             /* sizeof includes '\0' */ \
 …
                 if ( ! f.flags.nobsdp ) { fmtstr[star] = '#'; star -= 1; } \
                 if ( f.flags.left ) { fmtstr[star] = '-'; star -= 1; } \
                 if ( f.flags.sign && f.base == CODE ) { fmtstr[star] = '+'; star -= 1; } \
+                if ( f.flags.sign ) { fmtstr[star] = '+'; star -= 1; } \
                 if ( f.flags.pad0 && ! f.flags.pc ) { fmtstr[star] = '0'; star -= 1; } \
                 fmtstr[star] = '%'; \
 …
                 if ( ! f.flags.pc ) {                                                   /* no precision */ \
                         fmtstr[sizeof(IFMTNP)-2] = f.base;                      /* sizeof includes '\0' */ \
                         /* printf( "%s %c %c\n", &fmtstr[star], f.base, CODE ); */ \
+                        /* printf( "%s %c\n", &fmtstr[star], f.base ); */ \
                         fmt( os, &fmtstr[star], f.wd, f.val ); \
                 } else {                                                                                /* precision */ \
                         fmtstr[sizeof(IFMTP)-2] = f.base;                       /* sizeof includes '\0' */ \
                         /* printf( "%s %c %c\n", &fmtstr[star], f.base, CODE ); */ \
+                        /* printf( "%s %c\n", &fmtstr[star], f.base ); */ \
                         fmt( os, &fmtstr[star], f.wd, f.pc, f.val ); \
                 } /* if */ \
 …
 } // distribution
 IntegralFMTImpl( signed char, 'd', "%    *hh ", "%    *.*hh " )
 IntegralFMTImpl( unsigned char, 'u', "%    *hh ", "%    *.*hh " )
 IntegralFMTImpl( signed short int, 'd', "%    *h ", "%    *.*h " )
 IntegralFMTImpl( unsigned short int, 'u', "%    *h ", "%    *.*h " )
 IntegralFMTImpl( signed int, 'd', "%    * ", "%    *.* " )
 IntegralFMTImpl( unsigned int, 'u', "%    * ", "%    *.* " )
 IntegralFMTImpl( signed long int, 'd', "%    *l ", "%    *.*l " )
 IntegralFMTImpl( unsigned long int, 'u', "%    *l ", "%    *.*l " )
 IntegralFMTImpl( signed long long int, 'd', "%    *ll ", "%    *.*ll " )
 IntegralFMTImpl( unsigned long long int, 'u', "%    *ll ", "%    *.*ll " )
+IntegralFMTImpl( signed char, "%    *hh ", "%    *.*hh " )
+IntegralFMTImpl( unsigned char, "%    *hh ", "%    *.*hh " )
+IntegralFMTImpl( signed short int, "%    *h ", "%    *.*h " )
+IntegralFMTImpl( unsigned short int, "%    *h ", "%    *.*h " )
+IntegralFMTImpl( signed int, "%    * ", "%    *.* " )
+IntegralFMTImpl( unsigned int, "%    * ", "%    *.* " )
+IntegralFMTImpl( signed long int, "%    *l ", "%    *.*l " )
+IntegralFMTImpl( unsigned long int, "%    *l ", "%    *.*l " )
+IntegralFMTImpl( signed long long int, "%    *ll ", "%    *.*ll " )
+IntegralFMTImpl( unsigned long long int, "%    *ll ", "%    *.*ll " )
+#if 0
 #if defined( __SIZEOF_INT128__ )
 // Default prefix for non-decimal prints is 0b, 0, 0x.
 #define IntegralFMTImpl128( T, SIGNED, CODE, IFMTNP, IFMTP ) \
 forall( dtype ostype | ostream( ostype ) ) \
+static void base10_128( ostype & os, _Ostream_Manip(T) fmt ) { \
+        if ( fmt.val > UINT64_MAX ) { \
+                fmt.val /= P10_UINT64; \
+                base10_128( os, fmt ); /* recursive */ \
+                _Ostream_Manip(unsigned long long int) fmt2 @= { (uint64_t)(fmt.val % P10_UINT64), 0, 19, 'u', { .all : 0 } }; \
+                fmt2.flags.nobsdp = true; \
+                printf( "fmt2 %c %lld %d\n", fmt2.base, fmt2.val, fmt2.all );   \
+static void base10_128( ostype & os, _Ostream_Manip(T) f ) { \
+        if ( f.val > UINT64_MAX ) { \
+                unsigned long long int lsig = f.val % P10_UINT64; \
+                f.val /= P10_UINT64; /* msig */ \
+                base10_128( os, f ); /* recursion */ \
+                _Ostream_Manip(unsigned long long int) fmt @= { lsig, 0, 19, 'u', { .all : 0 } }; \
+                fmt.flags.nobsdp = true; \
+                /* printf( "fmt1 %c %lld %d\n", fmt.base, fmt.val, fmt.all ); */ \
                 sepOff( os ); \
                 (ostype &)(os | fmt2); \
+                (ostype &)(os | fmt); \
         } else { \
+                printf( "fmt %c %lld %d\n", fmt.base, fmt.val, fmt.all ); \
+                /* printf( "fmt2 %c %lld %d\n", f.base, (unsigned long long int)f.val, f.all ); */ \
+                _Ostream_Manip(SIGNED long long int) fmt @= { (SIGNED long long int)f.val, f.wd, f.pc, f.base, { .all : f.all } }; \
                 (ostype &)(os | fmt); \
         } /* if */ \
 } /* base10_128 */                                                 \
+} /* base10_128 */ \
 forall( dtype ostype | ostream( ostype ) ) { \
         ostype & ?|?( ostype & os, _Ostream_Manip(T) f ) { \
                 if ( $sepPrt( os ) ) fmt( os, "%s", $sepGetCur( os ) ); \
+\
                 if ( f.base == 'b' | f.base == 'o' | f.base == 'x' | f.base == 'X' ) { \
+                if ( f.base == 'b' | f.base == 'B' | f.base == 'o' | f.base == 'x' | f.base == 'X' ) { \
                         unsigned long long int msig = (unsigned long long int)(f.val >> 64); \
                         unsigned long long int lsig = (unsigned long long int)(f.val); \
 …
                         } else { \
                                 fmt2.flags.pad0 = fmt2.flags.nobsdp = true;     \
+                                if ( f.base == 'b' ) { \
+                                        if ( f.wd > 64 ) fmt.wd = f.wd - 64; \
+                                        fmt2.wd = 64; \
+                                if ( f.base == 'b' | f.base == 'B' ) { \
+                                        if ( fmt.flags.pc && fmt.pc > 64 ) fmt.pc -= 64; else { fmt.flags.pc = false; fmt.pc = 0; } \
+                                        if ( fmt.flags.left ) { \
+                                                fmt.flags.left = false; \
+                                                fmt.wd = 0; \
+                                                /* printf( "L %llo %llo %llo %d %d '%c' %x\n", msig, lsig, fmt.val, fmt.wd, fmt.pc, fmt.base, fmt.all ); */ \
+                                                fmt2.flags.left = true; \
+                                                int msigd = high1( msig ); \
+                                                fmt2.wd = f.wd - (fmt.pc > msigd ? fmt.pc : msigd); \
+                                                if ( ! fmt.flags.nobsdp ) fmt2.wd -= 2; /* compensate for 0b base specifier */ \
+                                                if ( (int)fmt2.wd < 64 ) fmt2.wd = 64; /* cast deals with negative value */ \
+                                                fmt2.flags.pc = true; fmt2.pc = 64; \
+                                        } else { \
+                                                if ( fmt.wd > 64 ) fmt.wd -= 64; \
+                                                else fmt.wd = 1; \
+                                                /* printf( "R %llo %llo %llo %d %d '%c' %x\n", msig, lsig, fmt.val, fmt.wd, fmt.pc, fmt.base, fmt.all ); */ \
+                                                fmt2.wd = 64; \
+                                        } /* if */ \
+                                        /* printf( "C %llo %d %d '%c' %x\n", fmt2.val, fmt2.wd, fmt2.pc, fmt2.base, fmt2.all ); */ \
                                         (ostype &)(os | fmt | "" | fmt2); \
                                 } else if ( f.base == 'o' ) { \
+                                        if ( fmt.flags.pc && fmt.pc > 22 ) fmt.pc -= 22; else { fmt.flags.pc = false; fmt.pc = 0; } \
                                         fmt.val = (unsigned long long int)fmt.val >> 2; \
+                                        if ( f.wd > 21 ) fmt.wd = f.wd - 21; \
+                                        fmt2.wd = 1; \
+                                        fmt2.val = ((msig & 0x3) << 1) + 1; \
+                                        (ostype &)(os | fmt | "" | fmt2); \
+                                        sepOff( os ); \
+                                        fmt2.wd = 21; \
+                                        fmt2.val = lsig & 0x7fffffffffffffff; \
+                                        fmt2.val = ((msig & 0x3) << 1) + ((lsig & 0x8000000000000000U) != 0); \
+                                        if ( fmt.flags.left ) { \
+                                                fmt.flags.left = false; \
+                                                fmt.wd = 0; \
+                                                /* printf( "L %llo %llo %llo %d %d '%c' %x %llo %d %d '%c' %x\n", msig, lsig, fmt.val, fmt.wd, fmt.pc, fmt.base, fmt.all, fmt2.val, fmt2.wd, fmt2.pc, fmt2.base, fmt2.all ); */ \
+                                                (ostype &)(os | fmt | "" | fmt2); \
+                                                sepOff( os ); \
+                                                fmt2.flags.left = true; \
+                                                int msigd = ceiling_div( high1( fmt.val ), 3 ); \
+                                                fmt2.wd = f.wd - (fmt.pc > msigd ? fmt.pc : msigd); \
+                                                if ( ! fmt.flags.nobsdp ) fmt2.wd -= 1; /* compensate for 0 base specifier */ \
+                                                if ( (int)fmt2.wd < 21 ) fmt2.wd = 21; /* cast deals with negative value */ \
+                                                fmt2.flags.pc = true; fmt2.pc = 21; \
+                                        } else { \
+                                                if ( fmt.wd > 22 ) fmt.wd -= 22; \
+                                                else fmt.wd = 1; \
+                                                /* printf( "R %llo %llo %llo %d %d '%c' %x %llo %d %d '%c' %x\n", msig, lsig, fmt.val, fmt.wd, fmt.pc, fmt.base, fmt.all, fmt2.val, fmt2.wd, fmt2.pc, fmt2.base, fmt2.all ); */ \
+                                                (ostype &)(os | fmt | "" | fmt2); \
+                                                sepOff( os ); \
+                                                fmt2.wd = 21; \
+                                        } /* if */ \
+                                        fmt2.val = lsig & 0x7fffffffffffffffU; \
+                                        /* printf( "\nC %llo %d %d '%c' %x\n", fmt2.val, fmt2.wd, fmt2.pc, fmt2.base, fmt2.all ); */ \
                                         (ostype &)(os | fmt2); \
+                                } else { \
+                                        if ( f.flags.left ) { \
+                                                if ( f.wd > 16 ) fmt2.wd = f.wd - 16;   \
+                                                fmt.wd = 16;                                                    \
+                                } else { /* f.base == 'x'  | f.base == 'X' */ \
+                                        if ( fmt.flags.pc && fmt.pc > 16 ) fmt.pc -= 16; else { fmt.flags.pc = false; fmt.pc = 0; } \
+                                        if ( fmt.flags.left ) { \
+                                                fmt.flags.left = false; \
+                                                fmt.wd = 0; \
+                                                /* printf( "L %llo %llo %llo %d %d '%c' %x\n", msig, lsig, fmt.val, fmt.wd, fmt.pc, fmt.base, fmt.all ); */ \
+                                                fmt2.flags.left = true; \
+                                                int msigd = high1( msig ); \
+                                                fmt2.wd = f.wd - (fmt.pc > msigd ? fmt.pc : msigd); \
+                                                if ( ! fmt.flags.nobsdp ) fmt2.wd -= 2; /* compensate for 0x base specifier */ \
+                                                if ( (int)fmt2.wd < 16 ) fmt2.wd = 16; /* cast deals with negative value */ \
+                                                fmt2.flags.pc = true; fmt2.pc = 16; \
                                         } else { \
+                                                if ( f.wd > 16 ) fmt.wd = f.wd - 16;    \
+                                                fmt2.wd = 16;                                                   \
+                                                if ( fmt.wd > 16 ) fmt.wd -= 16; \
+                                                else fmt.wd = 1; \
+                                                /* printf( "R %llo %llo %llo %d %d '%c' %x\n", msig, lsig, fmt.val, fmt.wd, fmt.pc, fmt.base, fmt.all ); */ \
+                                                fmt2.wd = 16; \
                                         } /* if */ \
+                                        /* printf( "C %llo %d %d '%c' %x\n", fmt2.val, fmt2.wd, fmt2.pc, fmt2.base, fmt2.all ); */ \
                                         (ostype &)(os | fmt | "" | fmt2); \
                                 } /* if */ \
                         } /* if */ \
                 } else { \
+                        if ( CODE == 'd' ) { \
+                                if ( f.val < 0 )  { fmt( os, "-" ); sepOff( os ); f.val = -f.val; f.flags.sign = false; } \
+                        } /* if */ \
                         base10_128( os, f ); \
                 } /* if */ \
 …
 IntegralFMTImpl128( unsigned int128, unsigned, 'u', "%    *ll ", "%    *.*ll " )
 #endif // __SIZEOF_INT128__
+//*********************************** floating point ***********************************
+#endif // 0
+#if 1
+#if defined( __SIZEOF_INT128__ )
+// Default prefix for non-decimal prints is 0b, 0, 0x.
+// Print a 128-bit value as a sequence of 64-bit chunks, most-significant chunk first, reusing manipulator f for
+// every chunk.
+//   os     - output stream
+//   val    - magnitude still to print
+//   power  - chunk divisor; callers pass 2^64 (binary, hex), 2^63 (octal) or 10^19 (decimal)
+//   f      - 64-bit manipulator carrying width, precision, base and flags; modified as chunks are printed
+//   maxdig - digits printed for every chunk after the first (callers pass 64, 21, 19, 16)
+//   bits   - bits per digit for power-of-two bases (callers pass 1, 3, 4; 0 for decimal)
+//   cnt    - recursion depth: 0 for the least-significant chunk, incremented for each more-significant chunk
+forall( dtype ostype | ostream( ostype ) )
+static inline void base_128( ostype & os, unsigned int128 val, unsigned int128 power, _Ostream_Manip(uint64_t) & f, unsigned int maxdig, unsigned int bits, unsigned int cnt = 0 ) {
+        int wd = 1;                                                                                     // f.wd is never 0 because 0 implies left-pad
+        // Use >= (not >): when val == power and power is 2^64, assigning val to the 64-bit f.val below would truncate
+        // to 0 and print "0". Splitting at equality yields the same digits for the 2^63 and 10^19 divisors.
+        if ( val >= power ) {                                                           // subdivide value into printable 64-bit values
+                base_128( os, val / power, power, f, maxdig, bits, cnt + 1 ); // recursive
+                f.val = val % power;
+                if ( cnt == 1 && f.flags.left ) { wd = f.wd; f.wd = maxdig; } // copy f.wd and reset for printing middle chunk
+                // printf( "R val:%#lx(%lu) wd:%u pc:%u base:%c neg:%d pc:%d left:%d nobsdp:%d sign:%d pad0:%d\n",
+                //              f.val, f.val, f.wd, f.pc, f.base, f.flags.neg, f.flags.pc, f.flags.left, f.flags.nobsdp, f.flags.sign, f.flags.pad0 );
+                (ostype &)(os | f);
+                if ( cnt == 1 ) {
+                        if ( f.flags.left ) { wd -= maxdig; f.wd = wd < 0 ? 1 : wd; } // update and restore f.wd for printing end chunk
+                        sepOff( os );                                                           // no separator between chunks
+                } // if
+        } else {                                                                                        // print start chunk
+                f.val = val;
+                // f.pc is unsigned => use wd
+                if ( f.flags.pc && f.pc > maxdig * cnt ) { wd = f.pc - maxdig * cnt; f.pc = wd < 0 ? 0 : wd; }
+                else { f.flags.pc = false; f.pc = 0; }
+                if ( ! f.flags.left ) {                                                 // right justify
+                        wd = f.wd - maxdig * cnt;
+                        f.wd = wd < 0 ? 1 : wd;
+                        wd = maxdig;
+                } else {                                                                                // left justify
+                        if ( cnt != 0 ) {                                                       // value >= 2^64 ?
+                                unsigned int dig, bs = 0;
+                                // compute size of prefix digits and base
+                                if ( f.base == 'd' || f.base == 'u' ) { // no base prefix
+                                        // NOTE(review): ceil( log10( v ) ) is one short when v is an exact power of ten
+                                        // (e.g. 10 gives 1) -- confirm whether the resulting padding is acceptable.
+                                        dig = ceil( log10( f.val ) );           // use floating-point
+                                        if ( f.base == 'd' && (f.flags.neg || f.flags.sign) ) bs = 1; // sign ?
+                                } else {
+                                        dig = ceiling_div( high1( f.val ), bits );
+                                        if ( ! f.flags.nobsdp ) {                       // base prefix ?
+                                                if ( f.base == 'o' ) {
+                                                        // 0 prefix for octal is not added for precision with leading zero
+                                                        if ( f.pc <= dig ) bs = 1;      // 1 character prefix
+                                                } else bs = 2;                                  // 2 character prefix
+                                        } // if
+                                } // if
+                                wd = f.wd - (f.pc > dig ? f.pc : dig) - bs; // precision > leading digits ?
+                                if ( wd < 0 ) wd = 1;
+                                f.wd = 1;
+                        } // if
+                        // all manipulators handled implicitly for value < 2^64
+                } // if
+                // prior checks ensure wd not negative
+                if ( f.flags.neg ) f.val = -f.val;
+                // printf( "L val:%#lx(%lu) wd:%u pc:%u base:%c neg:%d pc:%d left:%d nobsdp:%d sign:%d pad0:%d\n",
+                //              f.val, f.val, f.wd, f.pc, f.base, f.flags.neg, f.flags.pc, f.flags.left, f.flags.nobsdp, f.flags.sign, f.flags.pad0 );
+                (ostype &)(os | f);
+                // remaining middle and end chunks are padded with 0s on the left
+                if ( ! f.flags.left ) { f.flags.pad0 = true; f.flags.pc = false; } // left pad with 0s
+                else { f.pc = maxdig; f.flags.pc = true; }              // left pad with precision
+                if ( cnt != 0 ) sepOff( os );                                   // no separator between chunks
+                f.wd = wd;                                                                              // reset f.wd for next chunk
+                f.flags.sign = false;                                                   // no leading +/- sign
+                f.flags.nobsdp = true;                                                  // no leading base prefix
+        } // if
+} // base_128
+// Generates the two ostream ?|? operators for a 128-bit manipulator type T. The value-returning form copies the
+// wd, pc, base and all fields of f into a 64-bit manipulator and hands the value to base_128 with a chunk divisor,
+// digits-per-chunk and bits-per-digit chosen from f.base; the void form prints the same way and then calls ends.
+#define IntegralFMTImpl128( T ) \
+forall( dtype ostype | ostream( ostype ) ) { \
+        ostype & ?|?( ostype & os, _Ostream_Manip(T) f ) { \
+                _Ostream_Manip(uint64_t) fmt; /* 64-bit manipulator passed by reference to base_128 */ \
+                fmt.[wd, pc, base, all] = f.[wd, pc, base, all]; \
+                if ( f.base == 'b' | f.base == 'B' ) { /* binary: divisor 2^64, 64 digits per chunk, 1 bit per digit */ \
+                        base_128( os, f.val, (unsigned int128)1 << 64, fmt, 64, 1 ); \
+                } else if ( f.base == 'o' ) { /* octal: divisor 2^63, 21 digits per chunk, 3 bits per digit */ \
+                        base_128( os, f.val, (unsigned int128)1 << 63, fmt, 21, 3 ); \
+                } else if ( f.base == 'd' || f.base == 'u' ) { /* decimal: divisor 10^19, 19 digits per chunk */ \
+                        /* negative signed value: pass the magnitude and record the sign in the neg flag */ \
+                        if ( f.base == 'd' && f.val < 0 ) { f.val = -f.val; fmt.flags.neg = true; } \
+                        base_128( os, f.val, (unsigned int128)10_000_000_000_000_000_000UL, fmt, 19, 0 ); \
+                } else { /* any other base: divisor 2^64, 16 digits per chunk, 4 bits per digit */ \
+                        base_128( os, f.val, (unsigned int128)1 << 64, fmt, 16, 4 ); \
+                } /* if */ \
+                return os; \
+        } /* ?|? */ \
+        void ?|?( ostype & os, _Ostream_Manip(T) f ) { (ostype &)(os | f); ends( os ); } \
+} // distribution
+IntegralFMTImpl128( int128 )
+IntegralFMTImpl128( unsigned int128 )
+#endif // __SIZEOF_INT128__
+#endif // 1
+// *********************************** floating point ***********************************
 #define PrintWithDP2( os, format, val, ... ) \
 …
 FloatingPointFMTImpl( long double, "%    *L ", "%    *.*L " )
 //*********************************** character ***********************************
+// *********************************** character ***********************************
 forall( dtype ostype | ostream( ostype ) ) {
 …
 } // distribution
 //*********************************** C string ***********************************
+// *********************************** C string ***********************************
 forall( dtype ostype | ostream( ostype ) ) {
 …
 //*********************************** istream ***********************************
+// *********************************** istream ***********************************
 …
         } // ?|?
+#if defined( __SIZEOF_INT128__ )
+        // Read an int128 by reinterpreting the target as unsigned int128 and delegating to the unsigned reader.
+        istype & ?|?( istype & is, int128 & i128 ) {
+                return (istype &)(is | (unsigned int128 &)i128);
+        } // ?|?
+        // Read a decimal unsigned int128: skip leading whitespace, accept an optional run of '-' characters, then
+        // accumulate up to 39 digits into ui128 (negated when a '-' was seen). When no digits follow, ui128 is left
+        // untouched and a single '-' is pushed back if one had been consumed.
+        istype & ?|?( istype & is, unsigned int128 & ui128 ) {
+                char s[40];                                                                             // 39 characters plus '\0'
+                bool sign = false;
+                // The field width bounds the scanset so a long run of '-' cannot write past the end of s.
+                if ( fmt( is, " %39[-]", s ) == 1 ) sign = true;        // skip whitespace, negative sign ?
+                // If the input is too large, the value returned is undefined. If there is no input, no value is returned
+                if ( fmt( is, "%39[0-9]%*[0-9]", s ) == 1 ) {   // take first 39 characters, ignore remaining
+                        ui128 = 0;
+                        for ( unsigned int i = 0; s[i] != '\0'; i += 1 ) {
+                                ui128 = ui128 * 10 + s[i] - '0';
+                        } // for
+                        if ( sign ) ui128 = -ui128;
+                } else if ( sign ) ungetc( is, '-' );                   // return minus when no digits
+                return is;
+        } // ?|?
+#endif // __SIZEOF_INT128__
         istype & ?|?( istype & is, float & f ) {
 …
 } // distribution
 //*********************************** manipulators ***********************************
+// *********************************** manipulators ***********************************
 forall( dtype istype | istream( istype ) )

libcfa/src/iostream.hfa

-              r3c64c668
+              r58fe85a
 // Created On       : Wed May 27 17:56:53 2015
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Thu Feb 20 15:30:56 2020
 // Update Count     : 337
+// Last Modified On : Tue Aug 11 22:16:14 2020
+// Update Count     : 350
 //
 …
 //*********************************** ostream ***********************************
+// *********************************** ostream ***********************************
 …
 forall( dtype ostype | ostream( ostype ) ) {
-        ostype & ?|?( ostype &, zero_t );
-        void ?|?( ostype &, zero_t );
-        ostype & ?|?( ostype &, one_t );
-        void ?|?( ostype &, one_t );
         ostype & ?|?( ostype &, bool );
         void ?|?( ostype &, bool );
 …
 } // distribution
 //*********************************** manipulators ***********************************
+// *********************************** manipulators ***********************************
 forall( otype T )
 …
                 unsigned char all;
                 struct {
+                        unsigned char neg:1;                                            // val is negative
                         unsigned char pc:1;                                                     // precision specified
                         unsigned char left:1;                                           // left justify
 …
 }; // _Ostream_Manip
 //*********************************** integral ***********************************
+// *********************************** integral ***********************************
 // See 6.7.9. 19) The initialization shall occur in initializer list order, each initializer provided for a particular
 …
 IntegralFMTDecl( int128, 'd' )
 IntegralFMTDecl( unsigned int128, 'u' )
 #endif
 //*********************************** floating point ***********************************
+#endif // __SIZEOF_INT128__
+// *********************************** floating point ***********************************
 // Default suffix for values with no fraction is "."
 …
 FloatingPointFMTDecl( long double )
 //*********************************** character ***********************************
+// *********************************** character ***********************************
 static inline {
 …
 } // ?|?
 //*********************************** C string ***********************************
+// *********************************** C string ***********************************
 static inline {
 …
 //*********************************** istream ***********************************
+// *********************************** istream ***********************************
 …
         istype & ?|?( istype &, unsigned int & );
         istype & ?|?( istype &, long int & );
+        istype & ?|?( istype &, unsigned long int & );
         istype & ?|?( istype &, long long int & );
-        istype & ?|?( istype &, unsigned long int & );
         istype & ?|?( istype &, unsigned long long int & );
+#if defined( __SIZEOF_INT128__ )
+        istype & ?|?( istype &, int128 & );
+        istype & ?|?( istype &, unsigned int128 & );
+#endif // __SIZEOF_INT128__
         istype & ?|?( istype &, float & );
 …
 } // distribution
 //*********************************** manipulators ***********************************
+// *********************************** manipulators ***********************************
 struct _Istream_Cstr {
 …
         _Istream_Cstr excl( const char scanset[], char * s ) { return (_Istream_Cstr){ s, scanset, -1, { .flags.inex : true } }; }
         _Istream_Cstr & excl( const char scanset[], _Istream_Cstr & fmt ) { fmt.scanset = scanset; fmt.flags.inex = true; return fmt; }
         _Istream_Cstr ignore( const char s[] ) { return (_Istream_Cstr)@{ s, 0p, -1, { .flags.ignore : true } }; }
+        _Istream_Cstr ignore( char s[] ) { return (_Istream_Cstr)@{ s, 0p, -1, { .flags.ignore : true } }; }
         _Istream_Cstr & ignore( _Istream_Cstr & fmt ) { fmt.flags.ignore = true; return fmt; }
         _Istream_Cstr wdi( unsigned int w, char s[] ) { return (_Istream_Cstr)@{ s, 0p, w, { .all : 0 } }; }
 …
 static inline {
         _Istream_Char ignore( const char c ) { return (_Istream_Char)@{ true }; }
+        _Istream_Char ignore( const char ) { return (_Istream_Char)@{ true }; }
         _Istream_Char & ignore( _Istream_Char & fmt ) { fmt.ignore = true; return fmt; }
 } // distribution
 forall( dtype istype | istream( istype ) ) istype & ?|?( istype & is, _Istream_Char f );
 forall( otype T )
+forall( dtype T | sized( T ) )
 struct _Istream_Manip {
         T & val;                                                                                        // polymorphic base-type
 …
 //*********************************** time ***********************************
+// *********************************** time ***********************************

libcfa/src/math.hfa

-              r3c64c668
+              r58fe85a
 // Created On       : Mon Apr 18 23:37:04 2016
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Tue Feb  4 10:27:11 2020
 // Update Count     : 117
+// Last Modified On : Mon Aug 24 08:56:20 2020
+// Update Count     : 126
 //
 …
 #include <complex.h>
+//---------------------------------------
+#include "common.hfa"
 //---------------------- General ----------------------
+static inline float ?%?( float x, float y ) { return fmodf( x, y ); } // General: type-overloaded wrappers; operator % on float forwards to fmodf
+static inline float fmod( float x, float y ) { return fmodf( x, y ); }
+static inline double ?%?( double x, double y ) { return fmod( x, y ); } // operator % on double forwards to fmod
+// extern "C" { double fmod( double, double ); } -- NOTE(review): no double wrapper is defined here; presumably the C declaration is used directly, confirm
+static inline long double ?%?( long double x, long double y ) { return fmodl( x, y ); }
+static inline long double fmod( long double x, long double y ) { return fmodl( x, y ); }
+static inline float remainder( float x, float y ) { return remainderf( x, y ); }
+// extern "C" { double remainder( double, double ); }
+static inline long double remainder( long double x, long double y ) { return remainderl( x, y ); }
+static inline float remquo( float x, float y, int * quo ) { return remquof( x, y, quo ); } // pointer form: quo is passed straight through to remquof
+// extern "C" { double remquo( double x, double y, int * quo ); }
+static inline long double remquo( long double x, long double y, int * quo ) { return remquol( x, y, quo ); }
+static inline [ int, float ] remquo( float x, float y ) { int quo; x = remquof( x, y, &quo ); return [ quo, x ]; } // tuple form: returns [ quo, remquof result ] instead of writing through a pointer
+static inline [ int, double ] remquo( double x, double y ) { int quo; x = remquo( x, y, &quo ); return [ quo, x ]; }
+static inline [ int, long double ] remquo( long double x, long double y ) { int quo; x = remquol( x, y, &quo ); return [ quo, x ]; }
+static inline [ float, float ] div( float x, float y ) { y = modff( x / y, &x ); return [ x, y ]; } // modff splits x / y: x receives the part stored through the pointer, y the returned part
+static inline [ double, double ] div( double x, double y ) { y = modf( x / y, &x ); return [ x, y ]; }
+static inline [ long double, long double ] div( long double x, long double y ) { y = modfl( x / y, &x ); return [ x, y ]; }
+static inline float fma( float x, float y, float z ) { return fmaf( x, y, z ); }
+// extern "C" { double fma( double, double, double ); }
+static inline long double fma( long double x, long double y, long double z ) { return fmal( x, y, z ); }
+static inline float fdim( float x, float y ) { return fdimf( x, y ); }
+// extern "C" { double fdim( double, double ); }
+static inline long double fdim( long double x, long double y ) { return fdiml( x, y ); }
+static inline float nan( const char tag[] ) { return nanf( tag ); } // overloaded on return type only: float result comes from nanf
+// extern "C" { double nan( const char [] ); }
+static inline long double nan( const char tag[] ) { return nanl( tag ); }
+static inline { // General: every routine in this block is static inline and forwards to the C routine with the matching f/l suffix
+        float ?%?( float x, float y ) { return fmodf( x, y ); } // operator % on float forwards to fmodf
+        float fmod( float x, float y ) { return fmodf( x, y ); }
+        double ?%?( double x, double y ) { return fmod( x, y ); } // operator % on double forwards to fmod
+        // extern "C" { double fmod( double, double ); } -- NOTE(review): no double wrapper is defined here; presumably the C declaration is used directly, confirm
+        long double ?%?( long double x, long double y ) { return fmodl( x, y ); }
+        long double fmod( long double x, long double y ) { return fmodl( x, y ); }
+        float remainder( float x, float y ) { return remainderf( x, y ); }
+        // extern "C" { double remainder( double, double ); }
+        long double remainder( long double x, long double y ) { return remainderl( x, y ); }
+        float remquo( float x, float y, int * quo ) { return remquof( x, y, quo ); } // pointer form: quo is passed straight through to remquof
+        // extern "C" { double remquo( double x, double y, int * quo ); }
+        long double remquo( long double x, long double y, int * quo ) { return remquol( x, y, quo ); }
+        [ int, float ] remquo( float x, float y ) { int quo; x = remquof( x, y, &quo ); return [ quo, x ]; } // tuple form: returns [ quo, remquof result ] instead of writing through a pointer
+        [ int, double ] remquo( double x, double y ) { int quo; x = remquo( x, y, &quo ); return [ quo, x ]; }
+        [ int, long double ] remquo( long double x, long double y ) { int quo; x = remquol( x, y, &quo ); return [ quo, x ]; }
+        [ float, float ] div( float x, float y ) { y = modff( x / y, &x ); return [ x, y ]; } // modff splits x / y: x receives the part stored through the pointer, y the returned part
+        [ double, double ] div( double x, double y ) { y = modf( x / y, &x ); return [ x, y ]; }
+        [ long double, long double ] div( long double x, long double y ) { y = modfl( x / y, &x ); return [ x, y ]; }
+        float fma( float x, float y, float z ) { return fmaf( x, y, z ); }
+        // extern "C" { double fma( double, double, double ); }
+        long double fma( long double x, long double y, long double z ) { return fmal( x, y, z ); }
+        float fdim( float x, float y ) { return fdimf( x, y ); }
+        // extern "C" { double fdim( double, double ); }
+        long double fdim( long double x, long double y ) { return fdiml( x, y ); }
+        float nan( const char tag[] ) { return nanf( tag ); } // overloaded on return type only: float result comes from nanf
+        // extern "C" { double nan( const char [] ); }
+        long double nan( const char tag[] ) { return nanl( tag ); }
+} // distribution
 //---------------------- Exponential ----------------------
+static inline float exp( float x ) { return expf( x ); } // Exponential: type-overloaded wrappers; real overloads forward to the f/l-suffixed C routines
+// extern "C" { double exp( double ); } -- NOTE(review): no double wrapper is defined here; presumably the C declaration is used directly, confirm
+static inline long double exp( long double x ) { return expl( x ); }
+static inline float _Complex exp( float _Complex x ) { return cexpf( x ); } // complex overloads forward to the c-prefixed routines
+static inline double _Complex exp( double _Complex x ) { return cexp( x ); }
+static inline long double _Complex exp( long double _Complex x ) { return cexpl( x ); }
+static inline float exp2( float x ) { return exp2f( x ); }
+// extern "C" { double exp2( double ); }
+static inline long double exp2( long double x ) { return exp2l( x ); }
+//static inline float _Complex exp2( float _Complex x ) { return cexp2f( x ); }
+//static inline double _Complex exp2( double _Complex x ) { return cexp2( x ); }
+//static inline long double _Complex exp2( long double _Complex x ) { return cexp2l( x ); }
+static inline float expm1( float x ) { return expm1f( x ); }
+// extern "C" { double expm1( double ); }
+static inline long double expm1( long double x ) { return expm1l( x ); }
+static inline float pow( float x, float y ) { return powf( x, y ); }
+// extern "C" { double pow( double, double ); }
+static inline long double pow( long double x, long double y ) { return powl( x, y ); }
+static inline float _Complex pow( float _Complex x, float _Complex y ) { return cpowf( x, y ); }
+static inline double _Complex pow( double _Complex x, double _Complex y ) { return cpow( x, y ); }
+static inline long double _Complex pow( long double _Complex x, long double _Complex y ) { return cpowl( x, y ); }
+static inline { // Exponential: every routine in this block is static inline; real overloads forward to the f/l-suffixed C routines
+        float exp( float x ) { return expf( x ); }
+        // extern "C" { double exp( double ); } -- NOTE(review): no double wrapper is defined here; presumably the C declaration is used directly, confirm
+        long double exp( long double x ) { return expl( x ); }
+        float _Complex exp( float _Complex x ) { return cexpf( x ); } // complex overloads forward to the c-prefixed routines
+        double _Complex exp( double _Complex x ) { return cexp( x ); }
+        long double _Complex exp( long double _Complex x ) { return cexpl( x ); }
+        float exp2( float x ) { return exp2f( x ); }
+        // extern "C" { double exp2( double ); }
+        long double exp2( long double x ) { return exp2l( x ); }
+        //float _Complex exp2( float _Complex x ) { return cexp2f( x ); }
+        //double _Complex exp2( double _Complex x ) { return cexp2( x ); }
+        //long double _Complex exp2( long double _Complex x ) { return cexp2l( x ); }
+        float expm1( float x ) { return expm1f( x ); }
+        // extern "C" { double expm1( double ); }
+        long double expm1( long double x ) { return expm1l( x ); }
+        float pow( float x, float y ) { return powf( x, y ); }
+        // extern "C" { double pow( double, double ); }
+        long double pow( long double x, long double y ) { return powl( x, y ); }
+        float _Complex pow( float _Complex x, float _Complex y ) { return cpowf( x, y ); }
+        double _Complex pow( double _Complex x, double _Complex y ) { return cpow( x, y ); }
+        long double _Complex pow( long double _Complex x, long double _Complex y ) { return cpowl( x, y ); }
+} // distribution
 //---------------------- Logarithm ----------------------
+static inline float log( float x ) { return logf( x ); } // Logarithm: type-overloaded wrappers; real overloads forward to the f/l-suffixed C routines
+// extern "C" { double log( double ); } -- NOTE(review): no double wrapper is defined here; presumably the C declaration is used directly, confirm
+static inline long double log( long double x ) { return logl( x ); }
+static inline float _Complex log( float _Complex x ) { return clogf( x ); } // complex overloads forward to the c-prefixed routines
+static inline double _Complex log( double _Complex x ) { return clog( x ); }
+static inline long double _Complex log( long double _Complex x ) { return clogl( x ); }
+static inline float log2( float x ) { return log2f( x ); }
+// extern "C" { double log2( double ); }
+static inline long double log2( long double x ) { return log2l( x ); }
+// static inline float _Complex log2( float _Complex x ) { return clog2f( x ); }
+// static inline double _Complex log2( double _Complex x ) { return clog2( x ); }
+// static inline long double _Complex log2( long double _Complex x ) { return clog2l( x ); }
+static inline float log10( float x ) { return log10f( x ); }
+// extern "C" { double log10( double ); }
+static inline long double log10( long double x ) { return log10l( x ); }
+// static inline float _Complex log10( float _Complex x ) { return clog10f( x ); }
+// static inline double _Complex log10( double _Complex x ) { return clog10( x ); }
+// static inline long double _Complex log10( long double _Complex x ) { return clog10l( x ); }
+static inline float log1p( float x ) { return log1pf( x ); }
+// extern "C" { double log1p( double ); }
+static inline long double log1p( long double x ) { return log1pl( x ); }
+static inline int ilogb( float x ) { return ilogbf( x ); } // ilogb overloads return int for every argument type
+// extern "C" { int ilogb( double ); }
+static inline int ilogb( long double x ) { return ilogbl( x ); }
+static inline float logb( float x ) { return logbf( x ); }
+// extern "C" { double logb( double ); }
+static inline long double logb( long double x ) { return logbl( x ); }
+static inline float sqrt( float x ) { return sqrtf( x ); }
+// extern "C" { double sqrt( double ); }
+static inline long double sqrt( long double x ) { return sqrtl( x ); }
+static inline float _Complex sqrt( float _Complex x ) { return csqrtf( x ); }
+static inline double _Complex sqrt( double _Complex x ) { return csqrt( x ); }
+static inline long double _Complex sqrt( long double _Complex x ) { return csqrtl( x ); }
+static inline float cbrt( float x ) { return cbrtf( x ); }
+// extern "C" { double cbrt( double ); }
+static inline long double cbrt( long double x ) { return cbrtl( x ); }
+static inline float hypot( float x, float y ) { return hypotf( x, y ); }
+// extern "C" { double hypot( double, double ); }
+static inline long double hypot( long double x, long double y ) { return hypotl( x, y ); }
+static inline { // Logarithm: every routine in this block is static inline; real overloads forward to the f/l-suffixed C routines
+        float log( float x ) { return logf( x ); }
+        // extern "C" { double log( double ); } -- NOTE(review): no double wrapper is defined here; presumably the C declaration is used directly, confirm
+        long double log( long double x ) { return logl( x ); }
+        float _Complex log( float _Complex x ) { return clogf( x ); } // complex overloads forward to the c-prefixed routines
+        double _Complex log( double _Complex x ) { return clog( x ); }
+        long double _Complex log( long double _Complex x ) { return clogl( x ); }
+        float log2( float x ) { return log2f( x ); }
+        // extern "C" { double log2( double ); }
+        long double log2( long double x ) { return log2l( x ); }
+        // float _Complex log2( float _Complex x ) { return clog2f( x ); }
+        // double _Complex log2( double _Complex x ) { return clog2( x ); }
+        // long double _Complex log2( long double _Complex x ) { return clog2l( x ); }
+        float log10( float x ) { return log10f( x ); }
+        // extern "C" { double log10( double ); }
+        long double log10( long double x ) { return log10l( x ); }
+        // float _Complex log10( float _Complex x ) { return clog10f( x ); }
+        // double _Complex log10( double _Complex x ) { return clog10( x ); }
+        // long double _Complex log10( long double _Complex x ) { return clog10l( x ); }
+        float log1p( float x ) { return log1pf( x ); }
+        // extern "C" { double log1p( double ); }
+        long double log1p( long double x ) { return log1pl( x ); }
+        int ilogb( float x ) { return ilogbf( x ); } // ilogb overloads return int for every argument type
+        // extern "C" { int ilogb( double ); }
+        int ilogb( long double x ) { return ilogbl( x ); }
+        float logb( float x ) { return logbf( x ); }
+        // extern "C" { double logb( double ); }
+        long double logb( long double x ) { return logbl( x ); }
+        float sqrt( float x ) { return sqrtf( x ); }
+        // extern "C" { double sqrt( double ); }
+        long double sqrt( long double x ) { return sqrtl( x ); }
+        float _Complex sqrt( float _Complex x ) { return csqrtf( x ); }
+        double _Complex sqrt( double _Complex x ) { return csqrt( x ); }
+        long double _Complex sqrt( long double _Complex x ) { return csqrtl( x ); }
+        float cbrt( float x ) { return cbrtf( x ); }
+        // extern "C" { double cbrt( double ); }
+        long double cbrt( long double x ) { return cbrtl( x ); }
+        float hypot( float x, float y ) { return hypotf( x, y ); }
+        // extern "C" { double hypot( double, double ); }
+        long double hypot( long double x, long double y ) { return hypotl( x, y ); }
+} // distribution
 //---------------------- Trigonometric ----------------------
+static inline float sin( float x ) { return sinf( x ); } // Trigonometric: type-overloaded wrappers; real overloads forward to the f/l-suffixed C routines
+// extern "C" { double sin( double ); } -- NOTE(review): no double wrapper is defined here; presumably the C declaration is used directly, confirm
+static inline long double sin( long double x ) { return sinl( x ); }
+static inline float _Complex sin( float _Complex x ) { return csinf( x ); } // complex overloads forward to the c-prefixed routines
+static inline double _Complex sin( double _Complex x ) { return csin( x ); }
+static inline long double _Complex sin( long double _Complex x ) { return csinl( x ); }
+static inline float cos( float x ) { return cosf( x ); }
+// extern "C" { double cos( double ); }
+static inline long double cos( long double x ) { return cosl( x ); }
+static inline float _Complex cos( float _Complex x ) { return ccosf( x ); }
+static inline double _Complex cos( double _Complex x ) { return ccos( x ); }
+static inline long double _Complex cos( long double _Complex x ) { return ccosl( x ); }
+static inline float tan( float x ) { return tanf( x ); }
+// extern "C" { double tan( double ); }
+static inline long double tan( long double x ) { return tanl( x ); }
+static inline float _Complex tan( float _Complex x ) { return ctanf( x ); }
+static inline double _Complex tan( double _Complex x ) { return ctan( x ); }
+static inline long double _Complex tan( long double _Complex x ) { return ctanl( x ); }
+static inline float asin( float x ) { return asinf( x ); }
+// extern "C" { double asin( double ); }
+static inline long double asin( long double x ) { return asinl( x ); }
+static inline float _Complex asin( float _Complex x ) { return casinf( x ); }
+static inline double _Complex asin( double _Complex x ) { return casin( x ); }
+static inline long double _Complex asin( long double _Complex x ) { return casinl( x ); }
+static inline float acos( float x ) { return acosf( x ); }
+// extern "C" { double acos( double ); }
+static inline long double acos( long double x ) { return acosl( x ); }
+static inline float _Complex acos( float _Complex x ) { return cacosf( x ); }
+static inline double _Complex acos( double _Complex x ) { return cacos( x ); }
+static inline long double _Complex acos( long double _Complex x ) { return cacosl( x ); }
+static inline float atan( float x ) { return atanf( x ); }
+// extern "C" { double atan( double ); }
+static inline long double atan( long double x ) { return atanl( x ); }
+static inline float _Complex atan( float _Complex x ) { return catanf( x ); }
+static inline double _Complex atan( double _Complex x ) { return catan( x ); }
+static inline long double _Complex atan( long double _Complex x ) { return catanl( x ); }
+static inline float atan2( float x, float y ) { return atan2f( x, y ); } // arguments are forwarded in declaration order: first parameter becomes atan2f's first argument
+// extern "C" { double atan2( double, double ); }
+static inline long double atan2( long double x, long double y ) { return atan2l( x, y ); }
+// alternative name for atan2: the two-argument atan overloads below forward to the atan2 family
+static inline float atan( float x, float y ) { return atan2f( x, y ); }
+static inline double atan( double x, double y ) { return atan2( x, y ); }
+static inline long double atan( long double x, long double y ) { return atan2l( x, y ); }
+static inline { // Trigonometric: every routine in this block is static inline; real overloads forward to the f/l-suffixed C routines
+        float sin( float x ) { return sinf( x ); }
+        // extern "C" { double sin( double ); } -- NOTE(review): no double wrapper is defined here; presumably the C declaration is used directly, confirm
+        long double sin( long double x ) { return sinl( x ); }
+        float _Complex sin( float _Complex x ) { return csinf( x ); } // complex overloads forward to the c-prefixed routines
+        double _Complex sin( double _Complex x ) { return csin( x ); }
+        long double _Complex sin( long double _Complex x ) { return csinl( x ); }
+        float cos( float x ) { return cosf( x ); }
+        // extern "C" { double cos( double ); }
+        long double cos( long double x ) { return cosl( x ); }
+        float _Complex cos( float _Complex x ) { return ccosf( x ); }
+        double _Complex cos( double _Complex x ) { return ccos( x ); }
+        long double _Complex cos( long double _Complex x ) { return ccosl( x ); }
+        float tan( float x ) { return tanf( x ); }
+        // extern "C" { double tan( double ); }
+        long double tan( long double x ) { return tanl( x ); }
+        float _Complex tan( float _Complex x ) { return ctanf( x ); }
+        double _Complex tan( double _Complex x ) { return ctan( x ); }
+        long double _Complex tan( long double _Complex x ) { return ctanl( x ); }
+        float asin( float x ) { return asinf( x ); }
+        // extern "C" { double asin( double ); }
+        long double asin( long double x ) { return asinl( x ); }
+        float _Complex asin( float _Complex x ) { return casinf( x ); }
+        double _Complex asin( double _Complex x ) { return casin( x ); }
+        long double _Complex asin( long double _Complex x ) { return casinl( x ); }
+        float acos( float x ) { return acosf( x ); }
+        // extern "C" { double acos( double ); }
+        long double acos( long double x ) { return acosl( x ); }
+        float _Complex acos( float _Complex x ) { return cacosf( x ); }
+        double _Complex acos( double _Complex x ) { return cacos( x ); }
+        long double _Complex acos( long double _Complex x ) { return cacosl( x ); }
+        float atan( float x ) { return atanf( x ); }
+        // extern "C" { double atan( double ); }
+        long double atan( long double x ) { return atanl( x ); }
+        float _Complex atan( float _Complex x ) { return catanf( x ); }
+        double _Complex atan( double _Complex x ) { return catan( x ); }
+        long double _Complex atan( long double _Complex x ) { return catanl( x ); }
+        float atan2( float x, float y ) { return atan2f( x, y ); } // arguments are forwarded in declaration order: first parameter becomes atan2f's first argument
+        // extern "C" { double atan2( double, double ); }
+        long double atan2( long double x, long double y ) { return atan2l( x, y ); }
+        // alternative name for atan2: the two-argument atan overloads below forward to the atan2 family
+        float atan( float x, float y ) { return atan2f( x, y ); }
+        double atan( double x, double y ) { return atan2( x, y ); }
+        long double atan( long double x, long double y ) { return atan2l( x, y ); }
+} // distribution
 //---------------------- Hyperbolic ----------------------
+static inline float sinh( float x ) { return sinhf( x ); } // Hyperbolic: type-overloaded wrappers; real overloads forward to the f/l-suffixed C routines
+// extern "C" { double sinh( double ); } -- NOTE(review): no double wrapper is defined here; presumably the C declaration is used directly, confirm
+static inline long double sinh( long double x ) { return sinhl( x ); }
+static inline float _Complex sinh( float _Complex x ) { return csinhf( x ); } // complex overloads forward to the c-prefixed routines
+static inline double _Complex sinh( double _Complex x ) { return csinh( x ); }
+static inline long double _Complex sinh( long double _Complex x ) { return csinhl( x ); }
+static inline float cosh( float x ) { return coshf( x ); }
+// extern "C" { double cosh( double ); }
+static inline long double cosh( long double x ) { return coshl( x ); }
+static inline float _Complex cosh( float _Complex x ) { return ccoshf( x ); }
+static inline double _Complex cosh( double _Complex x ) { return ccosh( x ); }
+static inline long double _Complex cosh( long double _Complex x ) { return ccoshl( x ); }
+static inline float tanh( float x ) { return tanhf( x ); }
+// extern "C" { double tanh( double ); }
+static inline long double tanh( long double x ) { return tanhl( x ); }
+static inline float _Complex tanh( float _Complex x ) { return ctanhf( x ); }
+static inline double _Complex tanh( double _Complex x ) { return ctanh( x ); }
+static inline long double _Complex tanh( long double _Complex x ) { return ctanhl( x ); }
+static inline float asinh( float x ) { return asinhf( x ); }
+// extern "C" { double asinh( double ); }
+static inline long double asinh( long double x ) { return asinhl( x ); }
+static inline float _Complex asinh( float _Complex x ) { return casinhf( x ); }
+static inline double _Complex asinh( double _Complex x ) { return casinh( x ); }
+static inline long double _Complex asinh( long double _Complex x ) { return casinhl( x ); }
+static inline float acosh( float x ) { return acoshf( x ); }
+// extern "C" { double acosh( double ); }
+static inline long double acosh( long double x ) { return acoshl( x ); }
+static inline float _Complex acosh( float _Complex x ) { return cacoshf( x ); }
+static inline double _Complex acosh( double _Complex x ) { return cacosh( x ); }
+static inline long double _Complex acosh( long double _Complex x ) { return cacoshl( x ); }
+static inline float atanh( float x ) { return atanhf( x ); }
+// extern "C" { double atanh( double ); }
+static inline long double atanh( long double x ) { return atanhl( x ); }
+static inline float _Complex atanh( float _Complex x ) { return catanhf( x ); }
+static inline double _Complex atanh( double _Complex x ) { return catanh( x ); }
+static inline long double _Complex atanh( long double _Complex x ) { return catanhl( x ); }
+static inline { // Hyperbolic: every routine in this block is static inline; real overloads forward to the f/l-suffixed C routines
+        float sinh( float x ) { return sinhf( x ); }
+        // extern "C" { double sinh( double ); } -- NOTE(review): no double wrapper is defined here; presumably the C declaration is used directly, confirm
+        long double sinh( long double x ) { return sinhl( x ); }
+        float _Complex sinh( float _Complex x ) { return csinhf( x ); } // complex overloads forward to the c-prefixed routines
+        double _Complex sinh( double _Complex x ) { return csinh( x ); }
+        long double _Complex sinh( long double _Complex x ) { return csinhl( x ); }
+        float cosh( float x ) { return coshf( x ); }
+        // extern "C" { double cosh( double ); }
+        long double cosh( long double x ) { return coshl( x ); }
+        float _Complex cosh( float _Complex x ) { return ccoshf( x ); }
+        double _Complex cosh( double _Complex x ) { return ccosh( x ); }
+        long double _Complex cosh( long double _Complex x ) { return ccoshl( x ); }
+        float tanh( float x ) { return tanhf( x ); }
+        // extern "C" { double tanh( double ); }
+        long double tanh( long double x ) { return tanhl( x ); }
+        float _Complex tanh( float _Complex x ) { return ctanhf( x ); }
+        double _Complex tanh( double _Complex x ) { return ctanh( x ); }
+        long double _Complex tanh( long double _Complex x ) { return ctanhl( x ); }
+        float asinh( float x ) { return asinhf( x ); }
+        // extern "C" { double asinh( double ); }
+        long double asinh( long double x ) { return asinhl( x ); }
+        float _Complex asinh( float _Complex x ) { return casinhf( x ); }
+        double _Complex asinh( double _Complex x ) { return casinh( x ); }
+        long double _Complex asinh( long double _Complex x ) { return casinhl( x ); }
+        float acosh( float x ) { return acoshf( x ); }
+        // extern "C" { double acosh( double ); }
+        long double acosh( long double x ) { return acoshl( x ); }
+        float _Complex acosh( float _Complex x ) { return cacoshf( x ); }
+        double _Complex acosh( double _Complex x ) { return cacosh( x ); }
+        long double _Complex acosh( long double _Complex x ) { return cacoshl( x ); }
+        float atanh( float x ) { return atanhf( x ); }
+        // extern "C" { double atanh( double ); }
+        long double atanh( long double x ) { return atanhl( x ); }
+        float _Complex atanh( float _Complex x ) { return catanhf( x ); }
+        double _Complex atanh( double _Complex x ) { return catanh( x ); }
+        long double _Complex atanh( long double _Complex x ) { return catanhl( x ); }
+} // distribution
 //---------------------- Error / Gamma ----------------------
+static inline float erf( float x ) { return erff( x ); } // Error / Gamma: type-overloaded wrappers forwarding to the f/l-suffixed C routines
+// extern "C" { double erf( double ); } -- NOTE(review): no double wrapper is defined here; presumably the C declaration is used directly, confirm
+static inline long double erf( long double x ) { return erfl( x ); }
+// float _Complex erf( float _Complex );
+// double _Complex erf( double _Complex );
+// long double _Complex erf( long double _Complex );
+static inline float erfc( float x ) { return erfcf( x ); }
+// extern "C" { double erfc( double ); }
+static inline long double erfc( long double x ) { return erfcl( x ); }
+// float _Complex erfc( float _Complex );
+// double _Complex erfc( double _Complex );
+// long double _Complex erfc( long double _Complex );
+static inline float lgamma( float x ) { return lgammaf( x ); }
+// extern "C" { double lgamma( double ); }
+static inline long double lgamma( long double x ) { return lgammal( x ); }
+static inline float lgamma( float x, int * sign ) { return lgammaf_r( x, sign ); } // two-argument form forwards to the _r variant, passing sign through
+static inline double lgamma( double x, int * sign ) { return lgamma_r( x, sign ); }
+static inline long double lgamma( long double x, int * sign ) { return lgammal_r( x, sign ); }
+static inline float tgamma( float x ) { return tgammaf( x ); }
+// extern "C" { double tgamma( double ); }
+static inline long double tgamma( long double x ) { return tgammal( x ); }
+static inline { // Error / Gamma: every routine in this block is static inline and forwards to the f/l-suffixed C routine
+        float erf( float x ) { return erff( x ); }
+        // extern "C" { double erf( double ); } -- NOTE(review): no double wrapper is defined here; presumably the C declaration is used directly, confirm
+        long double erf( long double x ) { return erfl( x ); }
+        // float _Complex erf( float _Complex );
+        // double _Complex erf( double _Complex );
+        // long double _Complex erf( long double _Complex );
+        float erfc( float x ) { return erfcf( x ); }
+        // extern "C" { double erfc( double ); }
+        long double erfc( long double x ) { return erfcl( x ); }
+        // float _Complex erfc( float _Complex );
+        // double _Complex erfc( double _Complex );
+        // long double _Complex erfc( long double _Complex );
+        float lgamma( float x ) { return lgammaf( x ); }
+        // extern "C" { double lgamma( double ); }
+        long double lgamma( long double x ) { return lgammal( x ); }
+        float lgamma( float x, int * sign ) { return lgammaf_r( x, sign ); } // two-argument form forwards to the _r variant, passing sign through
+        double lgamma( double x, int * sign ) { return lgamma_r( x, sign ); }
+        long double lgamma( long double x, int * sign ) { return lgammal_r( x, sign ); }
+        float tgamma( float x ) { return tgammaf( x ); }
+        // extern "C" { double tgamma( double ); }
+        long double tgamma( long double x ) { return tgammal( x ); }
+} // distribution
 //---------------------- Nearest Integer ----------------------
+static inline float floor( float x ) { return floorf( x ); } // Nearest Integer: type-overloaded wrappers forwarding to the f/l-suffixed C routines
+// extern "C" { double floor( double ); } -- NOTE(review): no double wrapper is defined here; presumably the C declaration is used directly, confirm
+static inline long double floor( long double x ) { return floorl( x ); }
+static inline float ceil( float x ) { return ceilf( x ); }
+// extern "C" { double ceil( double ); }
+static inline long double ceil( long double x ) { return ceill( x ); }
+static inline float trunc( float x ) { return truncf( x ); }
+// extern "C" { double trunc( double ); }
+static inline long double trunc( long double x ) { return truncl( x ); }
+static inline float rint( float x ) { return rintf( x ); }
+// extern "C" { double rint( double x ); }
+static inline long double rint( long double x ) { return rintl( x ); }
+static inline long int rint( float x ) { return lrintf( x ); } // same name, long int return type: forwards to the lrint family
+static inline long int rint( double x ) { return lrint( x ); }
+static inline long int rint( long double x ) { return lrintl( x ); }
+static inline long long int rint( float x ) { return llrintf( x ); } // same name, long long int return type: forwards to the llrint family
+static inline long long int rint( double x ) { return llrint( x ); }
+static inline long long int rint( long double x ) { return llrintl( x ); }
+static inline long int lrint( float x ) { return lrintf( x ); }
+// extern "C" { long int lrint( double ); }
+static inline long int lrint( long double x ) { return lrintl( x ); }
+static inline long long int llrint( float x ) { return llrintf( x ); }
+// extern "C" { long long int llrint( double ); }
+static inline long long int llrint( long double x ) { return llrintl( x ); }
+static inline float nearbyint( float x ) { return nearbyintf( x ); }
+// extern "C" { double nearbyint( double ); }
+static inline long double nearbyint( long double x ) { return nearbyintl( x ); }
+static inline float round( float x ) { return roundf( x ); }
+// extern "C" { double round( double x ); }
+static inline long double round( long double x ) { return roundl( x ); }
+static inline long int round( float x ) { return lroundf( x ); } // same name, long int return type: forwards to the lround family
+static inline long int round( double x ) { return lround( x ); }
+static inline long int round( long double x ) { return lroundl( x ); }
+static inline long long int round( float x ) { return llroundf( x ); } // same name, long long int return type: forwards to the llround family
+static inline long long int round( double x ) { return llround( x ); }
+static inline long long int round( long double x ) { return llroundl( x ); }
+static inline long int lround( float x ) { return lroundf( x ); }
+// extern "C" { long int lround( double ); }
+static inline long int lround( long double x ) { return lroundl( x ); }
+static inline long long int llround( float x ) { return llroundf( x ); }
+// extern "C" { long long int llround( double ); }
+static inline long long int llround( long double x ) { return llroundl( x ); }
+static inline {
+        signed char floor( signed char n, signed char align ) { return n / align * align; } // multiple of align obtained by dividing then multiplying back; assuming C's truncating integer division this rounds toward zero, i.e. down only when n >= 0; align == 0 is not checked
+        unsigned char floor( unsigned char n, unsigned char align ) { return n / align * align; }
+        short int floor( short int n, short int align ) { return n / align * align; }
+        unsigned short int floor( unsigned short int n, unsigned short int align ) { return n / align * align; }
+        int floor( int n, int align ) { return n / align * align; }
+        unsigned int floor( unsigned int n, unsigned int align ) { return n / align * align; }
+        long int floor( long int n, long int align ) { return n / align * align; }
+        unsigned long int floor( unsigned long int n, unsigned long int align ) { return n / align * align; }
+        long long int floor( long long int n, long long int align ) { return n / align * align; }
+        unsigned long long int floor( unsigned long long int n, unsigned long long int align ) { return n / align * align; } // same expression repeated for every integer width and signedness
+        // forall( otype T | { T ?/?( T, T ); T ?*?( T, T ); } )
+        // T floor( T n, T align ) { return n / align * align; }
+        signed char ceiling_div( signed char n, signed char align ) { return (n + (align - 1)) / align; } // quotient n / align rounded up, valid for n >= 0 and align > 0; both parameters now share one type like every other overload; align == 0 is not checked
+        unsigned char ceiling_div( unsigned char n, unsigned char align ) { return (n + (align - 1)) / align; }
+        short int ceiling_div( short int n, short int align ) { return (n + (align - 1)) / align; }
+        unsigned short int ceiling_div( unsigned short int n, unsigned short int align ) { return (n + (align - 1)) / align; }
+        int ceiling_div( int n, int align ) { return (n + (align - 1)) / align; } // n + (align - 1) is computed in the operand type, so values near the type's maximum can overflow
+        unsigned int ceiling_div( unsigned int n, unsigned int align ) { return (n + (align - 1)) / align; }
+        long int ceiling_div( long int n, long int align ) { return (n + (align - 1)) / align; }
+        unsigned long int ceiling_div( unsigned long int n, unsigned long int align ) { return (n + (align - 1)) / align; }
+        long long int ceiling_div( long long int n, long long int align ) { return (n + (align - 1)) / align; }
+        unsigned long long int ceiling_div( unsigned long long int n, unsigned long long int align ) { return (n + (align - 1)) / align; }
+        // forall( otype T | { T ?+?( T, T ); T ?-?( T, T ); T ?%?( T, T ); } )
+        // T ceiling_div( T n, T align ) { verify( is_pow2( align ) );return (n + (align - 1)) / align; }
+        // gcc notices the div/mod pair and saves both so only one div.
+        signed char ceiling( signed char n, signed char align ) { return floor( n + (n % align != 0 ? align - 1 : 0), align ); } // round n up to a multiple of align: add align - 1 only when n is not already a multiple, then reduce with the two-argument floor above; align == 0 is not checked
+        unsigned char ceiling( unsigned char n, unsigned char align ) { return floor( n + (n % align != 0 ? align - 1 : 0), align ); }
+        short int ceiling( short int n, short int align ) { return floor( n + (n % align != 0 ? align - 1 : 0), align ); }
+        unsigned short int ceiling( unsigned short int n, unsigned short int align ) { return floor( n + (n % align != 0 ? align - 1 : 0), align ); }
+        int ceiling( int n, int align ) { return floor( n + (n % align != 0 ? align - 1 : 0), align ); }
+        unsigned int ceiling( unsigned int n, unsigned int align ) { return floor( n + (n % align != 0 ? align - 1 : 0), align ); }
+        long int ceiling( long int n, long int align ) { return floor( n + (n % align != 0 ? align - 1 : 0), align ); }
+        unsigned long int ceiling( unsigned long int n, unsigned long int align ) { return floor( n + (n % align != 0 ? align - 1 : 0) , align); }
+        long long int ceiling( long long int n, long long int align ) { return floor( n + (n % align != 0 ? align - 1 : 0), align ); }
+        unsigned long long int ceiling( unsigned long long int n, unsigned long long int align ) { return floor( n + (n % align != 0 ? align - 1 : 0), align ); } // the % here and the / inside floor are the div/mod pair the comment above refers to
+        // forall( otype T | { void ?{}( T &, one_t ); T ?+?( T, T ); T ?-?( T, T ); T ?/?( T, T ); } )
+        // T ceiling( T n, T align ) { return floor( n + (n % align != 0 ? align - 1 : 0), align ); }
+        float floor( float x ) { return floorf( x ); } // float overload forwards to floorf; the long double overload below forwards to floorl
+        // extern "C" { double floor( double ); }
+        long double floor( long double x ) { return floorl( x ); }
+        float ceil( float x ) { return ceilf( x ); }
+        // extern "C" { double ceil( double ); }
+        long double ceil( long double x ) { return ceill( x ); }
+        float trunc( float x ) { return truncf( x ); }
+        // extern "C" { double trunc( double ); }
+        long double trunc( long double x ) { return truncl( x ); }
+        float rint( float x ) { return rintf( x ); } // floating-point result: forwards to the rint* family
+        // extern "C" { double rint( double x ); }
+        long double rint( long double x ) { return rintl( x ); }
+        long int rint( float x ) { return lrintf( x ); } // same name and parameter type as the float-returning rint, differing only in return type: long int result forwards to the lrint* family
+        long int rint( double x ) { return lrint( x ); }
+        long int rint( long double x ) { return lrintl( x ); }
+        long long int rint( float x ) { return llrintf( x ); } // long long int result forwards to the llrint* family
+        long long int rint( double x ) { return llrint( x ); }
+        long long int rint( long double x ) { return llrintl( x ); }
+        long int lrint( float x ) { return lrintf( x ); }
+        // extern "C" { long int lrint( double ); }
+        long int lrint( long double x ) { return lrintl( x ); }
+        long long int llrint( float x ) { return llrintf( x ); }
+        // extern "C" { long long int llrint( double ); }
+        long long int llrint( long double x ) { return llrintl( x ); }
+        float nearbyint( float x ) { return nearbyintf( x ); }
+        // extern "C" { double nearbyint( double ); }
+        long double nearbyint( long double x ) { return nearbyintl( x ); }
+        float round( float x ) { return roundf( x ); } // floating-point result: forwards to the round* family
+        // extern "C" { double round( double x ); }
+        long double round( long double x ) { return roundl( x ); }
+        long int round( float x ) { return lroundf( x ); } // differs from the float-returning round only in return type: long int result forwards to the lround* family
+        long int round( double x ) { return lround( x ); }
+        long int round( long double x ) { return lroundl( x ); }
+        long long int round( float x ) { return llroundf( x ); } // long long int result forwards to the llround* family
+        long long int round( double x ) { return llround( x ); }
+        long long int round( long double x ) { return llroundl( x ); }
+        long int lround( float x ) { return lroundf( x ); }
+        // extern "C" { long int lround( double ); }
+        long int lround( long double x ) { return lroundl( x ); }
+        long long int llround( float x ) { return llroundf( x ); }
+        // extern "C" { long long int llround( double ); }
+        long long int llround( long double x ) { return llroundl( x ); }
+} // distribution
 //---------------------- Manipulation ----------------------
+static inline float copysign( float x, float y ) { return copysignf( x, y ); }
+// extern "C" { double copysign( double, double ); }
+static inline long double copysign( long double x, long double y ) { return copysignl( x, y ); }
+static inline float frexp( float x, int * ip ) { return frexpf( x, ip ); }
+// extern "C" { double frexp( double, int * ); }
+static inline long double frexp( long double x, int * ip ) { return frexpl( x, ip ); }
+static inline float ldexp( float x, int exp2 ) { return ldexpf( x, exp2 ); }
+// extern "C" { double ldexp( double, int ); }
+static inline long double ldexp( long double x, int exp2 ) { return ldexpl( x, exp2 ); }
+static inline [ float, float ] modf( float x ) { float i; x = modff( x, &i ); return [ i, x ]; }
+static inline float modf( float x, float * i ) { return modff( x, i ); }
+static inline [ double, double ] modf( double x ) { double i; x = modf( x, &i ); return [ i, x ]; }
+// extern "C" { double modf( double, double * ); }
+static inline [ long double, long double ] modf( long double x ) { long double i; x = modfl( x, &i ); return [ i, x ]; }
+static inline long double modf( long double x, long double * i ) { return modfl( x, i ); }
+static inline float nextafter( float x, float y ) { return nextafterf( x, y ); }
+// extern "C" { double nextafter( double, double ); }
+static inline long double nextafter( long double x, long double y ) { return nextafterl( x, y ); }
+static inline float nexttoward( float x, long double y ) { return nexttowardf( x, y ); }
+// extern "C" { double nexttoward( double, long double ); }
+static inline long double nexttoward( long double x, long double y ) { return nexttowardl( x, y ); }
+static inline float scalbn( float x, int exp ) { return scalbnf( x, exp ); }
+// extern "C" { double scalbn( double, int ); }
+static inline long double scalbn( long double x, int exp ) { return scalbnl( x, exp ); }
+static inline float scalbn( float x, long int exp ) { return scalblnf( x, exp ); }
+static inline double scalbn( double x, long int exp ) { return scalbln( x, exp ); }
+static inline long double scalbn( long double x, long int exp ) { return scalblnl( x, exp ); }
+static inline float scalbln( float x, long int exp ) { return scalblnf( x, exp ); }
+// extern "C" { double scalbln( double, long int ); }
+static inline long double scalbln( long double x, long int exp ) { return scalblnl( x, exp ); }
+static inline {
+        float copysign( float x, float y ) { return copysignf( x, y ); } // each float / long double overload in this block forwards to the f- / l-suffixed C routine of the same name
+        // extern "C" { double copysign( double, double ); }
+        long double copysign( long double x, long double y ) { return copysignl( x, y ); }
+        float frexp( float x, int * ip ) { return frexpf( x, ip ); }
+        // extern "C" { double frexp( double, int * ); }
+        long double frexp( long double x, int * ip ) { return frexpl( x, ip ); }
+        float ldexp( float x, int exp2 ) { return ldexpf( x, exp2 ); }
+        // extern "C" { double ldexp( double, int ); }
+        long double ldexp( long double x, int exp2 ) { return ldexpl( x, exp2 ); }
+        [ float, float ] modf( float x ) { float i; x = modff( x, &i ); return [ i, x ]; } // tuple form: first element is the value modff stored through &i, second is modff's return value (integral part, then fractional part per the C library contract)
+        float modf( float x, float * i ) { return modff( x, i ); }
+        [ double, double ] modf( double x ) { double i; x = modf( x, &i ); return [ i, x ]; } // tuple form built on the two-argument modf
+        // extern "C" { double modf( double, double * ); }
+        [ long double, long double ] modf( long double x ) { long double i; x = modfl( x, &i ); return [ i, x ]; }
+        long double modf( long double x, long double * i ) { return modfl( x, i ); }
+        float nextafter( float x, float y ) { return nextafterf( x, y ); }
+        // extern "C" { double nextafter( double, double ); }
+        long double nextafter( long double x, long double y ) { return nextafterl( x, y ); }
+        float nexttoward( float x, long double y ) { return nexttowardf( x, y ); } // second parameter is long double in every nexttoward overload
+        // extern "C" { double nexttoward( double, long double ); }
+        long double nexttoward( long double x, long double y ) { return nexttowardl( x, y ); }
+        float scalbn( float x, int exp ) { return scalbnf( x, exp ); }
+        // extern "C" { double scalbn( double, int ); }
+        long double scalbn( long double x, int exp ) { return scalbnl( x, exp ); }
+        float scalbn( float x, long int exp ) { return scalblnf( x, exp ); } // scalbn with a long int exponent forwards to the scalbln* family
+        double scalbn( double x, long int exp ) { return scalbln( x, exp ); }
+        long double scalbn( long double x, long int exp ) { return scalblnl( x, exp ); }
+        float scalbln( float x, long int exp ) { return scalblnf( x, exp ); }
+        // extern "C" { double scalbln( double, long int ); }
+        long double scalbln( long double x, long int exp ) { return scalblnl( x, exp ); }
+} // distribution
 //---------------------------------------
+#include "common.hfa"
+//---------------------------------------
+forall( otype T | { void ?{}( T &, one_t ); T ?+?( T, T ); T ?-?( T, T );T ?*?( T, T ); } )
+T lerp( T x, T y, T a ) { return x * ((T){1} - a) + y * a; }
+forall( otype T | { void ?{}( T &, zero_t ); void ?{}( T &, one_t ); int ?<?( T, T ); } )
+T step( T edge, T x ) { return x < edge ? (T){0} : (T){1}; }
+forall( otype T | { void ?{}( T &, int ); T clamp( T, T, T ); T ?-?( T, T ); T ?*?( T, T ); T ?/?( T, T ); } )
+T smoothstep( T edge0, T edge1, T x ) { T t = clamp( (x - edge0) / (edge1 - edge0), (T){0}, (T){1} ); return t * t * ((T){3} - (T){2} * t); }
+static inline {
+        forall( otype T | { void ?{}( T &, one_t ); T ?+?( T, T ); T ?-?( T, T );T ?*?( T, T ); } )
+        T lerp( T x, T y, T a ) { return x * ((T){1} - a) + y * a; } // blend of x and y: x weighted by (1 - a) plus y weighted by a
+        forall( otype T | { void ?{}( T &, zero_t ); void ?{}( T &, one_t ); int ?<?( T, T ); } )
+        T step( T edge, T x ) { return x < edge ? (T){0} : (T){1}; } // a T constructed from 0 when x < edge, otherwise a T constructed from 1
+        forall( otype T | { void ?{}( T &, int ); T clamp( T, T, T ); T ?-?( T, T ); T ?*?( T, T ); T ?/?( T, T ); } )
+        T smoothstep( T edge0, T edge1, T x ) { T t = clamp( (x - edge0) / (edge1 - edge0), (T){0}, (T){1} ); return t * t * ((T){3} - (T){2} * t); } // t is (x - edge0) / (edge1 - edge0) passed through clamp with bounds 0 and 1 (presumably limiting it to [0, 1] -- confirm clamp's argument order); result is t * t * (3 - 2 * t); edge0 == edge1 is not guarded
+} // distribution
 // Local Variables: //

libcfa/src/startup.cfa

-              r3c64c668
+              r58fe85a
 //
+#include <time.h>                                                                               // tzset
+#include <time.h>                // tzset
+#include <locale.h>        // setlocale
 #include "startup.hfa"
 …
     void __cfaabi_appready_startup( void ) {
                 tzset();                                                                                // initialize time global variables
+                setlocale(LC_NUMERIC, "");
                 #ifdef __CFA_DEBUG__
                 extern void heapAppStart();
 …
 struct __spinlock_t;
 extern "C" {
         void __cfaabi_dbg_record(struct __spinlock_t & this, const char prev_name[]) __attribute__(( weak )) {}
+        void __cfaabi_dbg_record_lock(struct __spinlock_t & this, const char prev_name[]) __attribute__(( weak )) {}
+}

libcfa/src/stdhdr/assert.h

-              r3c64c668
+              r58fe85a
         #define verify(x) assert(x)
         #define verifyf(x, ...) assertf(x, __VA_ARGS__)
+        #define verifyfail(...)
         #define __CFA_WITH_VERIFY__
 #else
         #define verify(x)
         #define verifyf(x, ...)
+        #define verifyfail(...)
 #endif

libcfa/src/stdhdr/malloc.h

-              r3c64c668
+              r58fe85a
 // Created On       : Thu Jul 20 15:58:16 2017
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Sat Aug 11 09:06:31 2018
 // Update Count     : 10
+// Last Modified On : Wed May 27 14:13:14 2020
+// Update Count     : 18
 //
-size_t default_mmap_start();                                                    // CFA extras
-size_t default_heap_expansion();
-bool traceHeap();
-bool traceHeapOn();
-bool traceHeapOff();
-bool traceHeapTerm();
-bool traceHeapTermOn();
-bool traceHeapTermOff();
-bool checkFree();
-bool checkFreeOn();
-bool checkFreeOff();
-extern "C" {
-size_t malloc_alignment( void * );
-bool malloc_zero_fill( void * );
-int malloc_stats_fd( int fd );
-void * cmemalign( size_t alignment, size_t noOfElems, size_t elemSize );
-} // extern "C"
 extern "C" {
 #include_next <malloc.h>                                                                // has internal check for multiple expansion
 } // extern "C"
+#include <heap.hfa>
 // Local Variables: //

libcfa/src/stdlib.cfa

-              r3c64c668
+              r58fe85a
 // Created On       : Thu Jan 28 17:10:29 2016
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Tue Feb  4 08:27:08 2020
 // Update Count     : 486
+// Last Modified On : Thu Nov 12 07:46:09 2020
+// Update Count     : 503
 //
 …
 #define _XOPEN_SOURCE 600                                                               // posix_memalign, *rand48
 #include <string.h>                                                                             // memcpy, memset
-#include <malloc.h>                                                                             // malloc_usable_size
 //#include <math.h>                                                                             // fabsf, fabs, fabsl
 #include <complex.h>                                                                    // _Complex_I
 …
 //---------------------------------------
+forall( dtype T | sized(T) ) {
+        T * alloc_set( T ptr[], size_t dim, char fill ) {       // realloc ptr to dim elements and set any storage gained to the fill byte
+                size_t olen = malloc_usable_size( ptr );                // usable size of the current allocation, read before realloc
+                void * nptr = (void *)realloc( (void *)ptr, dim * sizeof(T) ); // C realloc; dim * sizeof(T) is not checked for overflow
+                size_t nlen = malloc_usable_size( nptr );               // usable size of the new allocation
+                if ( nlen > olen ) {                                                    // grew ? (compared by usable size, not by requested size)
+                        memset( (char *)nptr + olen, (int)fill, nlen - olen ); // fill starts at the old usable size and runs to the new usable size
+                } // if
+                return (T *)nptr;
+        } // alloc_set
+        T * alloc_align_set( T ptr[], size_t align, char fill ) { // aligned realloc with fill; NOTE(review): no dim parameter here, while the stdlib.hfa declaration in this changeset takes ( ptr, align, dim, fill ) -- confirm the intended signature
+                size_t olen = malloc_usable_size( ptr );                // usable size of the current allocation, read before realloc
+                void * nptr = (void *)realloc( (void *)ptr, align, sizeof(T) ); // CFA realloc (three-argument form); requests storage for a single T
+                // char * nptr = alloc_align( ptr, align );
+                size_t nlen = malloc_usable_size( nptr );               // usable size of the new allocation
+                if ( nlen > olen ) {                                                    // grew ? (compared by usable size)
+                        memset( (char *)nptr + olen, (int)fill, nlen - olen ); // fill only the storage beyond the old usable size
+                } // if
+                return (T *)nptr;
+        } // alloc_align_set
+} // distribution
+// allocation/deallocation and constructor/destructor, non-array types
+forall( dtype T | sized(T), ttype Params | { void ?{}( T &, Params ); } )
+T * new( Params p ) {
+        return &(*malloc()){ p };                                                       // allocate one T with malloc, run its constructor on that storage with arguments p, return its address; NOTE(review): the malloc result is not checked before construction
+} // new
+forall( dtype T | sized(T) | { void ^?{}( T & ); } )
+void delete( T * ptr ) {
+        if ( ! ptr ) return;                                                            // null pointer: nothing to destroy or release
+        ^(*ptr){};                                                                              // destroy the object first
+        free( ptr );                                                                            // then give its storage back
+} // delete
+forall( dtype T, ttype Params | sized(T) | { void ^?{}( T & ); void delete( Params ); } )
+void delete( T * ptr, Params rest ) {
+        if ( ptr ) {                                                                            // ignore null, only for this first pointer
+                ^(*ptr){};                                                                              // run destructor before the storage is released
+                free( ptr );
+        } // if
+        delete( rest ); // handle the remaining arguments; reached whether or not ptr was null
+} // delete
+// allocation/deallocation and constructor/destructor, array types
+forall( dtype T | sized(T), ttype Params | { void ?{}( T &, Params ); } )
+T * anew( size_t dim, Params p ) {
+// Cforall allocation/deallocation and constructor/destructor, array types
+forall( dtype T | sized(T), ttype TT | { void ?{}( T &, TT ); } )
+T * anew( size_t dim, TT p ) {
         T * arr = alloc( dim );
         for ( unsigned int i = 0; i < dim; i += 1 ) {
 …
 forall( dtype T | sized(T) | { void ^?{}( T & ); } )
 void adelete( size_t dim, T arr[] ) {
+void adelete( T arr[] ) {
         if ( arr ) {                                                                            // ignore null
+                size_t dim = malloc_size( arr ) / sizeof( T );
                 for ( int i = dim - 1; i >= 0; i -= 1 ) {               // reverse allocation order, index must be signed so i >= 0 can become false
                         ^(arr[i]){};                                                            // run destructor
 …
 } // adelete
 forall( dtype T | sized(T) | { void ^?{}( T & ); }, ttype Params | { void adelete( Params ); } )
 void adelete( size_t dim, T arr[], Params rest ) {
+forall( dtype T | sized(T) | { void ^?{}( T & ); }, ttype TT | { void adelete( TT ); } )
+void adelete( T arr[], TT rest ) {
         if ( arr ) {                                                                            // ignore null
+                size_t dim = malloc_size( arr ) / sizeof( T );
                 for ( int i = dim - 1; i >= 0; i -= 1 ) {               // reverse allocation order, index must be signed so i >= 0 can become false
                         ^(arr[i]){};                                                            // run destructor
 …
 extern "C" {                                                                                    // override C version
         void srandom( unsigned int seed ) { srand48( (long int)seed ); }
         long int random( void ) { return mrand48(); }
+        long int random( void ) { return mrand48(); }           // GENERATES POSITIVE AND NEGATIVE VALUES
 } // extern "C"

libcfa/src/stdlib.hfa

-              r3c64c668
+              r58fe85a
 // Created On       : Thu Jan 28 17:12:35 2016
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Tue Feb  4 08:27:01 2020
 // Update Count     : 401
+// Last Modified On : Sat Dec 12 13:52:34 2020
+// Update Count     : 536
 //
 #pragma once
 #include "bits/defs.hfa"
 #include "bits/align.hfa"
+#include "bits/defs.hfa"                                                                // OPTIONAL_THREAD
+#include "bits/align.hfa"                                                               // libAlign
 #include <stdlib.h>                                                                             // *alloc, strto*, ato*
+#include <heap.hfa>
+// Reduce includes by explicitly defining these routines.
 extern "C" {
+        void * memalign( size_t align, size_t size );           // malloc.h
+        void * memalign( size_t alignment, size_t size );       // malloc.h
+        void * pvalloc( size_t size );                                          // malloc.h
         void * memset( void * dest, int fill, size_t size ); // string.h
         void * memcpy( void * dest, const void * src, size_t size ); // string.h
-    void * cmemalign( size_t alignment, size_t noOfElems, size_t elemSize ); // CFA heap
 } // extern "C"
-void * realloc( void * oaddr, size_t nalign, size_t size ); // CFA heap
 //---------------------------------------
 …
 //---------------------------------------
+#include "common.hfa"
+//---------------------------------------
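+// Macro rather than a routine: the expansion contains return statements, which must return from the enclosing allocation routine.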
+#define $ARRAY_ALLOC( allocation, alignment, dim ) \
+        if ( _Alignof(T) <= libAlign() ) return (T *)(void *)allocation( dim, (size_t)sizeof(T) ); /* C allocation */ \
+        else return (T *)alignment( _Alignof(T), dim, sizeof(T) )
 static inline forall( dtype T | sized(T) ) {
         // C dynamic allocation
+        // CFA safe equivalents, i.e., implicit size specification
         T * malloc( void ) {
                 if ( _Alignof(T) <= libAlign() ) return (T *)(void *)malloc( (size_t)sizeof(T) ); // C malloc
+                if ( _Alignof(T) <= libAlign() ) return (T *)(void *)malloc( (size_t)sizeof(T) ); // C allocation
                 else return (T *)memalign( _Alignof(T), sizeof(T) );
         } // malloc
+        T * aalloc( size_t dim ) {
+                $ARRAY_ALLOC( aalloc, amemalign, dim );
+        } // aalloc
         T * calloc( size_t dim ) {
+                if ( _Alignof(T) <= libAlign() )return (T *)(void *)calloc( dim, sizeof(T) ); // C calloc
+                else return (T *)cmemalign( _Alignof(T), dim, sizeof(T) );
+                $ARRAY_ALLOC( calloc, cmemalign, dim );
         } // calloc
+        T * resize( T * ptr, size_t size ) {                            // CFA resize, eliminate return-type cast
+                if ( _Alignof(T) <= libAlign() ) return (T *)(void *)resize( (void *)ptr, size ); // CFA resize
+                else return (T *)(void *)resize( (void *)ptr, _Alignof(T), size ); // CFA resize
+        } // resize
         T * realloc( T * ptr, size_t size ) {                           // CFA realloc, eliminate return-type cast
+                return (T *)(void *)realloc( (void *)ptr, size ); // C realloc
+                if ( _Alignof(T) <= libAlign() ) return (T *)(void *)realloc( (void *)ptr, size ); // C realloc
+                else return (T *)(void *)realloc( (void *)ptr, _Alignof(T), size ); // CFA realloc
         } // realloc
 …
         } // memalign
+        T * amemalign( size_t align, size_t dim ) {
+                return (T *)amemalign( align, dim, sizeof(T) ); // CFA amemalign
+        } // amemalign
         T * cmemalign( size_t align, size_t dim  ) {
                 return (T *)cmemalign( align, dim, sizeof(T) ); // CFA cmemalign
 …
         } // posix_memalign
+        // Cforall dynamic allocation
+        T * alloc( void ) {
+                return malloc();
+        } // alloc
+        T * alloc( size_t dim ) {
+                if ( _Alignof(T) <= libAlign() ) return (T *)(void *)malloc( dim * (size_t)sizeof(T) );
+                else return (T *)memalign( _Alignof(T), dim * sizeof(T) );
+        } // alloc
+        T * alloc( T ptr[], size_t dim ) {                                      // realloc
+                return (T *)(void *)realloc( (void *)ptr, dim * sizeof(T) ); // C realloc
+        } // alloc
+        T * alloc_set( char fill ) {
+                return (T *)memset( (T *)alloc(), (int)fill, sizeof(T) ); // initialize with fill value
+        } // alloc
+        T * alloc_set( T fill ) {
+                return (T *)memcpy( (T *)alloc(), &fill, sizeof(T) ); // initialize with fill value
+        } // alloc
+        T * alloc_set( size_t dim, char fill ) {
+                return (T *)memset( (T *)alloc( dim ), (int)fill, dim * sizeof(T) ); // initialize with fill value
+        } // alloc
+        T * alloc_set( size_t dim, T fill ) {
+                T * r = (T *)alloc( dim );
+                for ( i; dim ) { memcpy( &r[i], &fill, sizeof(T) ); } // initialize with fill value
+                return r;
+        } // alloc
+        T * alloc_set( size_t dim, const T fill[] ) {
+                return (T *)memcpy( (T *)alloc( dim ), fill, dim * sizeof(T) ); // initialize with fill value
+        } // alloc
+} // distribution
+forall( dtype T | sized(T) ) {
+        T * alloc_set( T ptr[], size_t dim, char fill );        // realloc array with fill
+} // distribution
+        T * valloc( void ) {
+                return (T *)valloc( sizeof(T) );                                // C valloc
+        } // valloc
+        T * pvalloc( void ) {
+                return (T *)pvalloc( sizeof(T) );                               // C pvalloc
+        } // pvalloc
+} // distribution
+/*
+        FIX ME : fix the alloc interface after Ticket Number 214 is resolved: define a union and add it to S_fill. Then, modify the postfix-fill functions to support T * with nmemb, char, and a T object of any size. Finally, change alloc_internal.
+        Or, just follow the instructions below for that.
+. Replace the current forall-block that contains the definitions of S_fill and S_realloc with the following:
+                forall( dtype T | sized(T) ) {
+                        union  U_fill           { char c; T * a; T t; };
+                        struct S_fill           { char tag; U_fill(T) fill; };
+                        struct S_realloc        { inline T *; };
+                }
+. Replace all current postfix-fill functions with following for updated S_fill:
+                S_fill(T) ?`fill( char a )                                      { S_fill(T) ret = {'c'}; ret.fill.c = a; return ret; }
+                S_fill(T) ?`fill( T    a )                                      { S_fill(T) ret = {'t'}; memcpy(&ret.fill.t, &a, sizeof(T)); return ret; }
+                S_fill(T) ?`fill( T    a[], size_t nmemb )      { S_fill(T) ret = {'a', nmemb}; ret.fill.a = a; return ret; }
+. Replace the $alloc_internal function which is outside ttype forall-block with following function:
+                T * $alloc_internal( void * Resize, T * Realloc, size_t Align, size_t Dim, S_fill(T) Fill) {
+                        T * ptr = NULL;
+                        size_t size = sizeof(T);
+                        size_t copy_end = 0;
+                        if(Resize) {
+                                ptr = (T*) (void *) resize( (int *)Resize, Align, Dim * size );
+                        } else if (Realloc) {
+                                if (Fill.tag != '0') copy_end = min(malloc_size( Realloc ), Dim * size);
+                                ptr = (T*) (void *) realloc( (int *)Realloc, Align, Dim * size );
+                        } else {
+                                ptr = (T*) (void *) memalign( Align, Dim * size );
+                        }
+                        if(Fill.tag == 'c') {
+                                memset( (char *)ptr + copy_end, (int)Fill.fill.c, Dim * size - copy_end );
+                        } else if(Fill.tag == 't') {
+                                for ( int i = copy_end; i <= Dim * size - size ; i += size ) {
+                                        memcpy( (char *)ptr + i, &Fill.fill.t, size );
+                                }
+                        } else if(Fill.tag == 'a') {
+                                memcpy( (char *)ptr + copy_end, Fill.fill.a, min(Dim * size - copy_end, size * Fill.nmemb) );
+                        }
+                        return ptr;
+                } // $alloc_internal
+*/
+typedef struct S_align                  { inline size_t;  } T_align;
+typedef struct S_resize                 { inline void *;  }     T_resize;
+forall( dtype T ) {
+        struct S_fill           { char tag; char c; size_t size; T * at; char t[50]; };
+        struct S_realloc        { inline T *; };
+}
+static inline T_align   ?`align   ( size_t a )  { return (T_align){a}; }
+static inline T_resize  ?`resize  ( void * a )  { return (T_resize){a}; }
 static inline forall( dtype T | sized(T) ) { // aligned allocation helpers, generic over any sized type T
+        T * alloc_align( size_t align ) { // storage for one T obtained from memalign with alignment align
+                return (T *)memalign( align, sizeof(T) );
+        } // alloc_align
+        T * alloc_align( size_t align, size_t dim ) { // storage for dim elements of T obtained from memalign
+                return (T *)memalign( align, dim * sizeof(T) );
+        } // alloc_align
+        T * alloc_align( T ptr[], size_t align ) {                      // aligned realloc of ptr to sizeof(T) bytes
+                return (T *)(void *)realloc( (void *)ptr, align, sizeof(T) ); // CFA realloc (three-argument form taking an alignment)
+        } // alloc_align
+        T * alloc_align( T ptr[], size_t align, size_t dim ) { // aligned realloc of ptr to dim elements of T
+                return (T *)(void *)realloc( (void *)ptr, align, dim * sizeof(T) ); // CFA realloc (three-argument form taking an alignment)
+        } // alloc_align
+        T * alloc_align_set( size_t align, char fill ) { // aligned single T with every byte set to fill
+                return (T *)memset( (T *)alloc_align( align ), (int)fill, sizeof(T) ); // initialize with fill value
+        } // alloc_align_set
+        T * alloc_align_set( size_t align, T fill ) { // aligned single T initialized as a byte copy of fill
+                return (T *)memcpy( (T *)alloc_align( align ), &fill, sizeof(T) ); // initialize with fill value
+        } // alloc_align_set
+        T * alloc_align_set( size_t align, size_t dim, char fill ) { // aligned array of dim T with every byte set to fill
+                return (T *)memset( (T *)alloc_align( align, dim ), (int)fill, dim * sizeof(T) ); // initialize with fill value
+        } // alloc_align_set
+        T * alloc_align_set( size_t align, size_t dim, T fill ) { // aligned array of dim T, each element a byte copy of fill
+                T * r = (T *)alloc_align( align, dim );
+                for ( i; dim ) { memcpy( &r[i], &fill, sizeof(T) ); } // initialize with fill value
+                return r;
+        } // alloc_align_set
+        T * alloc_align_set( size_t align, size_t dim, const T fill[] ) { // aligned array of dim T copied from the first dim elements of fill
+                return (T *)memcpy( (T *)alloc_align( align, dim ), fill, dim * sizeof(T) );
+        } // alloc_align_set
+} // distribution
+forall( dtype T | sized(T) ) {
+        T * alloc_align_set( T ptr[], size_t align, size_t dim, char fill ); // aligned realloc array with fill
+} // distribution
+        // Build a fill descriptor that carries a by-value copy of t (tag 't').
+        // The copy lives in the descriptor's fixed-size buffer, so objects that do not
+        // fit are rejected: a message is printed and the program exits with status 1.
+        S_fill(T) ?`fill ( T t ) {
+                S_fill(T) filler = { 't' };
+                if ( sizeof(T) > sizeof(filler.t) ) {
+                        printf("ERROR: const object of size greater than 50 bytes given for dynamic memory fill\n");
+                        exit(1);
+                }
+                memcpy( &filler.t, &t, sizeof(T) );
+                return filler;
+        }
+        S_fill(T)               ?`fill ( char c )                               { return (S_fill(T)){ 'c', c }; } // tag 'c': $alloc_internal memsets the storage with byte c
+        S_fill(T)               ?`fill ( T * a )                                { return (S_fill(T)){ 'T', '0', 0, a }; } // tag 'T': $alloc_internal copies the single object at a into each element
+        S_fill(T)               ?`fill ( T a[], size_t nmemb )  { return (S_fill(T)){ 'a', '0', nmemb * sizeof(T), a }; } // tag 'a': $alloc_internal copies at most nmemb * sizeof(T) bytes from a
+        S_realloc(T)    ?`realloc ( T * a )                             { return (S_realloc(T)){a}; } // wraps a so it can be passed as the S_realloc(T) option of alloc
+        // Common back end of alloc(): obtains storage for Dim objects of type T and applies Fill.
+        //   Resize  - if non-null, storage handed to resize() (checked before Realloc)
+        //   Realloc - if non-null and Resize is null, storage handed to the aligned realloc()
+        //   Align   - alignment passed to resize / realloc / memalign
+        //   Dim     - number of T elements requested
+        //   Fill    - fill descriptor; a tag matching none of 'c', 't', 'a', 'T' (e.g. '0') leaves the storage unfilled
+        // Returns the pointer produced by resize / realloc / memalign; it is not checked for null here.
+        T * $alloc_internal( void * Resize, T * Realloc, size_t Align, size_t Dim, S_fill(T) Fill) {
+                T * ptr = NULL;
+                size_t size = sizeof(T);
+                size_t total = Dim * size;                              // requested size in bytes
+                size_t copy_end = 0;                                    // bytes kept from the old storage; filling starts here
+                if ( Resize ) {
+                        ptr = (T*) (void *) resize( (void *)Resize, Align, total );
+                } else if ( Realloc ) {
+                        // sample the old size before realloc so that only the tail beyond it is filled
+                        if (Fill.tag != '0') copy_end = min(malloc_size( Realloc ), total);
+                        ptr = (T*) (void *) realloc( (void *)Realloc, Align, total );
+                } else {
+                        ptr = (T*) (void *) memalign( Align, total );
+                }
+                if(Fill.tag == 'c') {
+                        memset( (char *)ptr + copy_end, (int)Fill.c, total - copy_end );
+                } else if(Fill.tag == 't') {
+                        // size_t index: an int index truncates copy_end and overflows once the request exceeds INT_MAX bytes
+                        for ( size_t i = copy_end; i < total; i += size ) {
+                                memcpy( (char *)ptr + i, &Fill.t, size );
+                        }
+                } else if(Fill.tag == 'a') {
+                        // copy no more than the source array holds and no more than the space left
+                        memcpy( (char *)ptr + copy_end, Fill.at, min(total - copy_end, Fill.size) );
+                } else if(Fill.tag == 'T') {
+                        for ( size_t i = copy_end; i < total; i += size ) {
+                                memcpy( (char *)ptr + i, Fill.at, size );
+                        }
+                }
+                return ptr;
+        } // $alloc_internal
+        forall( ttype TT | { T * $alloc_internal( void *, T *, size_t, size_t, S_fill(T), TT ); } ) { // TT: the remaining option arguments; requires an $alloc_internal overload accepting them
+                T * $alloc_internal( void *       , T * Realloc, size_t Align, size_t Dim, S_fill(T) Fill, T_resize Resize, TT rest) { // consume a T_resize option: it replaces the unnamed Resize slot
+                return $alloc_internal( Resize, (T*)0p, Align, Dim, Fill, rest); // forwards a null Realloc, so a previously supplied realloc target is dropped
+                }
+                T * $alloc_internal( void * Resize, T *        , size_t Align, size_t Dim, S_fill(T) Fill, S_realloc(T) Realloc, TT rest) { // consume an S_realloc(T) option: it replaces the unnamed Realloc slot
+                return $alloc_internal( (void*)0p, Realloc, Align, Dim, Fill, rest); // forwards a null Resize, so a previously supplied resize target is dropped
+                }
+                T * $alloc_internal( void * Resize, T * Realloc, size_t      , size_t Dim, S_fill(T) Fill, T_align Align, TT rest) { // consume a T_align option: it replaces the unnamed alignment slot
+                return $alloc_internal( Resize, Realloc, Align, Dim, Fill, rest);
+                }
+                T * $alloc_internal( void * Resize, T * Realloc, size_t Align, size_t Dim, S_fill(T)     , S_fill(T) Fill, TT rest) { // consume an S_fill(T) option: it replaces the unnamed fill slot
+                return $alloc_internal( Resize, Realloc, Align, Dim, Fill, rest);
+                }
+            T * alloc( TT all ) { // allocate one T; all holds the option arguments
+                return $alloc_internal( (void*)0p, (T*)0p, (_Alignof(T) > libAlign() ? _Alignof(T) : libAlign()), (size_t)1, (S_fill(T)){'0'}, all); // defaults: no resize/realloc target, alignment = larger of _Alignof(T) and libAlign(), fill tag '0'
+            }
+            T * alloc( size_t dim, TT all ) { // allocate dim elements of T; all holds the option arguments
+                return $alloc_internal( (void*)0p, (T*)0p, (_Alignof(T) > libAlign() ? _Alignof(T) : libAlign()), dim, (S_fill(T)){'0'}, all); // same defaults as the single-object form, with Dim = dim
+            }
+        } // distribution TT
+} // distribution T
 static inline forall( dtype T | sized(T) ) {
         // data, non-array types
+        // CFA safe initialization/copy, i.e., implicit size specification, non-array types
         T * memset( T * dest, char fill ) {
                 return (T *)memset( dest, fill, sizeof(T) );
 …
                 return (T *)memcpy( dest, src, sizeof(T) );
         } // memcpy
+} // distribution
+static inline forall( dtype T | sized(T) ) {
+        // data, array types
+        // CFA safe initialization/copy, i.e., implicit size specification, array types
         T * amemset( T dest[], char fill, size_t dim ) {
                 return (T *)(void *)memset( dest, fill, dim * sizeof(T) ); // C memset
 …
 } // distribution
+// allocation/deallocation and constructor/destructor, non-array types
+forall( dtype T | sized(T), ttype Params | { void ?{}( T &, Params ); } ) T * new( Params p );
+forall( dtype T | sized(T) | { void ^?{}( T & ); } ) void delete( T * ptr );
+forall( dtype T, ttype Params | sized(T) | { void ^?{}( T & ); void delete( Params ); } ) void delete( T * ptr, Params rest );
+// allocation/deallocation and constructor/destructor, array types
+forall( dtype T | sized(T), ttype Params | { void ?{}( T &, Params ); } ) T * anew( size_t dim, Params p );
+forall( dtype T | sized(T) | { void ^?{}( T & ); } ) void adelete( size_t dim, T arr[] );
+forall( dtype T | sized(T) | { void ^?{}( T & ); }, ttype Params | { void adelete( Params ); } ) void adelete( size_t dim, T arr[], Params rest );
+// CFA deallocation for multiple objects
+static inline forall( dtype T )                                                 // FIX ME, problems with 0p in list
+void free( T * ptr ) {                                                          // typed front end: accepts a pointer to any data type T
+        free( (void *)ptr );                                                            // C free
+} // free
+static inline forall( dtype T, ttype TT | { void free( TT ); } ) // TT: the remaining arguments, which must themselves be acceptable to some free
+void free( T * ptr, TT rest ) { // free several pointers in one call, left to right
+        free( ptr ); // release the first pointer
+        free( rest ); // then hand the remaining arguments to free
+} // free
+// CFA allocation/deallocation and constructor/destructor, non-array types
+static inline forall( dtype T | sized(T), ttype TT | { void ?{}( T &, TT ); } ) // requires a constructor of T taking the arguments TT
+T * new( TT p ) { // allocate a T with malloc and construct it in place from p
+        return &(*(T *)malloc()){ p };                                                  // run constructor
+} // new
+static inline forall( dtype T | { void ^?{}( T & ); } ) // requires a destructor for T
+void delete( T * ptr ) { // run the destructor on *ptr when the guard below holds, then free ptr
+        // special case for 0-sized object => always call destructor
+        if ( ptr || sizeof(ptr) == 0 ) {                                        // ignore null but not 0-sized objects; NOTE(review): sizeof(ptr) is the size of the pointer, never 0, so only the null test has any effect -- sizeof(T) (needing sized(T)) may have been intended, confirm
+                ^(*ptr){};                                                                              // run destructor
+        } // if
+        free( ptr );                                                                            // always call free
+} // delete
+static inline forall( dtype T, ttype TT | { void ^?{}( T & ); void delete( TT ); } ) // TT: the remaining arguments, which must themselves be acceptable to some delete
+void delete( T * ptr, TT rest ) { // delete several objects in one call, left to right
+        delete( ptr ); // destroy and free the first object
+        delete( rest ); // then hand the remaining arguments to delete
+} // delete
+// CFA allocation/deallocation and constructor/destructor, array types
+forall( dtype T | sized(T), ttype TT | { void ?{}( T &, TT ); } ) T * anew( size_t dim, TT p );
+forall( dtype T | sized(T) | { void ^?{}( T & ); } ) void adelete( T arr[] );
+forall( dtype T | sized(T) | { void ^?{}( T & ); }, ttype TT | { void adelete( TT ); } ) void adelete( T arr[], TT rest );
 //---------------------------------------
 …
 extern "C" {                                                                                    // override C version
         void srandom( unsigned int seed );
+        long int random( void );
+        long int random( void );                                                        // GENERATES POSITIVE AND NEGATIVE VALUES
+        // For positive values, use unsigned int, e.g., unsigned int r = random() % 100U;
 } // extern "C"
 …
         long int random( long int u ) { if ( u < 0 ) return random( u, 0 ); else return random( 0, u ); } // [0,u)
         unsigned long int random( void ) { return lrand48(); }
+        unsigned long int random( unsigned long int u ) { return lrand48() % u; } // [0,u)
         unsigned long int random( unsigned long int l, unsigned long int u ) { if ( u < l ) [u, l] = [l, u]; return lrand48() % (u - l) + l; } // [l,u)
-        unsigned long int random( unsigned long int u ) { return lrand48() % u; } // [0,u)
         char random( void ) { return (unsigned long int)random(); }
 …
 //---------------------------------------
+#include "common.hfa"
+//---------------------------------------
+extern bool threading_enabled(void) OPTIONAL_THREAD;
+extern bool threading_enabled( void ) OPTIONAL_THREAD;
 // Local Variables: //

libcfa/src/time.hfa

-              r3c64c668
+              r58fe85a
 // Created On       : Wed Mar 14 23:18:57 2018
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Tue Feb  4 08:24:32 2020
 // Update Count     : 654
+// Last Modified On : Wed Jun 17 16:13:00 2020
+// Update Count     : 663
 //
 …
 #include <time.h>                                                                               // timespec
-extern "C" {
 #include <sys/time.h>                                                                   // timeval
+}
 #include <time_t.hfa>                                                                   // Duration/Time types
 …
         int64_t ?`w( Duration dur ) { return dur.tn / (7LL * 24LL * 60LL * 60LL * TIMEGRAN); }
+        // Floating-point conversions: the duration as a (possibly fractional) count of the named unit.
+        // TIMEGRAN is the number of tn ticks per second; every divisor casts it to double explicitly.
+        // NOTE(review): ?`dns returns tn unscaled, i.e. it presumes one tick is one nanosecond -- confirm against TIMEGRAN.
+        double ?`dns( Duration dur ) { return dur.tn; }
+        double ?`dus( Duration dur ) { return dur.tn / ((double)TIMEGRAN / 1_000_000.); }
+        double ?`dms( Duration dur ) { return dur.tn / ((double)TIMEGRAN / 1_000.); }
+        double ?`ds( Duration dur ) { return dur.tn / (double)TIMEGRAN; }
+        double ?`dm( Duration dur ) { return dur.tn / (60. * (double)TIMEGRAN); }
+        double ?`dh( Duration dur ) { return dur.tn / (60. * 60. * (double)TIMEGRAN); }
+        double ?`dd( Duration dur ) { return dur.tn / (24. * 60. * 60. * (double)TIMEGRAN); }
+        double ?`dw( Duration dur ) { return dur.tn / (7. * 24. * 60. * 60. * (double)TIMEGRAN); }
         Duration max( Duration lhs, Duration rhs ) { return  (lhs.tn < rhs.tn) ? rhs : lhs;}
         Duration min( Duration lhs, Duration rhs ) { return !(rhs.tn < lhs.tn) ? lhs : rhs;}

longrun_tests/Makefile.am

-              r3c64c668
+              r58fe85a
 ACLOCAL_AMFLAGS  = -I automake
 include $(top_srcdir)/src/cfa.make
+include $(top_srcdir)/tools/build/cfa.make
 repeats=10
 …
         -DTEST_$(shell cat .type | tr a-z A-Z)
 TESTS = block coroutine create disjoint enter enter3 processor stack wait yield
+TESTS = block coroutine create disjoint enter enter3 locks processor stack wait yield
 # .INTERMEDIATE: $(TESTS)

src/AST/Attribute.hpp

r3c64c668	r58fe85a
51	51	template<typename node_t>
52	52	friend node_t * mutate(const node_t * node);
	53	template<typename node_t>
	54	friend node_t * shallowCopy(const node_t * node);
53	55	};
54	56

src/AST/CVQualifiers.hpp

-              r3c64c668
+              r58fe85a
                 Restrict = 1 << 1,
                 Volatile = 1 << 2,
+                Lvalue   = 1 << 3,
+                Mutex    = 1 << 4,
+                Atomic   = 1 << 5,
+                NumQualifiers = 6
+                Mutex    = 1 << 3,
+                Atomic   = 1 << 4,
+                NumQualifiers = 5
         };
         /// Mask for equivalence-preserving qualifiers
         enum { EquivQualifiers = ~(Restrict | Lvalue) };
+        enum { EquivQualifiers = ~Restrict };
         /// Underlying data for qualifiers
 …
                                 bool is_restrict : 1;
                                 bool is_volatile : 1;
-                                bool is_lvalue   : 1;
                                 bool is_mutex    : 1;
                                 bool is_atomic   : 1;

src/AST/Convert.cpp

-              r3c64c668
+              r58fe85a
 // Author           : Thierry Delisle
 // Created On       : Thu May 09 15::37::05 2019
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Wed Dec 11 21:39:32 2019
 // Update Count     : 33
+// Last Modified By : Andrew Beach
+// Last Modified On : Thr Nov 12 10:07:00 2020
+// Update Count     : 34
 //
 …
 #include "AST/Attribute.hpp"
+#include "AST/Copy.hpp"
 #include "AST/Decl.hpp"
 #include "AST/Expr.hpp"
 #include "AST/Init.hpp"
 #include "AST/Stmt.hpp"
+#include "AST/TranslationUnit.hpp"
 #include "AST/TypeSubstitution.hpp"
 …
 //================================================================================================
 namespace {
+namespace ast {
 // This is to preserve the FindSpecialDecls hack. It does not (and perhaps should not)
 // allow us to use the same strategy in the new ast.
+ast::Type * sizeType = nullptr;
+ast::FunctionDecl * dereferenceOperator = nullptr;
+ast::StructDecl   * dtorStruct = nullptr;
+ast::FunctionDecl * dtorStructDestroy = nullptr;
+// xxx - since convert back pass works, this concern seems to be unnecessary.
+// these need to be accessed in new FixInit now
+ast::ptr<ast::Type> sizeType = nullptr;
+const ast::FunctionDecl * dereferenceOperator = nullptr;
+const ast::StructDecl   * dtorStruct = nullptr;
+const ast::FunctionDecl * dtorStructDestroy = nullptr;
+}
 …
         using Cache = std::unordered_map< const ast::Node *, BaseSyntaxNode * >;
         Cache cache;
+        // Statements can no longer be shared.
+        // however, since StmtExprResult is now implemented, need to still maintain
+        // readonly references.
+        Cache readonlyCache;
         template<typename T>
 …
+        }
+        const ast::DeclWithType * visit( const ast::ObjectDecl * node ) override final {
+                auto&& bfwd = get<Expression>().accept1( node->bitfieldWidth );
+                auto&& type = get<Type>().accept1( node->type );
+                auto&& init = get<Initializer>().accept1( node->init );
+                auto&& attr = get<Attribute>().acceptL( node->attributes );
+        const ast::DeclWithType * visit( const ast::ObjectDecl * node ) override final {
                 if ( inCache( node ) ) {
                         return nullptr;
+                }
+                auto bfwd = get<Expression>().accept1( node->bitfieldWidth );
+                auto type = get<Type>().accept1( node->type );
+                auto attr = get<Attribute>().acceptL( node->attributes );
                 auto decl = new ObjectDecl(
                         node->name,
 …
                         LinkageSpec::Spec( node->linkage.val ),
                         bfwd,
                         type,
                         init,
+                        type->clone(),
+                        nullptr, // prevent infinite loop
                         attr,
                         Type::FuncSpecifiers( node->funcSpec.val )
                 );
+                return declWithTypePostamble( decl, node );
+                // handles the case where node->init references itself
+                // xxx - does it really happen?
+                declWithTypePostamble(decl, node);
+                auto init = get<Initializer>().accept1( node->init );
+                decl->init = init;
+                this->node = decl;
+                return nullptr;
+        }
         const ast::DeclWithType * visit( const ast::FunctionDecl * node ) override final {
                 if ( inCache( node ) ) return nullptr;
+                // function decl contains real variables that the type must use.
+                // the structural change means function type in and out of decl
+                // must be handled **differently** on convert back to old.
+                auto ftype = new FunctionType(
+                        cv(node->type),
+                        (bool)node->type->isVarArgs
+                );
+                ftype->returnVals = get<DeclarationWithType>().acceptL(node->returns);
+                ftype->parameters = get<DeclarationWithType>().acceptL(node->params);
+                ftype->forall = get<TypeDecl>().acceptL( node->type_params );
+                if (!node->assertions.empty()) {
+                        assert(!ftype->forall.empty());
+                        // find somewhere to place assertions back, for convenience it is the last slot
+                        ftype->forall.back()->assertions = get<DeclarationWithType>().acceptL(node->assertions);
+                }
+                visitType(node->type, ftype);
                 auto decl = new FunctionDecl(
                         node->name,
                         Type::StorageClasses( node->storage.val ),
                         LinkageSpec::Spec( node->linkage.val ),
+                        get<FunctionType>().accept1( node->type ),
+                        ftype,
+                        //get<FunctionType>().accept1( node->type ),
                         {},
                         get<Attribute>().acceptL( node->attributes ),
 …
                 decl->statements = get<CompoundStmt>().accept1( node->stmts );
                 decl->withExprs = get<Expression>().acceptL( node->withExprs );
                 if ( dereferenceOperator == node ) {
+                if ( ast::dereferenceOperator == node ) {
                         Validate::dereferenceOperator = decl;
+                }
                 if ( dtorStructDestroy == node ) {
+                if ( ast::dtorStructDestroy == node ) {
                         Validate::dtorStructDestroy = decl;
+                }
 …
         const ast::Decl * namedTypePostamble( NamedTypeDecl * decl, const ast::NamedTypeDecl * node ) {
                 // base comes from constructor
-                decl->parameters = get<TypeDecl>().acceptL( node->params );
                 decl->assertions = get<DeclarationWithType>().acceptL( node->assertions );
                 declPostamble( decl, node );
 …
                 );
                 if ( dtorStruct == node ) {
+                if ( ast::dtorStruct == node ) {
                         Validate::dtorStruct = decl;
+                }
 …
         const ast::Stmt * stmtPostamble( Statement * stmt, const ast::Stmt * node ) {
+                cache.emplace( node, stmt );
+                // force statements in old tree to be unique.
+                // cache.emplace( node, stmt );
+                readonlyCache.emplace( node, stmt );
                 stmt->location = node->location;
                 stmt->labels = makeLabelL( stmt, node->labels );
 …
                 if ( inCache( node ) ) return nullptr;
                 auto stmt = new ExprStmt( nullptr );
-                cache.emplace( node, stmt );
                 stmt->expr = get<Expression>().accept1( node->expr );
                 return stmtPostamble( stmt, node );
 …
+        }
+        const ast::Stmt * visit(const ast::SuspendStmt * node ) override final {
+                if ( inCache( node ) ) return nullptr;
+                auto stmt = new SuspendStmt();
+                stmt->then   = get<CompoundStmt>().accept1( node->then   );
+                switch(node->type) {
+                        case ast::SuspendStmt::None     : stmt->type = SuspendStmt::None     ; break;
+                        case ast::SuspendStmt::Coroutine: stmt->type = SuspendStmt::Coroutine; break;
+                        case ast::SuspendStmt::Generator: stmt->type = SuspendStmt::Generator; break;
+                }
+                return stmtPostamble( stmt, node );
+        }
         const ast::Stmt * visit( const ast::WaitForStmt * node ) override final {
                 if ( inCache( node ) ) return nullptr;
 …
                 for (decltype(src->begin()) src_i = src->begin(); src_i != src->end(); src_i++) {
                         rslt->add( src_i->first,
+                        rslt->add( src_i->first.typeString(),
                                    get<Type>().accept1(src_i->second) );
+                }
-                for (decltype(src->beginVar()) src_i = src->beginVar(); src_i != src->endVar(); src_i++) {
-                        rslt->addVar( src_i->first,
-                                      get<Expression>().accept1(src_i->second) );
+                }
 …
                 assert( tgtResnSlots.empty() );
                 if ( srcInferred.mode == ast::Expr::InferUnion::Params ) {
+                if ( srcInferred.data.inferParams ) {
                         const ast::InferredParams &srcParams = srcInferred.inferParams();
                         for (auto & srcParam : srcParams) {
 …
                                         srcParam.second.decl,
                                         get<Declaration>().accept1(srcParam.second.declptr),
                                         get<Type>().accept1(srcParam.second.actualType),
                                         get<Type>().accept1(srcParam.second.formalType),
                                         get<Expression>().accept1(srcParam.second.expr)
+                                        get<Type>().accept1(srcParam.second.actualType)->clone(),
+                                        get<Type>().accept1(srcParam.second.formalType)->clone(),
+                                        get<Expression>().accept1(srcParam.second.expr)->clone()
                                 ));
                                 assert(res.second);
+                        }
+                } else if ( srcInferred.mode == ast::Expr::InferUnion::Slots  ) {
+                }
+                if ( srcInferred.data.resnSlots ) {
                         const ast::ResnSlots &srcSlots = srcInferred.resnSlots();
                         for (auto srcSlot : srcSlots) {
 …
                 tgt->result = get<Type>().accept1(src->result);
+                // Unconditionally use a clone of the result type.
+                // We know this will leak some objects: much of the immediate conversion result.
+                // In some cases, using the conversion result directly gives unintended object sharing.
+                // A parameter (ObjectDecl, a child of a FunctionType) is shared by the weak-ref cache.
+                // But tgt->result must be fully owned privately by tgt.
+                // Applying these conservative copies here means
+                // - weak references point at the declaration's copy, not these expr.result copies (good)
+                // - we copy more objects than really needed (bad, tolerated)
+                if (tgt->result) {
+                        tgt->result = tgt->result->clone();
+                }
                 return visitBaseExpr_skipResultType(src, tgt);
+        }
 …
                         new KeywordCastExpr(
                                 get<Expression>().accept1(node->arg),
+                                castTarget
+                                castTarget,
+                                {node->concrete_target.field, node->concrete_target.getter}
+                        )
                 );
 …
         const ast::Expr * visit( const ast::StmtExpr * node ) override final {
+                auto stmts = node->stmts;
+                // disable sharing between multiple StmtExprs explicitly.
+                // this should no longer be true.
                 auto rslt = new StmtExpr(
                         get<CompoundStmt>().accept1(node->stmts)
+                        get<CompoundStmt>().accept1(stmts)
                 );
                 rslt->returnDecls = get<ObjectDecl>().acceptL(node->returnDecls);
                 rslt->dtors       = get<Expression>().acceptL(node->dtors);
+                if (node->resultExpr) {
+                        // this MUST be found by children visit
+                        rslt->resultExpr  = strict_dynamic_cast<ExprStmt *>(readonlyCache.at(node->resultExpr));
+                }
                 auto expr = visitBaseExpr( node, rslt );
 …
                 auto expr = visitBaseExpr( node, rslt );
                 this->node = expr;
+                this->node = expr->clone();
                 return nullptr;
+        }
 …
                 auto type = new BasicType{ cv( node ), (BasicType::Kind)(unsigned)node->kind };
                 // I believe this should always be a BasicType.
                 if ( sizeType == node ) {
+                if ( ast::sizeType == node ) {
                         Validate::SizeType = type;
+                }
 …
         const ast::Type * visit( const ast::FunctionType * node ) override final {
+                static std::string dummy_paramvar_prefix = "__param_";
+                static std::string dummy_returnvar_prefix = "__retval_";
                 auto ty = new FunctionType {
                         cv( node ),
                         (bool)node->isVarArgs
                 };
+                ty->returnVals = get<DeclarationWithType>().acceptL( node->returns );
+                ty->parameters = get<DeclarationWithType>().acceptL( node->params );
+                ty->forall = get<TypeDecl>().acceptL( node->forall );
+                auto returns = get<Type>().acceptL(node->returns);
+                auto params = get<Type>().acceptL(node->params);
+                int ret_index = 0;
+                for (auto t: returns) {
+                        // xxx - LinkageSpec shouldn't matter but needs to be something
+                        ObjectDecl * dummy = new ObjectDecl(dummy_returnvar_prefix + std::to_string(ret_index++), {}, LinkageSpec::C, nullptr, t, nullptr);
+                        ty->returnVals.push_back(dummy);
+                }
+                int param_index = 0;
+                for (auto t: params) {
+                        ObjectDecl * dummy = new ObjectDecl(dummy_paramvar_prefix + std::to_string(param_index++), {}, LinkageSpec::C, nullptr, t, nullptr);
+                        ty->parameters.push_back(dummy);
+                }
+                // ty->returnVals = get<DeclarationWithType>().acceptL( node->returns );
+                // ty->parameters = get<DeclarationWithType>().acceptL( node->params );
+                auto types = get<TypeInstType>().acceptL( node->forall );
+                for (auto t : types) {
+                        auto newT = new TypeDecl(*t->baseType);
+                        newT->name = t->name; // converted by typeString()
+                        for (auto asst : newT->assertions) delete asst;
+                        newT->assertions.clear();
+                        ty->forall.push_back(newT);
+                }
+                auto assts = get<VariableExpr>().acceptL( node->assertions );
+                if (!assts.empty()) {
+                        assert(!types.empty());
+                        for (auto asst : assts) {
+                                auto newDecl = new ObjectDecl(*strict_dynamic_cast<ObjectDecl*>(asst->var));
+                                delete newDecl->type;
+                                newDecl->type = asst->result->clone();
+                                newDecl->storageClasses.is_extern = true; // hack
+                                ty->forall.back()->assertions.push_back(newDecl);
+                        }
+                }
                 return visitType( node, ty );
+        }
+        const ast::Type * postvisit( const ast::ReferenceToType * old, ReferenceToType * ty ) {
+                ty->forall = get<TypeDecl>().acceptL( old->forall );
+        const ast::Type * postvisit( const ast::BaseInstType * old, ReferenceToType * ty ) {
                 ty->parameters = get<Expression>().acceptL( old->params );
                 ty->hoistType = old->hoistType;
 …
                         ty = new TypeInstType{
                                 cv( node ),
                                 node->name,
+                                node->typeString(),
                                 get<TypeDecl>().accept1( node->base ),
                                 get<Attribute>().acceptL( node->attributes )
 …
                         ty = new TypeInstType{
                                 cv( node ),
                                 node->name,
+                                node->typeString(),
                                 node->kind == ast::TypeDecl::Ftype,
                                 get<Attribute>().acceptL( node->attributes )
 …
 };
 std::list< Declaration * > convert( const std::list< ast::ptr< ast::Decl > > && translationUnit ) {
+std::list< Declaration * > convert( const ast::TranslationUnit && translationUnit ) {
         ConverterNewToOld c;
         std::list< Declaration * > decls;
         for(auto d : translationUnit) {
+        for(auto d : translationUnit.decls) {
                 decls.emplace_back( c.decl( d ) );
+        }
 …
         ast::Node * node = nullptr;
         /// cache of nodes that might be referenced by readonly<> for de-duplication
+        std::unordered_map< const BaseSyntaxNode *, ast::Node * > cache = {};
+        /// in case that some nodes are dropped by conversion (due to possible structural change)
+        /// use smart pointers in cache value to prevent accidental invalidation.
+        /// at conversion stage, all created nodes are guaranteed to be unique, therefore
+        /// const_casting out of smart pointers is permitted.
+        std::unordered_map< const BaseSyntaxNode *, ast::readonly<ast::Node> > cache = {};
         // Local Utilities:
 …
                 auto it = cache.find( old );
                 if ( it == cache.end() ) return false;
                 node = it->second;
+                node = const_cast<ast::Node *>(it->second.get());
                 return true;
+        }
 …
         virtual void visit( const FunctionDecl * old ) override final {
                 if ( inCache( old ) ) return;
+                auto paramVars = GET_ACCEPT_V(type->parameters, DeclWithType);
+                auto returnVars = GET_ACCEPT_V(type->returnVals, DeclWithType);
+                auto forall = GET_ACCEPT_V(type->forall, TypeDecl);
+                // function type is now derived from parameter decls instead of storing them
+                /*
+                auto ftype = new ast::FunctionType((ast::ArgumentFlag)old->type->isVarArgs, cv(old->type));
+                ftype->params.reserve(paramVars.size());
+                ftype->returns.reserve(returnVars.size());
+                for (auto & v: paramVars) {
+                        ftype->params.emplace_back(v->get_type());
+                }
+                for (auto & v: returnVars) {
+                        ftype->returns.emplace_back(v->get_type());
+                }
+                ftype->forall = std::move(forall);
+                */
+                // can function type have attributes? seems not to be the case.
+                // visitType(old->type, ftype);
+                // collect assertions and put directly in FunctionDecl
+                std::vector<ast::ptr<ast::DeclWithType>> assertions;
+                for (auto & param: forall) {
+                        for (auto & asst: param->assertions) {
+                                assertf(asst->unique(), "newly converted decl must be unique");
+                                assertions.emplace_back(asst);
+                        }
+                        auto mut = param.get_and_mutate();
+                        assertf(mut == param, "newly converted decl must be unique");
+                        mut->assertions.clear();
+                }
                 auto decl = new ast::FunctionDecl{
                         old->location,
                         old->name,
+                        GET_ACCEPT_1(type, FunctionType),
+                        // GET_ACCEPT_1(type, FunctionType),
+                        std::move(forall),
+                        std::move(paramVars),
+                        std::move(returnVars),
                         {},
                         { old->storageClasses.val },
                         { old->linkage.val },
                         GET_ACCEPT_V(attributes, Attribute),
+                        { old->get_funcSpec().val }
+                        { old->get_funcSpec().val },
+                        old->type->isVarArgs
                 };
+                // decl->type = ftype;
                 cache.emplace( old, decl );
+                decl->assertions = std::move(assertions);
                 decl->withExprs = GET_ACCEPT_V(withExprs, Expr);
                 decl->stmts = GET_ACCEPT_1(statements, CompoundStmt);
 …
                 if ( Validate::dereferenceOperator == old ) {
                         dereferenceOperator = decl;
+                        ast::dereferenceOperator = decl;
+                }
                 if ( Validate::dtorStructDestroy == old ) {
                         dtorStructDestroy = decl;
+                        ast::dtorStructDestroy = decl;
+                }
+        }
 …
                 if ( Validate::dtorStruct == old ) {
                         dtorStruct = decl;
+                        ast::dtorStruct = decl;
+                }
+        }
 …
                 cache.emplace( old, decl );
                 decl->assertions = GET_ACCEPT_V(assertions, DeclWithType);
-                decl->params     = GET_ACCEPT_V(parameters, TypeDecl);
                 decl->extension  = old->extension;
                 decl->uniqueId   = old->uniqueId;
 …
                 );
                 decl->assertions = GET_ACCEPT_V(assertions, DeclWithType);
-                decl->params     = GET_ACCEPT_V(parameters, TypeDecl);
                 decl->extension  = old->extension;
                 decl->uniqueId   = old->uniqueId;
 …
+        }
+        virtual void visit( const SuspendStmt * old ) override final { // Converts an old-AST SuspendStmt into a new ast::SuspendStmt stored in this->node.
+                if ( inCache( old ) ) return; // skip the conversion when inCache( old ) reports a hit
+                ast::SuspendStmt::Type type;
+                switch (old->type) { // map each old enumerator onto the identically named ast:: enumerator
+                        case SuspendStmt::Coroutine: type = ast::SuspendStmt::Coroutine; break;
+                        case SuspendStmt::Generator: type = ast::SuspendStmt::Generator; break;
+                        case SuspendStmt::None     : type = ast::SuspendStmt::None     ; break;
+                        default: abort(); // any other enumerator value terminates the program
+                }
+                this->node = new ast::SuspendStmt(
+                        old->location,
+                        GET_ACCEPT_1(then  , CompoundStmt), // converted `then` member of the old statement
+                        type,
+                        GET_LABELS_V(old->labels)
+                );
+                cache.emplace( old, this->node ); // remember the old -> new mapping for later lookups
+        }
         virtual void visit( const WaitForStmt * old ) override final {
                 if ( inCache( old ) ) return;
 …
+        }
+        // TypeSubstitution shouldn't exist yet in old.
         ast::TypeSubstitution * convertTypeSubstitution(const TypeSubstitution * old) {
                 if (!old) return nullptr;
+                if (old->empty()) return nullptr;
+                assert(false);
+                /*
                 ast::TypeSubstitution *rslt = new ast::TypeSubstitution();
 …
+                }
-                for (decltype(old->beginVar()) old_i = old->beginVar(); old_i != old->endVar(); old_i++) {
-                        rslt->addVar( old_i->first,
-                                      getAccept1<ast::Expr>(old_i->second) );
+                }
                 return rslt;
+                */
+        }
 …
                 assert( oldInferParams.empty() || oldResnSlots.empty() );
                 assert( newInferred.mode == ast::Expr::InferUnion::Empty );
+                // assert( newInferred.mode == ast::Expr::InferUnion::Empty );
                 if ( !oldInferParams.empty() ) {
 …
                                 old->location,
                                 GET_ACCEPT_1(arg, Expr),
+                                castTarget
+                                castTarget,
+                                {old->concrete_target.field, old->concrete_target.getter}
+                        )
                 );
 …
                                 old->location,
                                 GET_ACCEPT_1(member, DeclWithType),
+                                GET_ACCEPT_1(aggregate, Expr)
+                                GET_ACCEPT_1(aggregate, Expr),
+                                ast::MemberExpr::NoOpConstructionChosen
+                        )
                 );
 …
                 // I believe this should always be a BasicType.
                 if ( Validate::SizeType == old ) {
                         sizeType = type;
+                        ast::sizeType = type;
+                }
                 visitType( old, type );
 …
                         cv( old )
                 };
+                ty->returns = GET_ACCEPT_V( returnVals, DeclWithType );
+                ty->params = GET_ACCEPT_V( parameters, DeclWithType );
+                ty->forall = GET_ACCEPT_V( forall, TypeDecl );
+                auto returnVars = GET_ACCEPT_V(returnVals, DeclWithType);
+                auto paramVars = GET_ACCEPT_V(parameters, DeclWithType);
+                // ty->returns = GET_ACCEPT_V( returnVals, DeclWithType );
+                // ty->params = GET_ACCEPT_V( parameters, DeclWithType );
+                for (auto & v: returnVars) {
+                        ty->returns.emplace_back(v->get_type());
+                }
+                for (auto & v: paramVars) {
+                        ty->params.emplace_back(v->get_type());
+                }
+                // xxx - when will this be non-null?
+                // will have to create dangling (no-owner) decls to be pointed to
+                auto foralls = GET_ACCEPT_V( forall, TypeDecl );
+                for (auto & param : foralls) {
+                        ty->forall.emplace_back(new ast::TypeInstType(param->name, param));
+                        for (auto asst : param->assertions) {
+                                ty->assertions.emplace_back(new ast::VariableExpr({}, asst));
+                        }
+                }
                 visitType( old, ty );
+        }
+        void postvisit( const ReferenceToType * old, ast::ReferenceToType * ty ) {
+                ty->forall = GET_ACCEPT_V( forall, TypeDecl );
+        void postvisit( const ReferenceToType * old, ast::BaseInstType * ty ) {
                 ty->params = GET_ACCEPT_V( parameters, Expr );
                 ty->hoistType = old->hoistType;
 …
                         old->location,
                         GET_ACCEPT_1(value, Expr),
                         (old->get_maybeConstructed()) ? ast::MaybeConstruct : ast::DoConstruct
+                        (old->get_maybeConstructed()) ? ast::MaybeConstruct : ast::NoConstruct
                 );
+        }
 …
                         GET_ACCEPT_V(initializers, Init),
                         GET_ACCEPT_V(designations, Designation),
                         (old->get_maybeConstructed()) ? ast::MaybeConstruct : ast::DoConstruct
+                        (old->get_maybeConstructed()) ? ast::MaybeConstruct : ast::NoConstruct
                 );
+        }
 …
 #undef GET_ACCEPT_1
 std::list< ast::ptr< ast::Decl > > convert( const std::list< Declaration * > && translationUnit ) {
+ast::TranslationUnit convert( const std::list< Declaration * > && translationUnit ) {
         ConverterOldToNew c;
+        std::list< ast::ptr< ast::Decl > > decls;
+        ast::TranslationUnit unit;
+        if (Validate::SizeType) {
+                // this should be a BasicType.
+                auto old = strict_dynamic_cast<BasicType *>(Validate::SizeType);
+                ast::sizeType = new ast::BasicType{ (ast::BasicType::Kind)(unsigned)old->kind };
+        }
         for(auto d : translationUnit) {
                 d->accept( c );
                 decls.emplace_back( c.decl() );
+                unit.decls.emplace_back( c.decl() );
+        }
         deleteAll(translationUnit);
+        return decls;
+        // Load the local static variables into the global store.
+        unit.global.sizeType = ast::sizeType;
+        unit.global.dereference = ast::dereferenceOperator;
+        unit.global.dtorStruct = ast::dtorStruct;
+        unit.global.dtorDestroy = ast::dtorStructDestroy;
+        return unit;
+}

src/AST/Convert.hpp

-              r3c64c668
+              r58fe85a
 #include <list>
-#include "AST/Node.hpp"
 class Declaration;
 namespace ast {
         class Decl;
+        struct TranslationUnit;
 };
 std::list< Declaration * > convert( const std::list< ast::ptr< ast::Decl > > && translationUnit );
 std::list< ast::ptr< ast::Decl > > convert( const std::list< Declaration * > && translationUnit );
+std::list< Declaration * > convert( const ast::TranslationUnit && translationUnit );
+ast::TranslationUnit convert( const std::list< Declaration * > && translationUnit );

src/AST/Decl.cpp

-              r3c64c668
+              r58fe85a
 // --- FunctionDecl
+FunctionDecl::FunctionDecl( const CodeLocation & loc, const std::string & name, // Builds a FunctionDecl and derives its FunctionType from the given declarations.
+        std::vector<ptr<TypeDecl>>&& forall, // moved into the type_params member
+        std::vector<ptr<DeclWithType>>&& params, std::vector<ptr<DeclWithType>>&& returns,
+        CompoundStmt * stmts, Storage::Classes storage, Linkage::Spec linkage,
+        std::vector<ptr<Attribute>>&& attrs, Function::Specs fs, bool isVarArgs)
+: DeclWithType( loc, name, storage, linkage, std::move(attrs), fs ), params(std::move(params)), returns(std::move(returns)),
+        type_params(std::move(forall)), stmts( stmts ) {
+        FunctionType * ftype = new FunctionType(static_cast<ArgumentFlag>(isVarArgs)); // isVarArgs is converted directly to an ArgumentFlag
+        for (auto & param : this->params) { // this-> is required: the constructor argument `params` shadows the member and was moved from
+                ftype->params.emplace_back(param->get_type()); // the function type holds only the parameter's type
+        }
+        for (auto & ret : this->returns) { // same shadowing applies to `returns`
+                ftype->returns.emplace_back(ret->get_type());
+        }
+        for (auto & tp : this->type_params) {
+                ftype->forall.emplace_back(new TypeInstType(tp->name, tp)); // one new TypeInstType per type parameter, built from its name and declaration
+        }
+        this->type = ftype; // the derived type becomes this declaration's type
+}
 const Type * FunctionDecl::get_type() const { return type.get(); }
+void FunctionDecl::set_type(Type * t) { type = strict_dynamic_cast< FunctionType* >( t ); }
+void FunctionDecl::set_type( const Type * t ) { // Replaces the stored function type with t.
+        type = strict_dynamic_cast< const FunctionType * >( t ); // narrows t to FunctionType; NOTE(review): presumably fails hard on a non-FunctionType — confirm strict_dynamic_cast semantics
+}
 // --- TypeDecl

src/AST/Decl.hpp

-              r3c64c668
+              r58fe85a
 // Must be included in *all* AST classes; should be #undef'd at the end of the file
+#define MUTATE_FRIEND template<typename node_t> friend node_t * mutate(const node_t * node);
+#define MUTATE_FRIEND \
+    template<typename node_t> friend node_t * mutate(const node_t * node); \
+        template<typename node_t> friend node_t * shallowCopy(const node_t * node);
 namespace ast {
 …
         ptr<Expr> asmName;
         bool isDeleted = false;
+        bool isTypeFixed = false;
         DeclWithType( const CodeLocation& loc, const std::string& name, Storage::Classes storage,
 …
         virtual const Type * get_type() const = 0;
         /// Set type of this declaration. May be verified by subclass
         virtual void set_type(Type *) = 0;
+        virtual void set_type( const Type * ) = 0;
         const DeclWithType * accept( Visitor & v ) const override = 0;
 …
         const Type* get_type() const override { return type; }
         void set_type( Type * ty ) override { type = ty; }
+        void set_type( const Type * ty ) override { type = ty; }
         const DeclWithType * accept( Visitor& v ) const override { return v.visit( this ); }
 …
 class FunctionDecl : public DeclWithType {
 public:
+        std::vector<ptr<DeclWithType>> params;
+        std::vector<ptr<DeclWithType>> returns;
+        std::vector<ptr<TypeDecl>> type_params;
+        std::vector<ptr<DeclWithType>> assertions;
+        // declared type, derived from parameter declarations
         ptr<FunctionType> type;
         ptr<CompoundStmt> stmts;
         std::vector< ptr<Expr> > withExprs;
+        FunctionDecl( const CodeLocation & loc, const std::string & name, FunctionType * type,
+        FunctionDecl( const CodeLocation & loc, const std::string & name, std::vector<ptr<TypeDecl>>&& forall,
+                std::vector<ptr<DeclWithType>>&& params, std::vector<ptr<DeclWithType>>&& returns,
                 CompoundStmt * stmts, Storage::Classes storage = {}, Linkage::Spec linkage = Linkage::C,
                 std::vector<ptr<Attribute>>&& attrs = {}, Function::Specs fs = {})
         : DeclWithType( loc, name, storage, linkage, std::move(attrs), fs ), type( type ),
           stmts( stmts ) {}
+                std::vector<ptr<Attribute>>&& attrs = {}, Function::Specs fs = {}, bool isVarArgs = false);
+        // : DeclWithType( loc, name, storage, linkage, std::move(attrs), fs ), params(std::move(params)), returns(std::move(returns)),
+        //  stmts( stmts ) {}
         const Type * get_type() const override;
         void set_type(Type * t) override;
+        void set_type( const Type * t ) override;
         bool has_body() const { return stmts; }
 …
 public:
         ptr<Type> base;
-        std::vector<ptr<TypeDecl>> params;
         std::vector<ptr<DeclWithType>> assertions;
+        NamedTypeDecl( const CodeLocation& loc, const std::string& name, Storage::Classes storage,
+                Type* b, Linkage::Spec spec = Linkage::Cforall )
+        : Decl( loc, name, storage, spec ), base( b ), params(), assertions() {}
+        NamedTypeDecl(
+                const CodeLocation & loc, const std::string & name, Storage::Classes storage,
+                const Type * b, Linkage::Spec spec = Linkage::Cforall )
+        : Decl( loc, name, storage, spec ), base( b ), assertions() {}
         /// Produces a name for the kind of alias
 …
         };
+        TypeDecl( const CodeLocation & loc, const std::string & name, Storage::Classes storage, Type * b,
+                          Kind k, bool s, Type * i = nullptr )
+                : NamedTypeDecl( loc, name, storage, b ), kind( k ), sized( k == Ttype || s ),
+                init( i ) {}
+        TypeDecl(
+                const CodeLocation & loc, const std::string & name, Storage::Classes storage,
+                const Type * b, TypeDecl::Kind k, bool s, const Type * i = nullptr )
+        : NamedTypeDecl( loc, name, storage, b ), kind( k ), sized( k == TypeDecl::Ttype || s ),
+          init( i ) {}
         const char * typeString() const override;
 …
         bool is_coroutine() { return kind == Coroutine; }
+        bool is_monitor() { return kind == Monitor; }
+        bool is_thread() { return kind == Thread; }
+        bool is_generator() { return kind == Generator; }
+        bool is_monitor  () { return kind == Monitor  ; }
+        bool is_thread   () { return kind == Thread   ; }
         const Decl * accept( Visitor & v ) const override { return v.visit( this ); }

src/AST/DeclReplacer.cpp

-              r3c64c668
+              r58fe85a
                         const ast::TypeInstType * previsit( const ast::TypeInstType * );
                 };
+                struct VarExprReplacer { // Pass core whose postvisit substitutes VariableExprs according to an ExprMap.
+                private:
+                        const ExprMap & exprMap; // held by reference, so the map must outlive this object
+                public:
+                        VarExprReplacer(const ExprMap & exprMap): exprMap (exprMap) {}
+                        const Expr * postvisit (const VariableExpr *); // returns the replacement expression, or the input when unmapped
+                };
+        }
 …
                 DeclMap declMap;
                 return replace( node, declMap, typeMap, debug );
+        }
+        const ast::Node * replace( const ast::Node * node, const ExprMap & exprMap) { // Runs a VarExprReplacer pass over node and returns what accept() yields.
+                Pass<VarExprReplacer> replacer = {exprMap}; // the pass keeps a reference to exprMap, not a copy
+                return node->accept( replacer );
+        }
 …
                         return ninst;
+                }
+                const Expr * VarExprReplacer::postvisit( const VariableExpr * expr ) { // Swaps a variable expression for its mapped replacement, if any.
+                        if (!exprMap.count(expr->var)) return expr; // no entry for this declaration: leave the expression untouched
+                        return exprMap.at(expr->var); // otherwise return the expression registered for expr->var
+                }
+        }
+}

src/AST/DeclReplacer.hpp

-              r3c64c668
+              r58fe85a
         class DeclWithType;
         class TypeDecl;
+        class Expr;
         namespace DeclReplacer {
                 using DeclMap = std::unordered_map< const DeclWithType *, const DeclWithType * >;
                 using TypeMap = std::unordered_map< const TypeDecl *, const TypeDecl * >;
+                using ExprMap = std::unordered_map< const DeclWithType *, const Expr * >;
                 const Node * replace( const Node * node, const DeclMap & declMap, bool debug = false );
                 const Node * replace( const Node * node, const TypeMap & typeMap, bool debug = false );
                 const Node * replace( const Node * node, const DeclMap & declMap, const TypeMap & typeMap, bool debug = false );
+                const Node * replace( const Node * node, const ExprMap & exprMap);
+        }
+}

src/AST/Expr.cpp

-              r3c64c668
+              r58fe85a
 #include <vector>
+#include "Copy.hpp"                // for shallowCopy
+#include "Eval.hpp"                // for call
 #include "GenericSubstitution.hpp"
+#include "LinkageSpec.hpp"
 #include "Stmt.hpp"
 #include "Type.hpp"
 …
 #include "Common/SemanticError.h"
 #include "GenPoly/Lvalue.h"        // for referencesPermissable
 #include "InitTweak/InitTweak.h"   // for getPointerBase
+#include "InitTweak/InitTweak.h"   // for getFunction, getPointerBase
 #include "ResolvExpr/typeops.h"    // for extractResultType
 #include "Tuples/Tuples.h"         // for makeTupleType
 namespace ast {
+namespace {
+        std::set<std::string> const lvalueFunctionNames = {"*?", "?[?]"};
+}
+// --- Expr
+bool Expr::get_lvalue() const { // Base implementation of the virtual lvalue query.
+        return false; // this base version always answers "not an lvalue"
+}
 // --- ApplicationExpr
 …
+}
+bool ApplicationExpr::get_lvalue() const { // An application is an lvalue only when it calls an intrinsic function named in lvalueFunctionNames.
+        if ( const DeclWithType * func = InitTweak::getFunction( this ) ) { // getFunction may return null, in which case the answer is false
+                return func->linkage == Linkage::Intrinsic && lvalueFunctionNames.count( func->name ); // both the linkage and the name ("*?" or "?[?]") must match
+        }
+        return false;
+}
 // --- UntypedExpr
 UntypedExpr * UntypedExpr::createDeref( const CodeLocation & loc, Expr * arg ) {
+UntypedExpr * UntypedExpr::createDeref( const CodeLocation & loc, const Expr * arg ) {
         assert( arg );
+        UntypedExpr * ret = new UntypedExpr{
+                loc, new NameExpr{loc, "*?"}, std::vector<ptr<Expr>>{ ptr<Expr>{ arg } }
+        };
+        UntypedExpr * ret = call( loc, "*?", arg );
         if ( const Type * ty = arg->result ) {
                 const Type * base = InitTweak::getPointerBase( ty );
 …
                         // base type
                         ret->result = base;
-                        add_qualifiers( ret->result, CV::Lvalue );
+                }
+        }
 …
+}
+UntypedExpr * UntypedExpr::createAssign( const CodeLocation & loc, Expr * lhs, Expr * rhs ) {
+bool UntypedExpr::get_lvalue() const { // An untyped call is an lvalue when its function name is in lvalueFunctionNames.
+        std::string fname = InitTweak::getFunctionName( this );
+        return lvalueFunctionNames.count( fname ); // name-only test; unlike ApplicationExpr::get_lvalue, no linkage check is made here
+}
+UntypedExpr * UntypedExpr::createAssign( const CodeLocation & loc, const Expr * lhs, const Expr * rhs ) {
         assert( lhs && rhs );
+        UntypedExpr * ret = new UntypedExpr{
+                loc, new NameExpr{loc, "?=?"}, std::vector<ptr<Expr>>{ ptr<Expr>{ lhs }, ptr<Expr>{ rhs } }
+        };
+        UntypedExpr * ret = call( loc, "?=?", lhs, rhs );
         if ( lhs->result && rhs->result ) {
                 // if both expressions are typed, assumes that this assignment is a C bitwise assignment,
 …
+        }
         return ret;
+}
+// --- VariableExpr
+VariableExpr::VariableExpr( const CodeLocation & loc ) // Constructs a VariableExpr with no declaration attached.
+: Expr( loc ), var( nullptr ) {} // var starts as nullptr
+VariableExpr::VariableExpr( const CodeLocation & loc, const DeclWithType * v ) // Constructs a VariableExpr referring to declaration v.
+: Expr( loc ), var( v ) {
+        assert( var ); // a declaration is mandatory for this overload
+        assert( var->get_type() ); // and that declaration must already have a type
+        result = shallowCopy( var->get_type() ); // result is a shallow copy of the declaration's type, not the declaration's own type node
+}
+bool VariableExpr::get_lvalue() const { // Lvalue query for variable expressions.
+        // It isn't always an lvalue, but it is never an rvalue.
+        return true; // unconditionally true
+}
+VariableExpr * VariableExpr::functionPointer( // Creates a VariableExpr for decl whose result type is a pointer to the usual result type.
+                const CodeLocation & loc, const FunctionDecl * decl ) {
+        // wrap usually-determined result type in a pointer
+        VariableExpr * funcExpr = new VariableExpr{ loc, decl }; // the constructor first sets result from decl's type
+        funcExpr->result = new PointerType{ funcExpr->result }; // then that result is replaced by a PointerType wrapping it
+        return funcExpr; // returns the newly allocated expression
+}
 …
 AddressExpr::AddressExpr( const CodeLocation & loc, const Expr * a ) : Expr( loc ), arg( a ) {
         if ( arg->result ) {
                 if ( arg->result->is_lvalue() ) {
+                if ( arg->get_lvalue() ) {
                         // lvalue, retains all levels of reference, and gains a pointer inside the references
                         Type * res = addrType( arg->result );
-                        res->set_lvalue( false ); // result of & is never an lvalue
                         result = res;
                 } else {
 …
                                         dynamic_cast< const ReferenceType * >( arg->result.get() ) ) {
                                 Type * res = addrType( refType->base );
-                                res->set_lvalue( false ); // result of & is never an lvalue
                                 result = res;
                         } else {
 …
 : Expr( loc, new VoidType{} ), arg( a ), isGenerated( g ) {}
+bool CastExpr::get_lvalue() const { // A cast reports the same lvalue-ness as the expression being cast.
+        // This is actually wrong by C, but it works with our current set-up.
+        return arg->get_lvalue(); // delegates to the cast's argument
+}
 // --- KeywordCastExpr
 const char * KeywordCastExpr::targetString() const {
         return AggregateDecl::aggrString( target );
+}
+// --- UntypedMemberExpr
+bool UntypedMemberExpr::get_lvalue() const { // An untyped member access reports the lvalue-ness of its aggregate.
+        return aggregate->get_lvalue(); // delegates to the aggregate expression
+}
 …
         assert( aggregate->result );
-        // take ownership of member type
         result = mem->get_type();
         // substitute aggregate generic parameters into member type
         genericSubstitution( aggregate->result ).apply( result );
+        // ensure lvalue and appropriate restrictions from aggregate type
+        add_qualifiers( result, aggregate->result->qualifiers | CV::Lvalue );
+}
+// --- VariableExpr
+VariableExpr::VariableExpr( const CodeLocation & loc )
+: Expr( loc ), var( nullptr ) {}
+VariableExpr::VariableExpr( const CodeLocation & loc, const DeclWithType * v )
+: Expr( loc ), var( v ) {
+        assert( var );
+        assert( var->get_type() );
+        result = var->get_type();
+        add_qualifiers( result, CV::Lvalue );
+}
+VariableExpr * VariableExpr::functionPointer(
+                const CodeLocation & loc, const FunctionDecl * decl ) {
+        // wrap usually-determined result type in a pointer
+        VariableExpr * funcExpr = new VariableExpr{ loc, decl };
+        funcExpr->result = new PointerType{ funcExpr->result };
+        return funcExpr;
+        // ensure appropriate restrictions from aggregate type
+        add_qualifiers( result, aggregate->result->qualifiers );
+}
+MemberExpr::MemberExpr( const CodeLocation & loc, const DeclWithType * mem, const Expr * agg, // Overload that stores member and aggregate; this body does not assign `result`.
+    MemberExpr::NoOpConstruction overloadSelector ) // tag parameter used to pick this overload
+: Expr( loc ), member( mem ), aggregate( agg ) {
+        assert( member );
+        assert( aggregate );
+        assert( aggregate->result ); // the aggregate must already have a result type
+        (void) overloadSelector; // the value is never read; the cast marks it as intentionally unused
+}
+bool MemberExpr::get_lvalue() const { // Lvalue query for member expressions.
+        // This is actually wrong by C, but it works with our current set-up.
+        return true; // unconditionally true; the aggregate is not consulted
+}
 …
 : Expr( loc, new BasicType{ BasicType::SignedInt } ), arg1( a1 ), arg2( a2 ), isAnd( ia ) {}
+// --- CommaExpr
+bool CommaExpr::get_lvalue() const { // A comma expression reports the lvalue-ness of its second operand.
+        // This is wrong by C, but the current implementation uses it.
+        // (ex: Specialize, Lvalue and Box)
+        return arg2->get_lvalue(); // delegates to arg2 only; arg1 is not consulted
+}
 // --- ConstructorExpr
 …
         assert( t && i );
         result = t;
+        add_qualifiers( result, CV::Lvalue );
+}
+bool CompoundLiteralExpr::get_lvalue() const { // Lvalue query for compound literals.
+        return true; // unconditionally true
+}
 …
         // like MemberExpr, TupleIndexExpr is always an lvalue
         result = type->types[ index ];
+        add_qualifiers( result, CV::Lvalue );
+}
+bool TupleIndexExpr::get_lvalue() const { // A tuple index reports the lvalue-ness of the tuple being indexed.
+        return tuple->get_lvalue(); // delegates to the tuple expression
+}

src/AST/Expr.hpp

-              r3c64c668
+              r58fe85a
 // Must be included in *all* AST classes; should be #undef'd at the end of the file
+#define MUTATE_FRIEND template<typename node_t> friend node_t * mutate(const node_t * node);
+#define MUTATE_FRIEND \
+    template<typename node_t> friend node_t * mutate(const node_t * node); \
+        template<typename node_t> friend node_t * shallowCopy(const node_t * node);
 class ConverterOldToNew;
 …
 struct ParamEntry {
         UniqueId decl;
         ptr<Decl> declptr;
+        readonly<Decl> declptr;
         ptr<Type> actualType;
         ptr<Type> formalType;
 …
 class Expr : public ParseNode {
 public:
+        /// Saves space (~16 bytes) by combining ResnSlots and InferredParams
+        /*
+         * NOTE: the union approach is incorrect until the case of
+         * partial resolution in InferMatcher is eliminated.
+         * it is reverted to allow unresolved and resolved parameters
+         * to coexist in an expression node.
+         */
         struct InferUnion {
+                // mode is now unused
                 enum { Empty, Slots, Params } mode;
+                union data_t {
+                        char def;
+                        ResnSlots resnSlots;
+                        InferredParams inferParams;
+                        data_t() : def('\0') {}
+                        ~data_t() {}
+                struct data_t {
+                        // char def;
+                        ResnSlots * resnSlots;
+                        InferredParams * inferParams;
+                        data_t(): resnSlots(nullptr), inferParams(nullptr) {}
+                        data_t(const data_t &other) = delete;
+                        ~data_t() {
+                                delete resnSlots;
+                                delete inferParams;
+                        }
                 } data;
                 /// initializes from other InferUnion
                 void init_from( const InferUnion& o ) {
+                        switch ( o.mode ) {
+                        case Empty:  return;
+                        case Slots:  new(&data.resnSlots) ResnSlots{ o.data.resnSlots }; return;
+                        case Params: new(&data.inferParams) InferredParams{ o.data.inferParams }; return;
+                        if (o.data.resnSlots) {
+                                data.resnSlots = new ResnSlots(*o.data.resnSlots);
+                        }
+                        if (o.data.inferParams) {
+                                data.inferParams = new InferredParams(*o.data.inferParams);
+                        }
+                }
 …
                 /// initializes from other InferUnion (move semantics)
                 void init_from( InferUnion&& o ) {
+                        switch ( o.mode ) {
+                        case Empty:  return;
+                        case Slots:  new(&data.resnSlots) ResnSlots{ std::move(o.data.resnSlots) }; return;
+                        case Params:
+                                new(&data.inferParams) InferredParams{ std::move(o.data.inferParams) }; return;
+                        }
+                }
+                /// clears variant fields
+                void reset() {
+                        switch( mode ) {
+                        case Empty:  return;
+                        case Slots:  data.resnSlots.~ResnSlots(); return;
+                        case Params: data.inferParams.~InferredParams(); return;
+                        }
+                        data.resnSlots = o.data.resnSlots;
+                        data.inferParams = o.data.inferParams;
+                        o.data.resnSlots = nullptr;
+                        o.data.inferParams = nullptr;
+                }
 …
                 InferUnion& operator= ( const InferUnion& ) = delete;
                 InferUnion& operator= ( InferUnion&& ) = delete;
+                ~InferUnion() { reset(); }
+                bool hasSlots() const { return data.resnSlots; }
+                bool hasParams() const { return data.inferParams; }
                 ResnSlots& resnSlots() {
+                        switch (mode) {
+                        case Empty: new(&data.resnSlots) ResnSlots{}; mode = Slots; // fallthrough
+                        case Slots: return data.resnSlots;
+                        case Params: assertf(false, "Cannot return to resnSlots from Params"); abort();
+                        if (!data.resnSlots) {
+                                data.resnSlots = new ResnSlots();
+                        }
                         assertf(false, "unreachable");
+                        return *data.resnSlots;
+                }
                 const ResnSlots& resnSlots() const {
                         if (mode == Slots) {
                                 return data.resnSlots;
+                        if (data.resnSlots) {
+                                return *data.resnSlots;
+                        }
                         assertf(false, "Mode was not already resnSlots");
 …
                 InferredParams& inferParams() {
+                        switch (mode) {
+                        case Slots: data.resnSlots.~ResnSlots(); // fallthrough
+                        case Empty: new(&data.inferParams) InferredParams{}; mode = Params; // fallthrough
+                        case Params: return data.inferParams;
+                        if (!data.inferParams) {
+                                data.inferParams = new InferredParams();
+                        }
                         assertf(false, "unreachable");
+                        return *data.inferParams;
+                }
                 const InferredParams& inferParams() const {
                         if (mode == Params) {
                                 return data.inferParams;
+                        if (data.inferParams) {
+                                return *data.inferParams;
+                        }
                         assertf(false, "Mode was not already Params");
 …
+                }
+                void set_inferParams( InferredParams && ps ) {
+                        switch(mode) {
+                        case Slots:
+                                data.resnSlots.~ResnSlots();
+                                // fallthrough
+                        case Empty:
+                                new(&data.inferParams) InferredParams{ std::move( ps ) };
+                                mode = Params;
+                                break;
+                        case Params:
+                                data.inferParams = std::move( ps );
+                                break;
+                        }
+                void set_inferParams( InferredParams * ps ) {
+                        delete data.resnSlots;
+                        data.resnSlots = nullptr;
+                        delete data.inferParams;
+                        data.inferParams = ps;
+                }
 …
                 /// and the other is in `Params`.
                 void splice( InferUnion && o ) {
+                        if ( o.mode == Empty ) return;
+                        if ( mode == Empty ) { init_from( o ); return; }
+                        assert( mode == o.mode && "attempt to splice incompatible InferUnion" );
+                        if ( mode == Slots ){
+                                data.resnSlots.insert(
+                                        data.resnSlots.end(), o.data.resnSlots.begin(), o.data.resnSlots.end() );
+                        } else if ( mode == Params ) {
+                                for ( const auto & p : o.data.inferParams ) {
+                                        data.inferParams[p.first] = std::move(p.second);
+                        if (o.data.resnSlots) {
+                                if (data.resnSlots) {
+                                        data.resnSlots->insert(
+                                                data.resnSlots->end(), o.data.resnSlots->begin(), o.data.resnSlots->end() );
+                                        delete o.data.resnSlots;
+                                }
+                        } else assertf(false, "invalid mode");
+                                else {
+                                        data.resnSlots = o.data.resnSlots;
+                                }
+                                o.data.resnSlots = nullptr;
+                        }
+                        if (o.data.inferParams) {
+                                if (data.inferParams) {
+                                        for ( const auto & p : *o.data.inferParams ) {
+                                                (*data.inferParams)[p.first] = std::move(p.second);
+                                        }
+                                        delete o.data.inferParams;
+                                }
+                                else {
+                                        data.inferParams = o.data.inferParams;
+                                }
+                                o.data.inferParams = nullptr;
+                        }
+                }
         };
 …
         Expr * set_extension( bool ex ) { extension = ex; return this; }
+        virtual bool get_lvalue() const;
         virtual const Expr * accept( Visitor & v ) const override = 0;
 …
         ApplicationExpr( const CodeLocation & loc, const Expr * f, std::vector<ptr<Expr>> && as = {} );
+        bool get_lvalue() const final;
         const Expr * accept( Visitor & v ) const override { return v.visit( this ); }
 private:
 …
         : Expr( loc ), func( f ), args( std::move(as) ) {}
+        bool get_lvalue() const final;
         /// Creates a new dereference expression
         static UntypedExpr * createDeref( const CodeLocation & loc, Expr * arg );
+        static UntypedExpr * createDeref( const CodeLocation & loc, const Expr * arg );
         /// Creates a new assignment expression
         static UntypedExpr * createAssign( const CodeLocation & loc, Expr * lhs, Expr * rhs );
+        static UntypedExpr * createAssign( const CodeLocation & loc, const Expr * lhs, const Expr * rhs );
         const Expr * accept( Visitor & v ) const override { return v.visit( this ); }
 …
 };
+/// A reference to a named variable.
+class VariableExpr final : public Expr {
+public:
+        readonly<DeclWithType> var; // NOTE(review): presumably the declaration this expression names, set from the constructor's `v` -- confirm in the implementation
+        VariableExpr( const CodeLocation & loc );
+        VariableExpr( const CodeLocation & loc, const DeclWithType * v );
+        bool get_lvalue() const final; // overrides a base-class virtual; defined out of line
+        /// Generates a function pointer for a given function.
+        static VariableExpr * functionPointer( const CodeLocation & loc, const FunctionDecl * decl );
+        const Expr * accept( Visitor & v ) const override { return v.visit( this ); } // double dispatch to the visitor's overload for this node type
+private:
+        VariableExpr * clone() const override { return new VariableExpr{ *this }; } // heap-allocated copy made with the copy constructor
+        MUTATE_FRIEND // NOTE(review): expected to befriend the ast mutate helpers so they can reach the private clone() -- confirm against the macro in this header
+};
 /// Address-of expression `&e`
 class AddressExpr final : public Expr {
 …
 };
+/// Whether a cast existed in the program source or not
+/// Indicates whether the cast is introduced by the CFA type system.
+/// GeneratedCast for casts that the resolver introduces to force a return type
+/// ExplicitCast for casts from user code
+/// ExplicitCast for casts from desugaring advanced CFA features into simpler CFA
+/// example
+///   int * p;     // declaration
+///   (float *) p; // use, with subject cast
+/// subject cast being GeneratedCast means we are considering an interpretation with a type mismatch
+/// subject cast being ExplicitCast means someone in charge wants it that way
 enum GeneratedFlag { ExplicitCast, GeneratedCast };
 …
         CastExpr( const Expr * a ) : CastExpr( a->location, a, GeneratedCast ) {}
+        bool get_lvalue() const final;
         const Expr * accept( Visitor & v ) const override { return v.visit( this ); }
 private:
 …
 public:
         ptr<Expr> arg;
+        struct Concrete { // string-valued target details; an instance is stored per cast in concrete_target
+                std::string field; // NOTE(review): presumably the name of a field on the target type -- confirm
+                std::string getter; // NOTE(review): presumably the name of an accessor function -- confirm
+                Concrete() = default;
+                Concrete(const Concrete &) = default;
+        };
         ast::AggregateDecl::Aggregate target;
+        Concrete concrete_target;
         KeywordCastExpr( const CodeLocation & loc, const Expr * a, ast::AggregateDecl::Aggregate t )
         : Expr( loc ), arg( a ), target( t ) {}
+        KeywordCastExpr( const CodeLocation & loc, const Expr * a, ast::AggregateDecl::Aggregate t, const Concrete & ct )
+        : Expr( loc ), arg( a ), target( t ), concrete_target( ct ) {}
         /// Get a name for the target type
         const char * targetString() const;
 …
         : Expr( loc ), member( mem ), aggregate( agg ) { assert( aggregate ); }
+        bool get_lvalue() const final;
         const Expr * accept( Visitor & v ) const override { return v.visit( this ); }
 private:
 …
         MemberExpr( const CodeLocation & loc, const DeclWithType * mem, const Expr * agg );
+        bool get_lvalue() const final;
         const Expr * accept( Visitor & v ) const override { return v.visit( this ); }
 private:
         MemberExpr * clone() const override { return new MemberExpr{ *this }; }
         MUTATE_FRIEND
+};
+/// A reference to a named variable.
+class VariableExpr final : public Expr {
+public:
+        readonly<DeclWithType> var;
+        VariableExpr( const CodeLocation & loc );
+        VariableExpr( const CodeLocation & loc, const DeclWithType * v );
+        /// generates a function pointer for a given function
+        static VariableExpr * functionPointer( const CodeLocation & loc, const FunctionDecl * decl );
+        const Expr * accept( Visitor & v ) const override { return v.visit( this ); }
+private:
+        VariableExpr * clone() const override { return new VariableExpr{ *this }; }
+        MUTATE_FRIEND
+        // Constructor overload meant only for AST conversion
+        enum NoOpConstruction { NoOpConstructionChosen };
+        MemberExpr( const CodeLocation & loc, const DeclWithType * mem, const Expr * agg,
+            NoOpConstruction overloadSelector );
+        friend class ::ConverterOldToNew;
+        friend class ::ConverterNewToOld;
 };
 …
                 const CodeLocation & loc, const Type * ty, const std::string & r,
                         std::optional<unsigned long long> i )
         : Expr( loc, ty ), rep( r ), ival( i ) {}
+        : Expr( loc, ty ), rep( r ), ival( i ), underlyer(ty) {}
         /// Gets the integer value of this constant, if one is appropriate to its type.
 …
         CommaExpr( const CodeLocation & loc, const Expr * a1, const Expr * a2 )
+        : Expr( loc ), arg1( a1 ), arg2( a2 ) {}
+        : Expr( loc ), arg1( a1 ), arg2( a2 ) {
+                this->result = a2->result;
+        }
+        bool get_lvalue() const final;
         const Expr * accept( Visitor & v ) const override { return v.visit( this ); }
 …
         ImplicitCopyCtorExpr( const CodeLocation& loc, const ApplicationExpr * call )
         : Expr( loc, call->result ) { assert( call ); }
+        : Expr( loc, call->result ), callExpr(call) { assert( call ); assert(call->result); }
         const Expr * accept( Visitor & v ) const override { return v.visit( this ); }
 …
         CompoundLiteralExpr( const CodeLocation & loc, const Type * t, const Init * i );
+        bool get_lvalue() const final;
         const Expr * accept( Visitor & v ) const override { return v.visit( this ); }
 private:
 …
         TupleIndexExpr( const CodeLocation & loc, const Expr * t, unsigned i );
+        bool get_lvalue() const final;
         const Expr * accept( Visitor & v ) const override { return v.visit( this ); }
 …
         std::vector<ptr<Expr>> dtors;              ///< destructor(s) for return variable(s)
+        readonly<ExprStmt> resultExpr;
         StmtExpr( const CodeLocation & loc, const CompoundStmt * ss );

src/AST/Fwd.hpp

-              r3c64c668
+              r58fe85a
 // Created On       : Wed May  8 16:05:00 2019
 // Last Modified By : Andrew Beach
 // Last Modified On : Mon Jun 24 09:48:00 2019
 // Update Count     : 1
+// Last Modified On : Thr Jul 23 14:15:00 2020
+// Update Count     : 2
 //
 …
 class CatchStmt;
 class FinallyStmt;
+class SuspendStmt;
 class WaitForStmt;
 class WithStmt;
 …
 class QualifiedType;
 class FunctionType;
+class ReferenceToType;
+class StructInstType;
+class UnionInstType;
+class EnumInstType;
+class BaseInstType;
+template<typename decl_t> class SueInstType;
+using StructInstType = SueInstType<StructDecl>;
+using UnionInstType = SueInstType<UnionDecl>;
+using EnumInstType = SueInstType<EnumDecl>;
 class TraitInstType;
 class TypeInstType;
 …
 typedef unsigned int UniqueId;
+struct TranslationUnit;
+// TODO: Get from the TranslationUnit:
+extern ptr<Type> sizeType;
+extern const FunctionDecl * dereferenceOperator;
+extern const StructDecl   * dtorStruct;
+extern const FunctionDecl * dtorStructDestroy;
+}

src/AST/GenericSubstitution.cpp

-              r3c64c668
+              r58fe85a
         private:
                 // make substitution for generic type
                 void makeSub( const ReferenceToType * ty ) {
+                void makeSub( const BaseInstType * ty ) {
                         visit_children = false;
                         const AggregateDecl * aggr = ty->aggr();
 …
         Pass<GenericSubstitutionBuilder> builder;
         maybe_accept( ty, builder );
         return std::move(builder.pass.sub);
+        return std::move(builder.core.sub);
+}

src/AST/Init.hpp

-              r3c64c668
+              r58fe85a
 // Must be included in *all* AST classes; should be #undef'd at the end of the file
+#define MUTATE_FRIEND template<typename node_t> friend node_t * mutate(const node_t * node);
+#define MUTATE_FRIEND \
+    template<typename node_t> friend node_t * mutate(const node_t * node); \
+        template<typename node_t> friend node_t * shallowCopy(const node_t * node);
 namespace ast {
 …
 /// Flag for whether to construct from initializer
 enum ConstructFlag { DoConstruct, MaybeConstruct };
+enum ConstructFlag { NoConstruct, MaybeConstruct };
 /// Object initializer base class
 …
         ptr<Expr> value;
         SingleInit( const CodeLocation & loc, const Expr * val, ConstructFlag mc = DoConstruct )
+        SingleInit( const CodeLocation & loc, const Expr * val, ConstructFlag mc = NoConstruct )
         : Init( loc, mc ), value( val ) {}
 …
         ListInit( const CodeLocation & loc, std::vector<ptr<Init>> && is,
                 std::vector<ptr<Designation>> && ds = {}, ConstructFlag mc = DoConstruct );
+                std::vector<ptr<Designation>> && ds = {}, ConstructFlag mc = NoConstruct );
         using iterator = std::vector<ptr<Init>>::iterator;
 …
         ConstructorInit(
                 const CodeLocation & loc, const Stmt * ctor, const Stmt * dtor, const Init * init )
         : Init( loc, DoConstruct ), ctor( ctor ), dtor( dtor ), init( init ) {}
+        : Init( loc, MaybeConstruct ), ctor( ctor ), dtor( dtor ), init( init ) {}
         const Init * accept( Visitor & v ) const override { return v.visit( this ); }

src/AST/Node.cpp

-              r3c64c668
+              r58fe85a
 // Author           : Thierry Delisle
 // Created On       : Thu May 16 14:16:00 2019
 // Last Modified By :
 // Last Modified On :
 // Update Count     :
+// Last Modified By : Andrew Beach
+// Last Modified On : Fri Jun  5 10:21:00 2020
+// Update Count     : 1
 //
 …
 #include "Fwd.hpp"
+#include <csignal>  // MEMORY DEBUG -- for raise
 #include <iostream>
 …
 #include "Print.hpp"
+template< typename node_t, enum ast::Node::ref_type ref_t >
+void ast::ptr_base<node_t, ref_t>::_inc( const node_t * node ) { node->increment(ref_t); }
+template< typename node_t, enum ast::Node::ref_type ref_t >
+void ast::ptr_base<node_t, ref_t>::_dec( const node_t * node ) { node->decrement(ref_t); }
+template< typename node_t, enum ast::Node::ref_type ref_t >
+void ast::ptr_base<node_t, ref_t>::_check() const { if(node) assert(node->was_ever_strong == false || node->strong_count > 0); }
+/// MEMORY DEBUG -- allows breaking on ref-count changes of dynamically chosen object.
+/// Process to use in GDB:
+///   break ast::Node::_trap()
+///   run
+///   set variable MEM_TRAP_OBJ = <target>
+///   disable <first breakpoint>
+///   continue
+void * MEM_TRAP_OBJ = nullptr;
+/// MEMORY DEBUG -- raises SIGTRAP when `node` is the object currently stored in MEM_TRAP_OBJ.
+void _trap( const void * node ) {
+        // Every other object passes through untouched.
+        if ( MEM_TRAP_OBJ != node ) return;
+        std::raise(SIGTRAP);
+}
+[[noreturn]] static inline void strict_fail(const ast::Node * node) { // failure reporter used by ptr_base::_strict_fail
+        assertf(node, "strict_as had nullptr input."); // a null input gets its own message
+        const ast::ParseNode * parse = dynamic_cast<const ast::ParseNode *>( node ); // a location is only looked up on ParseNode instances
+        if ( nullptr == parse ) {
+                assertf(nullptr, "%s (no location)", toString(node).c_str()); // nullptr as the condition: this assertion always fails
+        } else if ( parse->location.isUnset() ) {
+                assertf(nullptr, "%s (unset location)", toString(node).c_str());
+        } else {
+                assertf(nullptr, "%s (at %s:%d)", toString(node).c_str(),
+                        parse->location.filename.c_str(), parse->location.first_line);
+        }
+} // NOTE(review): [[noreturn]] relies on a failing assertf never returning -- confirm
+template< typename node_t, enum ast::Node::ref_type ref_t >
+void ast::ptr_base<node_t, ref_t>::_strict_fail() const { // called by strict_as when the cast yields nothing
+        strict_fail(node); // forwards the held pointer to the file-local reporter
+}
+template< typename node_t, enum ast::Node::ref_type ref_t >
+void ast::ptr_base<node_t, ref_t>::_inc( const node_t * node ) { // adds one reference of kind ref_t to `node`
+        node->increment(ref_t);
+        _trap( node ); // MEMORY DEBUG hook: raises SIGTRAP if `node` is MEM_TRAP_OBJ
+}
+template< typename node_t, enum ast::Node::ref_type ref_t >
+void ast::ptr_base<node_t, ref_t>::_dec( const node_t * node, bool do_delete ) { // drops one reference of kind ref_t from `node`
+        _trap( node ); // MEMORY DEBUG hook, run before the decrement because that call may delete the node
+        node->decrement( ref_t, do_delete ); // do_delete == false lets the counts reach zero without deleting
+}
+template< typename node_t, enum ast::Node::ref_type ref_t >
+void ast::ptr_base<node_t, ref_t>::_check() const { // currently a no-op
+        // Disabled consistency check: if(node) assert(node->was_ever_strong == false || node->strong_count > 0);
+}
 template< typename node_t, enum ast::Node::ref_type ref_t >
 …
 template class ast::ptr_base< ast::FunctionType, ast::Node::ref_type::weak >;
 template class ast::ptr_base< ast::FunctionType, ast::Node::ref_type::strong >;
 template class ast::ptr_base< ast::ReferenceToType, ast::Node::ref_type::weak >;
 template class ast::ptr_base< ast::ReferenceToType, ast::Node::ref_type::strong >;
+template class ast::ptr_base< ast::BaseInstType, ast::Node::ref_type::weak >;
+template class ast::ptr_base< ast::BaseInstType, ast::Node::ref_type::strong >;
 template class ast::ptr_base< ast::StructInstType, ast::Node::ref_type::weak >;
 template class ast::ptr_base< ast::StructInstType, ast::Node::ref_type::strong >;

src/AST/Node.hpp

-              r3c64c668
+              r58fe85a
 // Created On       : Wed May 8 10:27:04 2019
 // Last Modified By : Andrew Beach
 // Last Modified On : Mon Jun  3 13:26:00 2019
 // Update Count     : 5
+// Last Modified On : Fri Jun 5 9:47:00 2020
+// Update Count     : 6
 //
 …
         Node& operator= (const Node&) = delete;
         Node& operator= (Node&&) = delete;
         virtual ~Node() = default;
+        virtual ~Node() {}
         virtual const Node * accept( Visitor & v ) const = 0;
 …
         bool unique() const { return strong_count == 1; }
+        bool isManaged() const {return strong_count > 0; }
 private:
 …
         template<typename node_t>
         friend node_t * mutate(const node_t * node);
+        template<typename node_t>
+        friend node_t * shallowCopy(const node_t * node);
         mutable size_t strong_count = 0;
 …
+        }
         void decrement(ast::Node::ref_type ref) const {
+        void decrement(ast::Node::ref_type ref, bool do_delete = true) const {
                 switch (ref) {
                         case ref_type::strong: strong_count--; break;
 …
+                }
                 if(!strong_count && !weak_count) {
+                if( do_delete && !strong_count && !weak_count) {
                         delete this;
+                }
 …
         assertf(
                 node->weak_count == 0,
                 "Error: mutating node with weak references to it will invalided some references"
+                "Error: mutating node with weak references to it will invalidate some references"
         );
         return node->clone();
 …
         // skip mutate if equivalent
         if ( node->*field == val ) return node;
         // mutate and return
         node_t * ret = mutate( node );
 …
         (ret->*field)[i] = std::forward< field_t >( val );
         return ret;
+}
+/// Runs visitor `v` over every element of the indexed collection `field`, storing each accepted
+/// value back through mutate_field_index. Returns the node produced by the last replacement.
+template<typename node_t, typename parent_t, typename coll_t>
+const node_t * mutate_each( const node_t * node, coll_t parent_t::* field, Visitor & v ) {
+        unsigned index = 0;
+        // The size is re-read on every pass because `node` may be replaced by the call below.
+        while ( index < (node->*field).size() ) {
+                node = mutate_field_index(
+                        node, field, index, (node->*field)[index]->accept( v ) );
+                ++index;
+        }
+        return node;
+}
 …
         const node_t & operator* () const { _check(); return *node; }
         explicit operator bool() const { _check(); return node; }
+        operator const node_t * () const { _check(); return node; }
+        operator const node_t * () const & { _check(); return node; }
+        operator const node_t * () && = delete;
+        const node_t * release() {
+                const node_t * ret = node;
+                if ( node ) {
+                        _dec(node, false);
+                        node = nullptr;
+                }
+                return ret;
+        }
         /// wrapper for convenient access to dynamic_cast
 …
         const o_node_t * as() const { _check(); return dynamic_cast<const o_node_t *>(node); }
         /// wrapper for convenient access to strict_dynamic_cast
+        /// Wrapper that makes sure dynamic_cast returns non-null.
         template<typename o_node_t>
+        const o_node_t * strict_as() const { _check(); return strict_dynamic_cast<const o_node_t *>(node); }
+        const o_node_t * strict_as() const {
+                if (const o_node_t * ret = as<o_node_t>()) return ret;
+                _strict_fail();
+        }
+        /// Null-tolerant strict_as<o_node_t, nullptr>(): returns nullptr for an empty pointer, otherwise defers to strict_as<o_node_t>().
+        template<typename o_node_t, decltype(nullptr) null>
+        const o_node_t * strict_as() const { return node ? strict_as<o_node_t>() : nullptr; }
         /// Returns a mutable version of the pointer in this node.
 …
         void _inc( const node_t * other );
         void _dec( const node_t * other );
+        void _dec( const node_t * other, bool do_delete = true );
         void _check() const;
+        void _strict_fail() const __attribute__((noreturn));
         const node_t * node;

src/AST/Pass.hpp

-              r3c64c668
+              r58fe85a
 //
 // Author           : Thierry Delisle
 // Created On       : Thu May 09 15::37::05 2019
+// Created On       : Thu May 09 15:37:05 2019
 // Last Modified By :
 // Last Modified On :
 …
 //
 // Several additional features are available through inheritance
+// | WithTypeSubstitution - provides polymorphic const TypeSubstitution * env for the
+//                          current expression
+// | WithStmtsToAdd       - provides the ability to insert statements before or after the current
+//                          statement by adding new statements into stmtsToAddBefore or
+//                          stmtsToAddAfter respectively.
+// | WithDeclsToAdd       - provides the ability to insert declarations before or after the current
+//                          declarations by adding new DeclStmt into declsToAddBefore or
+//                          declsToAddAfter respectively.
+// | WithShortCircuiting  - provides the ability to skip visiting child nodes; set visit_children
+//                          to false in pre{visit,visit} to skip visiting children
+// | WithGuards           - provides the ability to save/restore data like a LIFO stack; to save,
+//                          call GuardValue with the variable to save, the variable will
+//                          automatically be restored to its previous value after the corresponding
+//                          postvisit/postmutate teminates.
+// | WithVisitorRef       - provides an pointer to the templated visitor wrapper
+// | WithSymbolTable      - provides symbol table functionality
+// | PureVisitor           - makes the visitor pure, it never modifies nodes in place and always
+//                           clones nodes it needs to make changes to
+// | WithConstTypeSubstitution - provides polymorphic const TypeSubstitution * typeSubs for the
+//                           current expression
+// | WithStmtsToAdd        - provides the ability to insert statements before or after the current
+//                           statement by adding new statements into stmtsToAddBefore or
+//                           stmtsToAddAfter respectively.
+// | WithDeclsToAdd        - provides the ability to insert declarations before or after the
+//                           current declarations by adding new DeclStmt into declsToAddBefore or
+//                           declsToAddAfter respectively.
+// | WithShortCircuiting   - provides the ability to skip visiting child nodes; set visit_children
+//                           to false in previsit to skip visiting children
+// | WithGuards            - provides the ability to save/restore data like a LIFO stack; to save,
+//                           call GuardValue with the variable to save, the variable will
+//                           automatically be restored to its previous value after the
+//                           corresponding postvisit/postmutate terminates.
+// | WithVisitorRef        - provides a pointer to the templated visitor wrapper
+// | WithSymbolTable       - provides symbol table functionality
+//
+// Other Special Members:
+// | result                - Either a method that takes no parameters or a field. If a method (or
+//                           callable field) get_result calls it, otherwise the value is returned.
 //-------------------------------------------------------------------------------------------------
 template< typename pass_t >
+template< typename core_t >
 class Pass final : public ast::Visitor {
 public:
+        using core_type = core_t;
+        using type = Pass<core_t>;
         /// Forward any arguments to the pass constructor
         /// Propagate 'this' if necessary
         template< typename... Args >
         Pass( Args &&... args)
                 : pass( std::forward<Args>( args )... )
+                : core( std::forward<Args>( args )... )
+        {
                 // After the pass is constructed, check if it wants to have a pointer to the wrapping visitor
+                typedef Pass<pass_t> this_t;
+                this_t * const * visitor = __pass::visitor(pass, 0);
+                type * const * visitor = __pass::visitor(core, 0);
                 if(visitor) {
                         *const_cast<this_t **>( visitor ) = this;
+                        *const_cast<type **>( visitor ) = this;
+                }
+        }
 …
         virtual ~Pass() = default;
+        /// Storage for the actual pass
+        pass_t pass;
+        /// Storage for the actual pass.
+        core_t core;
+        /// If the core defines a result, call it if possible, otherwise return it.
+        inline auto get_result() -> decltype( __pass::get_result( core, '0' ) ) { // return type is whatever __pass::get_result yields for this core
+                return __pass::get_result( core, '0' ); // NOTE(review): '0' looks like an overload-selection tag -- confirm in the __pass helpers
+        }
+        /// Construct and run a pass on a translation unit.
+        template< typename... Args >
+        static void run( TranslationUnit & decls, Args &&... args ) {
+                Pass<core_t> visitor( std::forward<Args>( args )... ); // args are forwarded to the Pass constructor; the visitor lives only for this call
+                accept_all( decls, visitor );
+        }
+        /// Construct and run a pass on a pointer to extract a value.
+        template< typename node_type, typename... Args >
+        static auto read( node_type const * node, Args&&... args ) {
+                Pass<core_t> visitor( std::forward<Args>( args )... ); // args are forwarded to the Pass constructor
+                node_type const * temp = node->accept( visitor );
+                assert( temp == node ); // the pass is expected to leave the node itself unchanged
+                return visitor.get_result(); // the extracted value comes from the core's result
+        }
+        // Versions of the above for older compilers: same behaviour, with the core default-constructed.
+        template< typename... Args >
+        static void run( TranslationUnit & decls ) {
+                Pass<core_t> visitor; // no constructor arguments in this overload
+                accept_all( decls, visitor );
+        }
+        template< typename node_type, typename... Args >
+        static auto read( node_type const * node ) { // no-argument counterpart of read( node, args... )
+                Pass<core_t> visitor; // no constructor arguments in this overload
+                node_type const * temp = node->accept( visitor );
+                assert( temp == node ); // the pass is expected to leave the node itself unchanged
+                return visitor.get_result();
+        }
         /// Visit function declarations
 …
         const ast::Stmt *             visit( const ast::CatchStmt            * ) override final;
         const ast::Stmt *             visit( const ast::FinallyStmt          * ) override final;
+        const ast::Stmt *             visit( const ast::SuspendStmt          * ) override final;
         const ast::Stmt *             visit( const ast::WaitForStmt          * ) override final;
         const ast::Decl *             visit( const ast::WithStmt             * ) override final;
 …
         const ast::TypeSubstitution * visit( const ast::TypeSubstitution     * ) override final;
+        template<typename pass_type>
+        friend void accept_all( std::list< ptr<Decl> > & decls, Pass<pass_type>& visitor );
+        template<typename core_type>
+        friend void accept_all( std::list< ptr<Decl> > & decls, Pass<core_type>& visitor );
+        bool isInFunction() const { // public read access to the private inFunction flag
+                return inFunction;
+        }
 private:
         bool __visit_children() { __pass::bool_ref * ptr = __pass::visit_children(pass, 0); return ptr ? *ptr : true; }
+        bool __visit_children() { __pass::bool_ref * ptr = __pass::visit_children(core, 0); return ptr ? *ptr : true; }
 private:
         const ast::Stmt * call_accept( const ast::Stmt * );
         const ast::Expr * call_accept( const ast::Expr * );
+        // requests WithStmtsToAdd directly add to this statement, as if it is a compound.
+        const ast::Stmt * call_accept_as_compound(const ast::Stmt *);
         template< typename node_t >
 …
         void maybe_accept(const node_t * &, child_t parent_t::* child);
+        template<typename node_t, typename parent_t, typename child_t>
+        void maybe_accept_as_compound(const node_t * &, child_t parent_t::* child);
 private:
         /// Internal RAII guard for symbol table features
         struct guard_symtab {
                 guard_symtab( Pass<pass_t> & pass ): pass( pass ) { __pass::symtab::enter(pass, 0); }
                 ~guard_symtab()                                   { __pass::symtab::leave(pass, 0); }
                 Pass<pass_t> & pass;
+                guard_symtab( Pass<core_t> & pass ): pass( pass ) { __pass::symtab::enter(pass.core, 0); }
+                ~guard_symtab()                                   { __pass::symtab::leave(pass.core, 0); }
+                Pass<core_t> & pass;
         };
         /// Internal RAII guard for scope features
         struct guard_scope {
+                guard_scope( Pass<pass_t> & pass ): pass( pass ) { __pass::scope::enter(pass, 0); }
+                ~guard_scope()                                   { __pass::scope::leave(pass, 0); }
+                Pass<pass_t> & pass;
+                guard_scope( Pass<core_t> & pass ): pass( pass ) { __pass::scope::enter(pass.core, 0); }
+                ~guard_scope()                                   { __pass::scope::leave(pass.core, 0); }
+                Pass<core_t> & pass;
+        };
+        /// Internal RAII guard for forall substitutions: calls __pass::forall::enter on construction and the matching leave on destruction
+        struct guard_forall_subs {
+                guard_forall_subs( Pass<core_t> & pass, const FunctionType * type )
+                : pass( pass ), type( type ) { __pass::forall::enter(pass.core, 0, type ); }
+                ~guard_forall_subs()         { __pass::forall::leave(pass.core, 0, type ); }
+                Pass<core_t> & pass; // pass whose core receives the enter/leave calls
+                const FunctionType * type; // function type handed to both enter and leave
+        };
 private:
         bool inFunction = false;
+        bool atFunctionTop = false;
 };
 /// Apply a pass to an entire translation unit
+template<typename pass_t>
+void accept_all( std::list< ast::ptr<ast::Decl> > &, ast::Pass<pass_t> & visitor );
+template<typename core_t>
+void accept_all( std::list< ast::ptr<ast::Decl> > &, ast::Pass<core_t> & visitor );
+template<typename core_t>
+void accept_all( ast::TranslationUnit &, ast::Pass<core_t> & visitor );
 //-------------------------------------------------------------------------------------------------
 …
 //-------------------------------------------------------------------------------------------------
+/// Keep track of the polymorphic const TypeSubstitution * env for the current expression
+/// If used the visitor will always clone nodes.
+struct PureVisitor {};
+/// Keep track of the polymorphic const TypeSubstitution * typeSubs for the current expression.
 struct WithConstTypeSubstitution {
         const TypeSubstitution * env = nullptr;
+        const TypeSubstitution * typeSubs = nullptr;
 };
 …
         };
         template< typename pass_t>
         friend auto __pass::at_cleanup( pass_t & pass, int ) -> decltype( &pass.at_cleanup );
+        template< typename core_t>
+        friend auto __pass::at_cleanup( core_t & core, int ) -> decltype( &core.at_cleanup );
 public:
 …
 /// Used to get a pointer to the pass with its wrapped type
 template<typename pass_t>
+template<typename core_t>
 struct WithVisitorRef {
+        Pass<pass_t> * const visitor = nullptr;
+        Pass<core_t> * const visitor = nullptr;
+        bool isInFunction() const {
+                return visitor->isInFunction();
+        }
 };
 …
         SymbolTable symtab;
 };
+}
 …
 extern struct PassVisitorStats {
         size_t depth = 0;
         Stats::Counters::MaxCounter<double> * max = nullptr;
         Stats::Counters::AverageCounter<double> * avg = nullptr;
+        Stats::Counters::MaxCounter<double> * max;
+        Stats::Counters::AverageCounter<double> * avg;
 } pass_visitor_stats;
+}

src/AST/Pass.impl.hpp

-              r3c64c668
+              r58fe85a
 #include <unordered_map>
+#include "AST/TranslationUnit.hpp"
 #include "AST/TypeSubstitution.hpp"
 …
         using namespace ast; \
         /* back-up the visit children */ \
         __attribute__((unused)) ast::__pass::visit_children_guard guard1( ast::__pass::visit_children(pass, 0) ); \
+        __attribute__((unused)) ast::__pass::visit_children_guard guard1( ast::__pass::visit_children(core, 0) ); \
         /* setup the scope for passes that want to run code at exit */ \
+        __attribute__((unused)) ast::__pass::guard_value          guard2( ast::__pass::at_cleanup    (pass, 0) ); \
+        __attribute__((unused)) ast::__pass::guard_value          guard2( ast::__pass::at_cleanup    (core, 0) ); \
+        /* begin tracing memory allocation if requested by this pass */ \
+        __pass::beginTrace( core, 0 ); \
         /* call the implementation of the previsit of this pass */ \
         __pass::previsit( pass, node, 0 );
+        __pass::previsit( core, node, 0 );
 #define VISIT( code... ) \
 …
 #define VISIT_END( type, node ) \
         /* call the implementation of the postvisit of this pass */ \
         auto __return = __pass::postvisit( pass, node, 0 ); \
+        auto __return = __pass::postvisit( core, node, 0 ); \
         assertf(__return, "post visit should never return null"); \
+        /* end tracing memory allocation if requested by this pass */ \
+        __pass::endTrace( core, 0 ); \
         return __return;
 …
 namespace ast {
+        template<typename node_t>
+        node_t * shallowCopy( const node_t * node );
         namespace __pass {
                 // Check if this is either a null pointer or a pointer to an empty container
 …
+                }
+                template< typename core_t, typename node_t >
+                static inline node_t* mutate(const node_t *node) {
+                        return std::is_base_of<PureVisitor, core_t>::value ? ::ast::shallowCopy(node) : ::ast::mutate(node);
+                }
                 //------------------------------
                 template<typename it_t, template <class...> class container_t>
 …
+        }
         template< typename pass_t >
+        template< typename core_t >
         template< typename node_t >
         auto ast::Pass< pass_t >::call_accept( const node_t * node )
+        auto ast::Pass< core_t >::call_accept( const node_t * node )
                 -> typename std::enable_if<
                                 !std::is_base_of<ast::Expr, node_t>::value &&
 …
                         , decltype( node->accept(*this) )
                 >::type
+        {
                 __pedantic_pass_assert( __visit_children() );
                 __pedantic_pass_assert( expr );
+                __pedantic_pass_assert( node );
                 static_assert( !std::is_base_of<ast::Expr, node_t>::value, "ERROR");
 …
+        }
         template< typename pass_t >
         const ast::Expr * ast::Pass< pass_t >::call_accept( const ast::Expr * expr ) {
+        template< typename core_t >
+        const ast::Expr * ast::Pass< core_t >::call_accept( const ast::Expr * expr ) {
                 __pedantic_pass_assert( __visit_children() );
                 __pedantic_pass_assert( expr );
                 const ast::TypeSubstitution ** env_ptr = __pass::env( pass, 0);
                 if ( env_ptr && expr->env ) {
                         *env_ptr = expr->env;
+                const ast::TypeSubstitution ** typeSubs_ptr = __pass::typeSubs( core, 0 );
+                if ( typeSubs_ptr && expr->env ) {
+                        *typeSubs_ptr = expr->env;
+                }
 …
+        }
         template< typename pass_t >
         const ast::Stmt * ast::Pass< pass_t >::call_accept( const ast::Stmt * stmt ) {
+        template< typename core_t >
+        const ast::Stmt * ast::Pass< core_t >::call_accept( const ast::Stmt * stmt ) {
                 __pedantic_pass_assert( __visit_children() );
                 __pedantic_pass_assert( stmt );
+                return stmt->accept( *this );
+        }
+        template< typename core_t >
+        const ast::Stmt * ast::Pass< core_t >::call_accept_as_compound( const ast::Stmt * stmt ) {
+                __pedantic_pass_assert( __visit_children() );
+                __pedantic_pass_assert( stmt );
                 // add a few useful symbols to the scope
                 using __pass::empty;
                 // get the stmts/decls that will need to be spliced in
                 auto stmts_before = __pass::stmtsToAddBefore( pass, 0);
                 auto stmts_after  = __pass::stmtsToAddAfter ( pass, 0);
                 auto decls_before = __pass::declsToAddBefore( pass, 0);
                 auto decls_after  = __pass::declsToAddAfter ( pass, 0);
+                auto stmts_before = __pass::stmtsToAddBefore( core, 0);
+                auto stmts_after  = __pass::stmtsToAddAfter ( core, 0);
+                auto decls_before = __pass::declsToAddBefore( core, 0);
+                auto decls_after  = __pass::declsToAddAfter ( core, 0);
                 // These may be modified by subnodes but must be restored once we exit this statement.
                 ValueGuardPtr< const ast::TypeSubstitution * > __old_env         ( __pass::env( pass, 0) );
+                ValueGuardPtr< const ast::TypeSubstitution * > __old_env         ( __pass::typeSubs( core, 0 ) );
                 ValueGuardPtr< typename std::remove_pointer< decltype(stmts_before) >::type > __old_decls_before( stmts_before );
                 ValueGuardPtr< typename std::remove_pointer< decltype(stmts_after ) >::type > __old_decls_after ( stmts_after  );
 …
+        }
         template< typename pass_t >
+        template< typename core_t >
         template< template <class...> class container_t >
         container_t< ptr<Stmt> > ast::Pass< pass_t >::call_accept( const container_t< ptr<Stmt> > & statements ) {
+        container_t< ptr<Stmt> > ast::Pass< core_t >::call_accept( const container_t< ptr<Stmt> > & statements ) {
                 __pedantic_pass_assert( __visit_children() );
                 if( statements.empty() ) return {};
 …
                 // get the stmts/decls that will need to be spliced in
                 auto stmts_before = __pass::stmtsToAddBefore( pass, 0);
                 auto stmts_after  = __pass::stmtsToAddAfter ( pass, 0);
                 auto decls_before = __pass::declsToAddBefore( pass, 0);
                 auto decls_after  = __pass::declsToAddAfter ( pass, 0);
+                auto stmts_before = __pass::stmtsToAddBefore( core, 0);
+                auto stmts_after  = __pass::stmtsToAddAfter ( core, 0);
+                auto decls_before = __pass::declsToAddBefore( core, 0);
+                auto decls_after  = __pass::declsToAddAfter ( core, 0);
                 // These may be modified by subnodes but must be restored once we exit this statement.
 …
+        }
         template< typename pass_t >
+        template< typename core_t >
         template< template <class...> class container_t, typename node_t >
         container_t< ast::ptr<node_t> > ast::Pass< pass_t >::call_accept( const container_t< ast::ptr<node_t> > & container ) {
+        container_t< ast::ptr<node_t> > ast::Pass< core_t >::call_accept( const container_t< ast::ptr<node_t> > & container ) {
                 __pedantic_pass_assert( __visit_children() );
                 if( container.empty() ) return {};
 …
+        }
         template< typename pass_t >
+        template< typename core_t >
         template<typename node_t, typename parent_t, typename child_t>
         void ast::Pass< pass_t >::maybe_accept(
+        void ast::Pass< core_t >::maybe_accept(
                 const node_t * & parent,
                 child_t parent_t::*child
 …
                 if( __pass::differs(old_val, new_val) ) {
+                        auto new_parent = mutate(parent);
+                        auto new_parent = __pass::mutate<core_t>(parent);
+                        new_parent->*child = new_val;
+                        parent = new_parent;
+                }
+        }
+        template< typename core_t >
+        template<typename node_t, typename parent_t, typename child_t>
+        void ast::Pass< core_t >::maybe_accept_as_compound(
+                const node_t * & parent,
+                child_t parent_t::*child
+        ) {
+                static_assert( std::is_base_of<parent_t, node_t>::value, "Error deducing member object" );
+                if(__pass::skip(parent->*child)) return;
+                const auto & old_val = __pass::get(parent->*child, 0);
+                static_assert( !std::is_same<const ast::Node * &, decltype(old_val)>::value, "ERROR");
+                auto new_val = call_accept_as_compound( old_val );
+                static_assert( !std::is_same<const ast::Node *, decltype(new_val)>::value || std::is_same<int, decltype(old_val)>::value, "ERROR");
+                if( __pass::differs(old_val, new_val) ) {
+                        auto new_parent = __pass::mutate<core_t>(parent);
                         new_parent->*child = new_val;
                         parent = new_parent;
 …
 //------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 template< typename pass_t >
 inline void ast::accept_all( std::list< ast::ptr<ast::Decl> > & decls, ast::Pass< pass_t > & visitor ) {
+template< typename core_t >
+inline void ast::accept_all( std::list< ast::ptr<ast::Decl> > & decls, ast::Pass< core_t > & visitor ) {
         // We are going to aggregate errors for all these statements
         SemanticErrorException errors;
 …
         // get the stmts/decls that will need to be spliced in
         auto decls_before = __pass::declsToAddBefore( visitor.pass, 0);
         auto decls_after  = __pass::declsToAddAfter ( visitor.pass, 0);
+        auto decls_before = __pass::declsToAddBefore( visitor.core, 0);
+        auto decls_after  = __pass::declsToAddAfter ( visitor.core, 0);
         // update pass statistics
 …
+                }
                 catch( SemanticErrorException &e ) {
+                        errors.append( e );
+                        if (__pass::on_error (visitor.core, *i, 0))
+                                errors.append( e );
+                }
 …
         pass_visitor_stats.depth--;
         if ( !errors.isEmpty() ) { throw errors; }
+}
+template< typename core_t >
+inline void ast::accept_all( ast::TranslationUnit & unit, ast::Pass< core_t > & visitor ) {
+        return ast::accept_all( unit.decls, visitor );
+}
 …
 //--------------------------------------------------------------------------
 // ObjectDecl
 template< typename pass_t >
 const ast::DeclWithType * ast::Pass< pass_t >::visit( const ast::ObjectDecl * node ) {
+template< typename core_t >
+const ast::DeclWithType * ast::Pass< core_t >::visit( const ast::ObjectDecl * node ) {
         VISIT_START( node );
 …
+        )
         __pass::symtab::addId( pass, 0, node );
+        __pass::symtab::addId( core, 0, node );
         VISIT_END( DeclWithType, node );
 …
 //--------------------------------------------------------------------------
 // FunctionDecl
 template< typename pass_t >
 const ast::DeclWithType * ast::Pass< pass_t >::visit( const ast::FunctionDecl * node ) {
         VISIT_START( node );
         __pass::symtab::addId( pass, 0, node );
+template< typename core_t >
+const ast::DeclWithType * ast::Pass< core_t >::visit( const ast::FunctionDecl * node ) {
+        VISIT_START( node );
+        __pass::symtab::addId( core, 0, node );
         VISIT(maybe_accept( node, &FunctionDecl::withExprs );)
 …
                 // shadow with exprs and not the other way around.
                 guard_symtab guard { *this };
                 __pass::symtab::addWith( pass, 0, node->withExprs, node );
+                __pass::symtab::addWith( core, 0, node->withExprs, node );
+                {
                         guard_symtab guard { *this };
                         // implicit add __func__ identifier as specified in the C manual 6.4.2.2
                         static ast::ObjectDecl func(
                                 node->location, "__func__",
                                 new ast::ArrayType(
                                         new ast::BasicType( ast::BasicType::Char, ast::CV::Qualifiers( ast::CV::Const ) ),
+                        static ast::ptr< ast::ObjectDecl > func{ new ast::ObjectDecl{
+                                CodeLocation{}, "__func__",
+                                new ast::ArrayType{
+                                        new ast::BasicType{ ast::BasicType::Char, ast::CV::Const },
                                         nullptr, VariableLen, DynamicDim
+                                )
                         );
                         __pass::symtab::addId( pass, 0, &func );
+                                }
+                        } };
+                        __pass::symtab::addId( core, 0, func );
                         VISIT(
+                                maybe_accept( node, &FunctionDecl::type );
+                                // function body needs to have the same scope as parameters - CompoundStmt will not enter
+                                // a new scope if inFunction is true
+                                // parameter declarations
+                                maybe_accept( node, &FunctionDecl::params );
+                                maybe_accept( node, &FunctionDecl::returns );
+                                // type params and assertions
+                                maybe_accept( node, &FunctionDecl::type_params );
+                                maybe_accept( node, &FunctionDecl::assertions );
+                                // First remember that we are now within a function.
                                 ValueGuard< bool > oldInFunction( inFunction );
                                 inFunction = true;
+                                // The function body needs to have the same scope as parameters.
+                                // A CompoundStmt will not enter a new scope if atFunctionTop is true.
+                                ValueGuard< bool > oldAtFunctionTop( atFunctionTop );
+                                atFunctionTop = true;
                                 maybe_accept( node, &FunctionDecl::stmts );
                                 maybe_accept( node, &FunctionDecl::attributes );
 …
 //--------------------------------------------------------------------------
 // StructDecl
 template< typename pass_t >
 const ast::Decl * ast::Pass< pass_t >::visit( const ast::StructDecl * node ) {
+template< typename core_t >
+const ast::Decl * ast::Pass< core_t >::visit( const ast::StructDecl * node ) {
         VISIT_START( node );
         // make up a forward declaration and add it before processing the members
         // needs to be on the heap because addStruct saves the pointer
         __pass::symtab::addStructFwd( pass, 0, node );
+        __pass::symtab::addStructFwd( core, 0, node );
         VISIT({
 …
         // this addition replaces the forward declaration
         __pass::symtab::addStruct( pass, 0, node );
+        __pass::symtab::addStruct( core, 0, node );
         VISIT_END( Decl, node );
 …
 //--------------------------------------------------------------------------
 // UnionDecl
 template< typename pass_t >
 const ast::Decl * ast::Pass< pass_t >::visit( const ast::UnionDecl * node ) {
+template< typename core_t >
+const ast::Decl * ast::Pass< core_t >::visit( const ast::UnionDecl * node ) {
         VISIT_START( node );
         // make up a forward declaration and add it before processing the members
         __pass::symtab::addUnionFwd( pass, 0, node );
+        __pass::symtab::addUnionFwd( core, 0, node );
         VISIT({
 …
         })
         __pass::symtab::addUnion( pass, 0, node );
+        __pass::symtab::addUnion( core, 0, node );
         VISIT_END( Decl, node );
 …
 //--------------------------------------------------------------------------
 // EnumDecl
 template< typename pass_t >
 const ast::Decl * ast::Pass< pass_t >::visit( const ast::EnumDecl * node ) {
         VISIT_START( node );
         __pass::symtab::addEnum( pass, 0, node );
+template< typename core_t >
+const ast::Decl * ast::Pass< core_t >::visit( const ast::EnumDecl * node ) {
+        VISIT_START( node );
+        __pass::symtab::addEnum( core, 0, node );
         VISIT(
 …
 //--------------------------------------------------------------------------
 // TraitDecl
 template< typename pass_t >
 const ast::Decl * ast::Pass< pass_t >::visit( const ast::TraitDecl * node ) {
+template< typename core_t >
+const ast::Decl * ast::Pass< core_t >::visit( const ast::TraitDecl * node ) {
         VISIT_START( node );
 …
         })
         __pass::symtab::addTrait( pass, 0, node );
+        __pass::symtab::addTrait( core, 0, node );
         VISIT_END( Decl, node );
 …
 //--------------------------------------------------------------------------
 // TypeDecl
 template< typename pass_t >
 const ast::Decl * ast::Pass< pass_t >::visit( const ast::TypeDecl * node ) {
+template< typename core_t >
+const ast::Decl * ast::Pass< core_t >::visit( const ast::TypeDecl * node ) {
         VISIT_START( node );
         VISIT({
                 guard_symtab guard { *this };
-                maybe_accept( node, &TypeDecl::params );
                 maybe_accept( node, &TypeDecl::base   );
         })
 …
         // note that assertions come after the type is added to the symtab, since they are not part of the type proper
         // and may depend on the type itself
         __pass::symtab::addType( pass, 0, node );
+        __pass::symtab::addType( core, 0, node );
         VISIT(
 …
 //--------------------------------------------------------------------------
 // TypedefDecl
 template< typename pass_t >
 const ast::Decl * ast::Pass< pass_t >::visit( const ast::TypedefDecl * node ) {
+template< typename core_t >
+const ast::Decl * ast::Pass< core_t >::visit( const ast::TypedefDecl * node ) {
         VISIT_START( node );
         VISIT({
                 guard_symtab guard { *this };
-                maybe_accept( node, &TypedefDecl::params );
                 maybe_accept( node, &TypedefDecl::base   );
         })
         __pass::symtab::addType( pass, 0, node );
+        __pass::symtab::addType( core, 0, node );
         VISIT( maybe_accept( node, &TypedefDecl::assertions ); )
 …
 //--------------------------------------------------------------------------
 // AsmDecl
 template< typename pass_t >
 const ast::AsmDecl * ast::Pass< pass_t >::visit( const ast::AsmDecl * node ) {
+template< typename core_t >
+const ast::AsmDecl * ast::Pass< core_t >::visit( const ast::AsmDecl * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // StaticAssertDecl
 template< typename pass_t >
 const ast::StaticAssertDecl * ast::Pass< pass_t >::visit( const ast::StaticAssertDecl * node ) {
+template< typename core_t >
+const ast::StaticAssertDecl * ast::Pass< core_t >::visit( const ast::StaticAssertDecl * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // CompoundStmt
+template< typename pass_t >
+const ast::CompoundStmt * ast::Pass< pass_t >::visit( const ast::CompoundStmt * node ) {
+        VISIT_START( node );
+        VISIT({
+                // do not enter a new scope if inFunction is true - needs to check old state before the assignment
+                auto guard1 = makeFuncGuard( [this, inFunction = this->inFunction]() {
+                        if ( ! inFunction ) __pass::symtab::enter(pass, 0);
+                }, [this, inFunction = this->inFunction]() {
+                        if ( ! inFunction ) __pass::symtab::leave(pass, 0);
+template< typename core_t >
+const ast::CompoundStmt * ast::Pass< core_t >::visit( const ast::CompoundStmt * node ) {
+        VISIT_START( node );
+        VISIT(
+                // Do not enter (or leave) a new scope if atFunctionTop. Remember to save the result.
+                auto guard1 = makeFuncGuard( [this, enterScope = !this->atFunctionTop]() {
+                        if ( enterScope ) {
+                                __pass::symtab::enter(core, 0);
+                                __pass::scope::enter(core, 0);
+                        }
+                }, [this, leaveScope = !this->atFunctionTop]() {
+                        if ( leaveScope ) {
+                                __pass::symtab::leave(core, 0);
+                                __pass::scope::leave(core, 0);
+                        }
                 });
+                ValueGuard< bool > guard2( inFunction );
+                ValueGuard< bool > guard2( atFunctionTop );
+                atFunctionTop = false;
                 guard_scope guard3 { *this };
-                inFunction = false;
                 maybe_accept( node, &CompoundStmt::kids );
         })
+        )
         VISIT_END( CompoundStmt, node );
+}
 …
 //--------------------------------------------------------------------------
 // ExprStmt
 template< typename pass_t >
 const ast::Stmt * ast::Pass< pass_t >::visit( const ast::ExprStmt * node ) {
+template< typename core_t >
+const ast::Stmt * ast::Pass< core_t >::visit( const ast::ExprStmt * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // AsmStmt
 template< typename pass_t >
 const ast::Stmt * ast::Pass< pass_t >::visit( const ast::AsmStmt * node ) {
+template< typename core_t >
+const ast::Stmt * ast::Pass< core_t >::visit( const ast::AsmStmt * node ) {
         VISIT_START( node )
 …
 //--------------------------------------------------------------------------
 // DirectiveStmt
 template< typename pass_t >
 const ast::Stmt * ast::Pass< pass_t >::visit( const ast::DirectiveStmt * node ) {
+template< typename core_t >
+const ast::Stmt * ast::Pass< core_t >::visit( const ast::DirectiveStmt * node ) {
         VISIT_START( node )
 …
 //--------------------------------------------------------------------------
 // IfStmt
 template< typename pass_t >
 const ast::Stmt * ast::Pass< pass_t >::visit( const ast::IfStmt * node ) {
+template< typename core_t >
+const ast::Stmt * ast::Pass< core_t >::visit( const ast::IfStmt * node ) {
         VISIT_START( node );
 …
                 maybe_accept( node, &IfStmt::inits    );
                 maybe_accept( node, &IfStmt::cond     );
                 maybe_accept( node, &IfStmt::thenPart );
                 maybe_accept( node, &IfStmt::elsePart );
+                maybe_accept_as_compound( node, &IfStmt::thenPart );
+                maybe_accept_as_compound( node, &IfStmt::elsePart );
         })
 …
 //--------------------------------------------------------------------------
 // WhileStmt
 template< typename pass_t >
 const ast::Stmt * ast::Pass< pass_t >::visit( const ast::WhileStmt * node ) {
+template< typename core_t >
+const ast::Stmt * ast::Pass< core_t >::visit( const ast::WhileStmt * node ) {
         VISIT_START( node );
 …
                 maybe_accept( node, &WhileStmt::inits );
                 maybe_accept( node, &WhileStmt::cond  );
                 maybe_accept( node, &WhileStmt::body  );
+                maybe_accept_as_compound( node, &WhileStmt::body  );
         })
 …
 //--------------------------------------------------------------------------
 // ForStmt
 template< typename pass_t >
 const ast::Stmt * ast::Pass< pass_t >::visit( const ast::ForStmt * node ) {
+template< typename core_t >
+const ast::Stmt * ast::Pass< core_t >::visit( const ast::ForStmt * node ) {
         VISIT_START( node );
 …
                 // for statements introduce a level of scope (for the initialization)
                 guard_symtab guard { *this };
+                // xxx - old ast does not create WithStmtsToAdd scope for loop inits. should revisit this later.
                 maybe_accept( node, &ForStmt::inits );
                 maybe_accept( node, &ForStmt::cond  );
                 maybe_accept( node, &ForStmt::inc   );
                 maybe_accept( node, &ForStmt::body  );
+                maybe_accept_as_compound( node, &ForStmt::body  );
         })
 …
 //--------------------------------------------------------------------------
 // SwitchStmt
 template< typename pass_t >
 const ast::Stmt * ast::Pass< pass_t >::visit( const ast::SwitchStmt * node ) {
+template< typename core_t >
+const ast::Stmt * ast::Pass< core_t >::visit( const ast::SwitchStmt * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // CaseStmt
 template< typename pass_t >
 const ast::Stmt * ast::Pass< pass_t >::visit( const ast::CaseStmt * node ) {
+template< typename core_t >
+const ast::Stmt * ast::Pass< core_t >::visit( const ast::CaseStmt * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // BranchStmt
 template< typename pass_t >
 const ast::Stmt * ast::Pass< pass_t >::visit( const ast::BranchStmt * node ) {
+template< typename core_t >
+const ast::Stmt * ast::Pass< core_t >::visit( const ast::BranchStmt * node ) {
         VISIT_START( node );
         VISIT_END( Stmt, node );
 …
 //--------------------------------------------------------------------------
 // ReturnStmt
 template< typename pass_t >
 const ast::Stmt * ast::Pass< pass_t >::visit( const ast::ReturnStmt * node ) {
+template< typename core_t >
+const ast::Stmt * ast::Pass< core_t >::visit( const ast::ReturnStmt * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // ThrowStmt
 template< typename pass_t >
 const ast::Stmt * ast::Pass< pass_t >::visit( const ast::ThrowStmt * node ) {
+template< typename core_t >
+const ast::Stmt * ast::Pass< core_t >::visit( const ast::ThrowStmt * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // TryStmt
 template< typename pass_t >
 const ast::Stmt * ast::Pass< pass_t >::visit( const ast::TryStmt * node ) {
+template< typename core_t >
+const ast::Stmt * ast::Pass< core_t >::visit( const ast::TryStmt * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // CatchStmt
 template< typename pass_t >
 const ast::Stmt * ast::Pass< pass_t >::visit( const ast::CatchStmt * node ) {
+template< typename core_t >
+const ast::Stmt * ast::Pass< core_t >::visit( const ast::CatchStmt * node ) {
         VISIT_START( node );
 …
                 maybe_accept( node, &CatchStmt::decl );
                 maybe_accept( node, &CatchStmt::cond );
                 maybe_accept( node, &CatchStmt::body );
+                maybe_accept_as_compound( node, &CatchStmt::body );
         })
 …
 //--------------------------------------------------------------------------
 // FinallyStmt
 template< typename pass_t >
 const ast::Stmt * ast::Pass< pass_t >::visit( const ast::FinallyStmt * node ) {
+template< typename core_t >
+const ast::Stmt * ast::Pass< core_t >::visit( const ast::FinallyStmt * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
+// SuspendStmt
+template< typename core_t >
+const ast::Stmt * ast::Pass< core_t >::visit( const ast::SuspendStmt * node ) {
+        VISIT_START( node );
+        VISIT(
+                maybe_accept( node, &SuspendStmt::then   );
+        )
+        VISIT_END( Stmt, node );
+}
+//--------------------------------------------------------------------------
 // WaitForStmt
 template< typename pass_t >
 const ast::Stmt * ast::Pass< pass_t >::visit( const ast::WaitForStmt * node ) {
+template< typename core_t >
+const ast::Stmt * ast::Pass< core_t >::visit( const ast::WaitForStmt * node ) {
         VISIT_START( node );
                 // for( auto & clause : node->clauses ) {
 …
                 if(mutated) {
                         auto n = mutate(node);
+                        auto n = __pass::mutate<core_t>(node);
                         n->clauses = std::move( new_clauses );
                         node = n;
 …
                         auto nval = call_accept( node->field ); \
                         if(nval != node->field ) { \
                                 auto nparent = mutate(node); \
+                                auto nparent = __pass::mutate<core_t>(node); \
                                 nparent->field = nval; \
                                 node = nparent; \
 …
 //--------------------------------------------------------------------------
 // WithStmt
 template< typename pass_t >
 const ast::Decl * ast::Pass< pass_t >::visit( const ast::WithStmt * node ) {
+template< typename core_t >
+const ast::Decl * ast::Pass< core_t >::visit( const ast::WithStmt * node ) {
         VISIT_START( node );
 …
                         // with statements introduce a level of scope (for the with expressions)
                         guard_symtab guard { *this };
                         __pass::symtab::addWith( pass, 0, node->exprs, node );
+                        __pass::symtab::addWith( core, 0, node->exprs, node );
                         maybe_accept( node, &WithStmt::stmt );
+                }
 …
 //--------------------------------------------------------------------------
 // NullStmt
 template< typename pass_t >
 const ast::NullStmt * ast::Pass< pass_t >::visit( const ast::NullStmt * node ) {
+template< typename core_t >
+const ast::NullStmt * ast::Pass< core_t >::visit( const ast::NullStmt * node ) {
         VISIT_START( node );
         VISIT_END( NullStmt, node );
 …
 //--------------------------------------------------------------------------
 // DeclStmt
 template< typename pass_t >
 const ast::Stmt * ast::Pass< pass_t >::visit( const ast::DeclStmt * node ) {
+template< typename core_t >
+const ast::Stmt * ast::Pass< core_t >::visit( const ast::DeclStmt * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // ImplicitCtorDtorStmt
 template< typename pass_t >
 const ast::Stmt * ast::Pass< pass_t >::visit( const ast::ImplicitCtorDtorStmt * node ) {
+template< typename core_t >
+const ast::Stmt * ast::Pass< core_t >::visit( const ast::ImplicitCtorDtorStmt * node ) {
         VISIT_START( node );
         // For now this isn't visited, it is unclear if this causes problem
         // if all tests are known to pass, remove this code
         // VISIT(
         //      maybe_accept( node, &ImplicitCtorDtorStmt::callStmt );
         // )
+        VISIT(
+                maybe_accept( node, &ImplicitCtorDtorStmt::callStmt );
+        )
         VISIT_END( Stmt, node );
 …
 //--------------------------------------------------------------------------
 // ApplicationExpr
 template< typename pass_t >
 const ast::Expr * ast::Pass< pass_t >::visit( const ast::ApplicationExpr * node ) {
+template< typename core_t >
+const ast::Expr * ast::Pass< core_t >::visit( const ast::ApplicationExpr * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // UntypedExpr
 template< typename pass_t >
 const ast::Expr * ast::Pass< pass_t >::visit( const ast::UntypedExpr * node ) {
+template< typename core_t >
+const ast::Expr * ast::Pass< core_t >::visit( const ast::UntypedExpr * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // NameExpr
 template< typename pass_t >
 const ast::Expr * ast::Pass< pass_t >::visit( const ast::NameExpr * node ) {
+template< typename core_t >
+const ast::Expr * ast::Pass< core_t >::visit( const ast::NameExpr * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // CastExpr
 template< typename pass_t >
 const ast::Expr * ast::Pass< pass_t >::visit( const ast::CastExpr * node ) {
+template< typename core_t >
+const ast::Expr * ast::Pass< core_t >::visit( const ast::CastExpr * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // KeywordCastExpr
 template< typename pass_t >
 const ast::Expr * ast::Pass< pass_t >::visit( const ast::KeywordCastExpr * node ) {
+template< typename core_t >
+const ast::Expr * ast::Pass< core_t >::visit( const ast::KeywordCastExpr * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // VirtualCastExpr
 template< typename pass_t >
 const ast::Expr * ast::Pass< pass_t >::visit( const ast::VirtualCastExpr * node ) {
+template< typename core_t >
+const ast::Expr * ast::Pass< core_t >::visit( const ast::VirtualCastExpr * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // AddressExpr
 template< typename pass_t >
 const ast::Expr * ast::Pass< pass_t >::visit( const ast::AddressExpr * node ) {
+template< typename core_t >
+const ast::Expr * ast::Pass< core_t >::visit( const ast::AddressExpr * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // LabelAddressExpr
 template< typename pass_t >
 const ast::Expr * ast::Pass< pass_t >::visit( const ast::LabelAddressExpr * node ) {
+template< typename core_t >
+const ast::Expr * ast::Pass< core_t >::visit( const ast::LabelAddressExpr * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // UntypedMemberExpr
 template< typename pass_t >
 const ast::Expr * ast::Pass< pass_t >::visit( const ast::UntypedMemberExpr * node ) {
+template< typename core_t >
+const ast::Expr * ast::Pass< core_t >::visit( const ast::UntypedMemberExpr * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // MemberExpr
 template< typename pass_t >
 const ast::Expr * ast::Pass< pass_t >::visit( const ast::MemberExpr * node ) {
+template< typename core_t >
+const ast::Expr * ast::Pass< core_t >::visit( const ast::MemberExpr * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // VariableExpr
 template< typename pass_t >
 const ast::Expr * ast::Pass< pass_t >::visit( const ast::VariableExpr * node ) {
+template< typename core_t >
+const ast::Expr * ast::Pass< core_t >::visit( const ast::VariableExpr * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // ConstantExpr
 template< typename pass_t >
 const ast::Expr * ast::Pass< pass_t >::visit( const ast::ConstantExpr * node ) {
+template< typename core_t >
+const ast::Expr * ast::Pass< core_t >::visit( const ast::ConstantExpr * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // SizeofExpr
 template< typename pass_t >
 const ast::Expr * ast::Pass< pass_t >::visit( const ast::SizeofExpr * node ) {
+template< typename core_t >
+const ast::Expr * ast::Pass< core_t >::visit( const ast::SizeofExpr * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // AlignofExpr
 template< typename pass_t >
 const ast::Expr * ast::Pass< pass_t >::visit( const ast::AlignofExpr * node ) {
+template< typename core_t >
+const ast::Expr * ast::Pass< core_t >::visit( const ast::AlignofExpr * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // UntypedOffsetofExpr
 template< typename pass_t >
 const ast::Expr * ast::Pass< pass_t >::visit( const ast::UntypedOffsetofExpr * node ) {
+template< typename core_t >
+const ast::Expr * ast::Pass< core_t >::visit( const ast::UntypedOffsetofExpr * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // OffsetofExpr
 template< typename pass_t >
 const ast::Expr * ast::Pass< pass_t >::visit( const ast::OffsetofExpr * node ) {
+template< typename core_t >
+const ast::Expr * ast::Pass< core_t >::visit( const ast::OffsetofExpr * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // OffsetPackExpr
 template< typename pass_t >
 const ast::Expr * ast::Pass< pass_t >::visit( const ast::OffsetPackExpr * node ) {
+template< typename core_t >
+const ast::Expr * ast::Pass< core_t >::visit( const ast::OffsetPackExpr * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // LogicalExpr
 template< typename pass_t >
 const ast::Expr * ast::Pass< pass_t >::visit( const ast::LogicalExpr * node ) {
+template< typename core_t >
+const ast::Expr * ast::Pass< core_t >::visit( const ast::LogicalExpr * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // ConditionalExpr
 template< typename pass_t >
 const ast::Expr * ast::Pass< pass_t >::visit( const ast::ConditionalExpr * node ) {
+template< typename core_t >
+const ast::Expr * ast::Pass< core_t >::visit( const ast::ConditionalExpr * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // CommaExpr
 template< typename pass_t >
 const ast::Expr * ast::Pass< pass_t >::visit( const ast::CommaExpr * node ) {
+template< typename core_t >
+const ast::Expr * ast::Pass< core_t >::visit( const ast::CommaExpr * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // TypeExpr
 template< typename pass_t >
 const ast::Expr * ast::Pass< pass_t >::visit( const ast::TypeExpr * node ) {
+template< typename core_t >
+const ast::Expr * ast::Pass< core_t >::visit( const ast::TypeExpr * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // AsmExpr
 template< typename pass_t >
 const ast::Expr * ast::Pass< pass_t >::visit( const ast::AsmExpr * node ) {
+template< typename core_t >
+const ast::Expr * ast::Pass< core_t >::visit( const ast::AsmExpr * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // ImplicitCopyCtorExpr
 template< typename pass_t >
 const ast::Expr * ast::Pass< pass_t >::visit( const ast::ImplicitCopyCtorExpr * node ) {
+template< typename core_t >
+const ast::Expr * ast::Pass< core_t >::visit( const ast::ImplicitCopyCtorExpr * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // ConstructorExpr
 template< typename pass_t >
 const ast::Expr * ast::Pass< pass_t >::visit( const ast::ConstructorExpr * node ) {
+template< typename core_t >
+const ast::Expr * ast::Pass< core_t >::visit( const ast::ConstructorExpr * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // CompoundLiteralExpr
 template< typename pass_t >
 const ast::Expr * ast::Pass< pass_t >::visit( const ast::CompoundLiteralExpr * node ) {
+template< typename core_t >
+const ast::Expr * ast::Pass< core_t >::visit( const ast::CompoundLiteralExpr * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // RangeExpr
 template< typename pass_t >
 const ast::Expr * ast::Pass< pass_t >::visit( const ast::RangeExpr * node ) {
+template< typename core_t >
+const ast::Expr * ast::Pass< core_t >::visit( const ast::RangeExpr * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // UntypedTupleExpr
 template< typename pass_t >
 const ast::Expr * ast::Pass< pass_t >::visit( const ast::UntypedTupleExpr * node ) {
+template< typename core_t >
+const ast::Expr * ast::Pass< core_t >::visit( const ast::UntypedTupleExpr * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // TupleExpr
 template< typename pass_t >
 const ast::Expr * ast::Pass< pass_t >::visit( const ast::TupleExpr * node ) {
+template< typename core_t >
+const ast::Expr * ast::Pass< core_t >::visit( const ast::TupleExpr * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // TupleIndexExpr
 template< typename pass_t >
 const ast::Expr * ast::Pass< pass_t >::visit( const ast::TupleIndexExpr * node ) {
+template< typename core_t >
+const ast::Expr * ast::Pass< core_t >::visit( const ast::TupleIndexExpr * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // TupleAssignExpr
 template< typename pass_t >
 const ast::Expr * ast::Pass< pass_t >::visit( const ast::TupleAssignExpr * node ) {
+template< typename core_t >
+const ast::Expr * ast::Pass< core_t >::visit( const ast::TupleAssignExpr * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // StmtExpr
 template< typename pass_t >
 const ast::Expr * ast::Pass< pass_t >::visit( const ast::StmtExpr * node ) {
+template< typename core_t >
+const ast::Expr * ast::Pass< core_t >::visit( const ast::StmtExpr * node ) {
         VISIT_START( node );
         VISIT(// don't want statements from outer CompoundStmts to be added to this StmtExpr
                 // get the stmts that will need to be spliced in
                 auto stmts_before = __pass::stmtsToAddBefore( pass, 0);
                 auto stmts_after  = __pass::stmtsToAddAfter ( pass, 0);
+                auto stmts_before = __pass::stmtsToAddBefore( core, 0);
+                auto stmts_after  = __pass::stmtsToAddAfter ( core, 0);
                 // These may be modified by subnodes but must be restored once we exit this statement.
                 ValueGuardPtr< const ast::TypeSubstitution * > __old_env( __pass::env( pass, 0) );
+                ValueGuardPtr< const ast::TypeSubstitution * > __old_env( __pass::typeSubs( core, 0 ) );
                 ValueGuardPtr< typename std::remove_pointer< decltype(stmts_before) >::type > __old_decls_before( stmts_before );
                 ValueGuardPtr< typename std::remove_pointer< decltype(stmts_after ) >::type > __old_decls_after ( stmts_after  );
 …
 //--------------------------------------------------------------------------
 // UniqueExpr
 template< typename pass_t >
 const ast::Expr * ast::Pass< pass_t >::visit( const ast::UniqueExpr * node ) {
+template< typename core_t >
+const ast::Expr * ast::Pass< core_t >::visit( const ast::UniqueExpr * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // UntypedInitExpr
 template< typename pass_t >
 const ast::Expr * ast::Pass< pass_t >::visit( const ast::UntypedInitExpr * node ) {
+template< typename core_t >
+const ast::Expr * ast::Pass< core_t >::visit( const ast::UntypedInitExpr * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // InitExpr
 template< typename pass_t >
 const ast::Expr * ast::Pass< pass_t >::visit( const ast::InitExpr * node ) {
+template< typename core_t >
+const ast::Expr * ast::Pass< core_t >::visit( const ast::InitExpr * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // DeletedExpr
 template< typename pass_t >
 const ast::Expr * ast::Pass< pass_t >::visit( const ast::DeletedExpr * node ) {
+template< typename core_t >
+const ast::Expr * ast::Pass< core_t >::visit( const ast::DeletedExpr * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // DefaultArgExpr
 template< typename pass_t >
 const ast::Expr * ast::Pass< pass_t >::visit( const ast::DefaultArgExpr * node ) {
+template< typename core_t >
+const ast::Expr * ast::Pass< core_t >::visit( const ast::DefaultArgExpr * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // GenericExpr
 template< typename pass_t >
 const ast::Expr * ast::Pass< pass_t >::visit( const ast::GenericExpr * node ) {
+template< typename core_t >
+const ast::Expr * ast::Pass< core_t >::visit( const ast::GenericExpr * node ) {
         VISIT_START( node );
 …
                 if(mutated) {
                         auto n = mutate(node);
+                        auto n = __pass::mutate<core_t>(node);
                         n->associations = std::move( new_kids );
                         node = n;
 …
 //--------------------------------------------------------------------------
 // VoidType
 template< typename pass_t >
 const ast::Type * ast::Pass< pass_t >::visit( const ast::VoidType * node ) {
+template< typename core_t >
+const ast::Type * ast::Pass< core_t >::visit( const ast::VoidType * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // BasicType
 template< typename pass_t >
 const ast::Type * ast::Pass< pass_t >::visit( const ast::BasicType * node ) {
+template< typename core_t >
+const ast::Type * ast::Pass< core_t >::visit( const ast::BasicType * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // PointerType
 template< typename pass_t >
 const ast::Type * ast::Pass< pass_t >::visit( const ast::PointerType * node ) {
+template< typename core_t >
+const ast::Type * ast::Pass< core_t >::visit( const ast::PointerType * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // ArrayType
 template< typename pass_t >
 const ast::Type * ast::Pass< pass_t >::visit( const ast::ArrayType * node ) {
+template< typename core_t >
+const ast::Type * ast::Pass< core_t >::visit( const ast::ArrayType * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // ReferenceType
 template< typename pass_t >
 const ast::Type * ast::Pass< pass_t >::visit( const ast::ReferenceType * node ) {
+template< typename core_t >
+const ast::Type * ast::Pass< core_t >::visit( const ast::ReferenceType * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // QualifiedType
 template< typename pass_t >
 const ast::Type * ast::Pass< pass_t >::visit( const ast::QualifiedType * node ) {
+template< typename core_t >
+const ast::Type * ast::Pass< core_t >::visit( const ast::QualifiedType * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // FunctionType
+template< typename pass_t >
+const ast::Type * ast::Pass< pass_t >::visit( const ast::FunctionType * node ) {
+        VISIT_START( node );
+        VISIT(
+                maybe_accept( node, &FunctionType::forall  );
+template< typename core_t >
+const ast::Type * ast::Pass< core_t >::visit( const ast::FunctionType * node ) {
+        VISIT_START( node );
+        VISIT({
+                // guard_forall_subs forall_guard { *this, node };
+                // mutate_forall( node );
+                maybe_accept( node, &FunctionType::assertions );
                 maybe_accept( node, &FunctionType::returns );
                 maybe_accept( node, &FunctionType::params  );
+        )
+        })
         VISIT_END( Type, node );
 …
 //--------------------------------------------------------------------------
 // StructInstType
 template< typename pass_t >
 const ast::Type * ast::Pass< pass_t >::visit( const ast::StructInstType * node ) {
         VISIT_START( node );
         __pass::symtab::addStruct( pass, 0, node->name );
+template< typename core_t >
+const ast::Type * ast::Pass< core_t >::visit( const ast::StructInstType * node ) {
+        VISIT_START( node );
+        __pass::symtab::addStruct( core, 0, node->name );
         VISIT({
                 guard_symtab guard { *this };
-                maybe_accept( node, &StructInstType::forall );
                 maybe_accept( node, &StructInstType::params );
         })
 …
 //--------------------------------------------------------------------------
 // UnionInstType
 template< typename pass_t >
 const ast::Type * ast::Pass< pass_t >::visit( const ast::UnionInstType * node ) {
         VISIT_START( node );
         __pass::symtab::addStruct( pass, 0, node->name );
+        {
+template< typename core_t >
+const ast::Type * ast::Pass< core_t >::visit( const ast::UnionInstType * node ) {
+        VISIT_START( node );
+        __pass::symtab::addUnion( core, 0, node->name );
+        VISIT({
                 guard_symtab guard { *this };
-                maybe_accept( node, &UnionInstType::forall );
                 maybe_accept( node, &UnionInstType::params );
+        }
+        })
         VISIT_END( Type, node );
 …
 //--------------------------------------------------------------------------
 // EnumInstType
+template< typename pass_t >
+const ast::Type * ast::Pass< pass_t >::visit( const ast::EnumInstType * node ) {
+        VISIT_START( node );
+        VISIT(
+                maybe_accept( node, &EnumInstType::forall );
+template< typename core_t >
+const ast::Type * ast::Pass< core_t >::visit( const ast::EnumInstType * node ) {
+        VISIT_START( node );
+        VISIT({
                 maybe_accept( node, &EnumInstType::params );
+        )
+        })
         VISIT_END( Type, node );
 …
 //--------------------------------------------------------------------------
 // TraitInstType
+template< typename pass_t >
+const ast::Type * ast::Pass< pass_t >::visit( const ast::TraitInstType * node ) {
+        VISIT_START( node );
+        VISIT(
+                maybe_accept( node, &TraitInstType::forall );
+template< typename core_t >
+const ast::Type * ast::Pass< core_t >::visit( const ast::TraitInstType * node ) {
+        VISIT_START( node );
+        VISIT({
                 maybe_accept( node, &TraitInstType::params );
+        )
+        })
         VISIT_END( Type, node );
 …
 //--------------------------------------------------------------------------
 // TypeInstType
+template< typename pass_t >
+const ast::Type * ast::Pass< pass_t >::visit( const ast::TypeInstType * node ) {
+        VISIT_START( node );
+        VISIT(
+                maybe_accept( node, &TypeInstType::forall );
+                maybe_accept( node, &TypeInstType::params );
+template< typename core_t >
+const ast::Type * ast::Pass< core_t >::visit( const ast::TypeInstType * node ) {
+        VISIT_START( node );
+        VISIT(
+                {
+                        maybe_accept( node, &TypeInstType::params );
+                }
+                // ensure that base is re-bound if doing substitution
+                __pass::forall::replace( core, 0, node );
+        )
 …
 //--------------------------------------------------------------------------
 // TupleType
 template< typename pass_t >
 const ast::Type * ast::Pass< pass_t >::visit( const ast::TupleType * node ) {
+template< typename core_t >
+const ast::Type * ast::Pass< core_t >::visit( const ast::TupleType * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // TypeofType
 template< typename pass_t >
 const ast::Type * ast::Pass< pass_t >::visit( const ast::TypeofType * node ) {
+template< typename core_t >
+const ast::Type * ast::Pass< core_t >::visit( const ast::TypeofType * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // VarArgsType
 template< typename pass_t >
 const ast::Type * ast::Pass< pass_t >::visit( const ast::VarArgsType * node ) {
+template< typename core_t >
+const ast::Type * ast::Pass< core_t >::visit( const ast::VarArgsType * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // ZeroType
 template< typename pass_t >
 const ast::Type * ast::Pass< pass_t >::visit( const ast::ZeroType * node ) {
+template< typename core_t >
+const ast::Type * ast::Pass< core_t >::visit( const ast::ZeroType * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // OneType
 template< typename pass_t >
 const ast::Type * ast::Pass< pass_t >::visit( const ast::OneType * node ) {
+template< typename core_t >
+const ast::Type * ast::Pass< core_t >::visit( const ast::OneType * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // GlobalScopeType
 template< typename pass_t >
 const ast::Type * ast::Pass< pass_t >::visit( const ast::GlobalScopeType * node ) {
+template< typename core_t >
+const ast::Type * ast::Pass< core_t >::visit( const ast::GlobalScopeType * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // Designation
 template< typename pass_t >
 const ast::Designation * ast::Pass< pass_t >::visit( const ast::Designation * node ) {
+template< typename core_t >
+const ast::Designation * ast::Pass< core_t >::visit( const ast::Designation * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // SingleInit
 template< typename pass_t >
 const ast::Init * ast::Pass< pass_t >::visit( const ast::SingleInit * node ) {
+template< typename core_t >
+const ast::Init * ast::Pass< core_t >::visit( const ast::SingleInit * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // ListInit
 template< typename pass_t >
 const ast::Init * ast::Pass< pass_t >::visit( const ast::ListInit * node ) {
+template< typename core_t >
+const ast::Init * ast::Pass< core_t >::visit( const ast::ListInit * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // ConstructorInit
 template< typename pass_t >
 const ast::Init * ast::Pass< pass_t >::visit( const ast::ConstructorInit * node ) {
+template< typename core_t >
+const ast::Init * ast::Pass< core_t >::visit( const ast::ConstructorInit * node ) {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // Attribute
 template< typename pass_t >
 const ast::Attribute * ast::Pass< pass_t >::visit( const ast::Attribute * node  )  {
+template< typename core_t >
+const ast::Attribute * ast::Pass< core_t >::visit( const ast::Attribute * node  )  {
         VISIT_START( node );
 …
 //--------------------------------------------------------------------------
 // TypeSubstitution
 template< typename pass_t >
 const ast::TypeSubstitution * ast::Pass< pass_t >::visit( const ast::TypeSubstitution * node ) {
+template< typename core_t >
+const ast::TypeSubstitution * ast::Pass< core_t >::visit( const ast::TypeSubstitution * node ) {
         VISIT_START( node );
 …
+                {
                         bool mutated = false;
                         std::unordered_map< std::string, ast::ptr< ast::Type > > new_map;
+                        std::unordered_map< ast::TypeInstType::TypeEnvKey, ast::ptr< ast::Type > > new_map;
                         for ( const auto & p : node->typeEnv ) {
                                 guard_symtab guard { *this };
                                 auto new_node = p.second->accept( *this );
                                 if (new_node != p.second) mutated = false;
+                                if (new_node != p.second) mutated = true;
                                 new_map.insert({ p.first, new_node });
+                        }
                         if (mutated) {
                                 auto new_node = mutate( node );
+                                auto new_node = __pass::mutate<core_t>( node );
                                 new_node->typeEnv.swap( new_map );
                                 node = new_node;
+                        }
+                }
+                {
-                        bool mutated = false;
-                        std::unordered_map< std::string, ast::ptr< ast::Expr > > new_map;
-                        for ( const auto & p : node->varEnv ) {
-                                guard_symtab guard { *this };
-                                auto new_node = p.second->accept( *this );
-                                if (new_node != p.second) mutated = false;
-                                new_map.insert({ p.first, new_node });
+                        }
-                        if (mutated) {
-                                auto new_node = mutate( node );
-                                new_node->varEnv.swap( new_map );
-                                node = new_node;
+                        }
+                }
+        )

src/AST/Pass.proto.hpp

-              r3c64c668
+              r58fe85a
 // IWYU pragma: private, include "Pass.hpp"
+#include "Common/Stats/Heap.h"
 namespace ast {
 template<typename pass_type>
+template<typename core_t>
 class Pass;
+struct TranslationUnit;
+struct PureVisitor;
 namespace __pass {
 …
                 };
                 std::stack< cleanup_t > cleanups;
+                std::stack< cleanup_t, std::vector<cleanup_t> > cleanups;
         };
 …
         /// "Short hand" to check if this is a valid previsit function
         /// Mostly used to make the static_assert look (and print) prettier
         template<typename pass_t, typename node_t>
+        template<typename core_t, typename node_t>
         struct is_valid_previsit {
                 using ret_t = decltype( ((pass_t*)nullptr)->previsit( (const node_t *)nullptr ) );
+                using ret_t = decltype( ((core_t*)nullptr)->previsit( (const node_t *)nullptr ) );
                 static constexpr bool value = std::is_void< ret_t >::value ||
 …
         template<>
         struct __assign<true> {
                 template<typename pass_t, typename node_t>
                 static inline void result( pass_t & pass, const node_t * & node ) {
                         pass.previsit( node );
+                template<typename core_t, typename node_t>
+                static inline void result( core_t & core, const node_t * & node ) {
+                        core.previsit( node );
+                }
         };
 …
         template<>
         struct __assign<false> {
                 template<typename pass_t, typename node_t>
                 static inline void result( pass_t & pass, const node_t * & node ) {
                         node = pass.previsit( node );
+                template<typename core_t, typename node_t>
+                static inline void result( core_t & core, const node_t * & node ) {
+                        node = core.previsit( node );
                         assertf(node, "Previsit must not return NULL");
+                }
 …
         template<>
         struct __return<true> {
                 template<typename pass_t, typename node_t>
                 static inline const node_t * result( pass_t & pass, const node_t * & node ) {
                         pass.postvisit( node );
+                template<typename core_t, typename node_t>
+                static inline const node_t * result( core_t & core, const node_t * & node ) {
+                        core.postvisit( node );
                         return node;
+                }
 …
         template<>
         struct __return<false> {
                 template<typename pass_t, typename node_t>
                 static inline auto result( pass_t & pass, const node_t * & node ) {
                         return pass.postvisit( node );
+                template<typename core_t, typename node_t>
+                static inline auto result( core_t & core, const node_t * & node ) {
+                        return core.postvisit( node );
+                }
         };
 …
         //-------------------------------------------------------------------------------------------------------------------------------------------------------------------------
         // PreVisit : may mutate the pointer passed in if the node is mutated in the previsit call
         template<typename pass_t, typename node_t>
         static inline auto previsit( pass_t & pass, const node_t * & node, int ) -> decltype( pass.previsit( node ), void() ) {
+        template<typename core_t, typename node_t>
+        static inline auto previsit( core_t & core, const node_t * & node, int ) -> decltype( core.previsit( node ), void() ) {
                 static_assert(
                         is_valid_previsit<pass_t, node_t>::value,
+                        is_valid_previsit<core_t, node_t>::value,
                         "Previsit may not change the type of the node. It must return its paremeter or void."
                 );
 …
                 __assign<
                         std::is_void<
                                 decltype( pass.previsit( node ) )
+                                decltype( core.previsit( node ) )
                         >::value
                 >::result( pass, node );
+        }
         template<typename pass_t, typename node_t>
         static inline auto previsit( pass_t &, const node_t *, long ) {}
+                >::result( core, node );
+        }
+        template<typename core_t, typename node_t>
+        static inline auto previsit( core_t &, const node_t *, long ) {}
         // PostVisit : never mutates the passed pointer but may return a different node
         template<typename pass_t, typename node_t>
         static inline auto postvisit( pass_t & pass, const node_t * node, int ) ->
                 decltype( pass.postvisit( node ), node->accept( *(Visitor*)nullptr ) )
+        template<typename core_t, typename node_t>
+        static inline auto postvisit( core_t & core, const node_t * node, int ) ->
+                decltype( core.postvisit( node ), node->accept( *(Visitor*)nullptr ) )
+        {
                 return __return<
                         std::is_void<
                                 decltype( pass.postvisit( node ) )
+                                decltype( core.postvisit( node ) )
                         >::value
                 >::result( pass, node );
+        }
         template<typename pass_t, typename node_t>
         static inline const node_t * postvisit( pass_t &, const node_t * node, long ) { return node; }
+                >::result( core, node );
+        }
+        template<typename core_t, typename node_t>
+        static inline const node_t * postvisit( core_t &, const node_t * node, long ) { return node; }
         //-------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 …
         // The type is not strictly enforced but does match the accessory
         #define FIELD_PTR( name, default_type ) \
         template< typename pass_t > \
         static inline auto name( pass_t & pass, int ) -> decltype( &pass.name ) { return &pass.name; } \
+        template< typename core_t > \
+        static inline auto name( core_t & core, int ) -> decltype( &core.name ) { return &core.name; } \
+        \
         template< typename pass_t > \
         static inline default_type * name( pass_t &, long ) { return nullptr; }
+        template< typename core_t > \
+        static inline default_type * name( core_t &, long ) { return nullptr; }
         // List of fields and their expected types
         FIELD_PTR( env, const ast::TypeSubstitution * )
+        FIELD_PTR( typeSubs, const ast::TypeSubstitution * )
         FIELD_PTR( stmtsToAddBefore, std::list< ast::ptr< ast::Stmt > > )
         FIELD_PTR( stmtsToAddAfter , std::list< ast::ptr< ast::Stmt > > )
 …
         FIELD_PTR( visit_children, __pass::bool_ref )
         FIELD_PTR( at_cleanup, __pass::at_cleanup_t )
         FIELD_PTR( visitor, ast::Pass<pass_t> * const )
+        FIELD_PTR( visitor, ast::Pass<core_t> * const )
         // Remove the macro to make sure we don't clash
         #undef FIELD_PTR
+        template< typename core_t >
+        static inline auto beginTrace(core_t &, int) -> decltype( core_t::traceId, void() ) {
+                // Stats::Heap::stacktrace_push(core_t::traceId);
+        }
+        template< typename core_t >
+        static inline auto endTrace(core_t &, int) -> decltype( core_t::traceId, void() ) {
+                // Stats::Heap::stacktrace_pop();
+        }
+        template< typename core_t >
+        static void beginTrace(core_t &, long) {}
+        template< typename core_t >
+        static void endTrace(core_t &, long) {}
+        // Allows visitor to handle an error on top-level declarations, and possibly suppress the error.
+        // If onError() returns false, the error will be ignored. By default, it returns true.
+        template< typename core_t >
+        static bool on_error (core_t &, ptr<Decl> &, long) { return true; }
+        template< typename core_t >
+        static auto on_error (core_t & core, ptr<Decl> & decl, int) -> decltype(core.on_error(decl)) {
+                return core.on_error(decl);
+        }
         // Another feature of the templated visitor is that it calls beginScope()/endScope() for compound statement.
 …
         // detect it using the same strategy
         namespace scope {
                 template<typename pass_t>
                 static inline auto enter( pass_t & pass, int ) -> decltype( pass.beginScope(), void() ) {
                         pass.beginScope();
+                }
                 template<typename pass_t>
                 static inline void enter( pass_t &, long ) {}
                 template<typename pass_t>
                 static inline auto leave( pass_t & pass, int ) -> decltype( pass.endScope(), void() ) {
                         pass.endScope();
+                }
                 template<typename pass_t>
                 static inline void leave( pass_t &, long ) {}
         };
         // Finally certain pass desire an up to date symbol table automatically
+                template<typename core_t>
+                static inline auto enter( core_t & core, int ) -> decltype( core.beginScope(), void() ) {
+                        core.beginScope();
+                }
+                template<typename core_t>
+                static inline void enter( core_t &, long ) {}
+                template<typename core_t>
+                static inline auto leave( core_t & core, int ) -> decltype( core.endScope(), void() ) {
+                        core.endScope();
+                }
+                template<typename core_t>
+                static inline void leave( core_t &, long ) {}
+        } // namespace scope
+        // Certain passes desire an up to date symbol table automatically
         // detect the presence of a member name `symtab` and call all the members appropriately
         namespace symtab {
                 // Some simple scoping rules
                 template<typename pass_t>
                 static inline auto enter( pass_t & pass, int ) -> decltype( pass.symtab.enterScope(), void() ) {
                         pass.symtab.enterScope();
+                }
                 template<typename pass_t>
                 static inline auto enter( pass_t &, long ) {}
                 template<typename pass_t>
                 static inline auto leave( pass_t & pass, int ) -> decltype( pass.symtab.leaveScope(), void() ) {
                         pass.symtab.leaveScope();
+                }
                 template<typename pass_t>
                 static inline auto leave( pass_t &, long ) {}
+                template<typename core_t>
+                static inline auto enter( core_t & core, int ) -> decltype( core.symtab, void() ) {
+                        core.symtab.enterScope();
+                }
+                template<typename core_t>
+                static inline auto enter( core_t &, long ) {}
+                template<typename core_t>
+                static inline auto leave( core_t & core, int ) -> decltype( core.symtab, void() ) {
+                        core.symtab.leaveScope();
+                }
+                template<typename core_t>
+                static inline auto leave( core_t &, long ) {}
                 // The symbol table has 2 kind of functions mostly, 1 argument and 2 arguments
                 // Create macro to condense these common patterns
                 #define SYMTAB_FUNC1( func, type ) \
                 template<typename pass_t> \
                 static inline auto func( pass_t & pass, int, type arg ) -> decltype( pass.symtab.func( arg ), void() ) {\
                         pass.symtab.func( arg ); \
+                template<typename core_t> \
+                static inline auto func( core_t & core, int, type arg ) -> decltype( core.symtab.func( arg ), void() ) {\
+                        core.symtab.func( arg ); \
                 } \
+                \
                 template<typename pass_t> \
                 static inline void func( pass_t &, long, type ) {}
+                template<typename core_t> \
+                static inline void func( core_t &, long, type ) {}
                 #define SYMTAB_FUNC2( func, type1, type2 ) \
                 template<typename pass_t> \
                 static inline auto func( pass_t & pass, int, type1 arg1, type2 arg2 ) -> decltype( pass.symtab.func( arg1, arg2 ), void () ) {\
                         pass.symtab.func( arg1, arg2 ); \
+                template<typename core_t> \
+                static inline auto func( core_t & core, int, type1 arg1, type2 arg2 ) -> decltype( core.symtab.func( arg1, arg2 ), void () ) {\
+                        core.symtab.func( arg1, arg2 ); \
                 } \
+                        \
                 template<typename pass_t> \
                 static inline void func( pass_t &, long, type1, type2 ) {}
+                template<typename core_t> \
+                static inline void func( core_t &, long, type1, type2 ) {}
                 SYMTAB_FUNC1( addId     , const DeclWithType *  );
 …
                 SYMTAB_FUNC1( addUnion  , const UnionDecl *     );
                 SYMTAB_FUNC1( addTrait  , const TraitDecl *     );
                 SYMTAB_FUNC2( addWith   , const std::vector< ptr<Expr> > &, const Node * );
+                SYMTAB_FUNC2( addWith   , const std::vector< ptr<Expr> > &, const Decl * );
                 // A few extra functions have more complicated behaviour, they are hand written
                 template<typename pass_t>
                 static inline auto addStructFwd( pass_t & pass, int, const ast::StructDecl * decl ) -> decltype( pass.symtab.addStruct( decl ), void() ) {
+                template<typename core_t>
+                static inline auto addStructFwd( core_t & core, int, const ast::StructDecl * decl ) -> decltype( core.symtab.addStruct( decl ), void() ) {
                         ast::StructDecl * fwd = new ast::StructDecl( decl->location, decl->name );
                         fwd->params = decl->params;
                         pass.symtab.addStruct( fwd );
+                }
                 template<typename pass_t>
                 static inline void addStructFwd( pass_t &, long, const ast::StructDecl * ) {}
                 template<typename pass_t>
                 static inline auto addUnionFwd( pass_t & pass, int, const ast::UnionDecl * decl ) -> decltype( pass.symtab.addUnion( decl ), void() ) {
+                        core.symtab.addStruct( fwd );
+                }
+                template<typename core_t>
+                static inline void addStructFwd( core_t &, long, const ast::StructDecl * ) {}
+                template<typename core_t>
+                static inline auto addUnionFwd( core_t & core, int, const ast::UnionDecl * decl ) -> decltype( core.symtab.addUnion( decl ), void() ) {
                         UnionDecl * fwd = new UnionDecl( decl->location, decl->name );
                         fwd->params = decl->params;
                         pass.symtab.addUnion( fwd );
+                }
                 template<typename pass_t>
                 static inline void addUnionFwd( pass_t &, long, const ast::UnionDecl * ) {}
                 template<typename pass_t>
                 static inline auto addStruct( pass_t & pass, int, const std::string & str ) -> decltype( pass.symtab.addStruct( str ), void() ) {
                         if ( ! pass.symtab.lookupStruct( str ) ) {
                                 pass.symtab.addStruct( str );
+                        core.symtab.addUnion( fwd );
+                }
+                template<typename core_t>
+                static inline void addUnionFwd( core_t &, long, const ast::UnionDecl * ) {}
+                template<typename core_t>
+                static inline auto addStruct( core_t & core, int, const std::string & str ) -> decltype( core.symtab.addStruct( str ), void() ) {
+                        if ( ! core.symtab.lookupStruct( str ) ) {
+                                core.symtab.addStruct( str );
+                        }
+                }
                 template<typename pass_t>
                 static inline void addStruct( pass_t &, long, const std::string & ) {}
                 template<typename pass_t>
                 static inline auto addUnion( pass_t & pass, int, const std::string & str ) -> decltype( pass.symtab.addUnion( str ), void() ) {
                         if ( ! pass.symtab.lookupUnion( str ) ) {
                                 pass.symtab.addUnion( str );
+                template<typename core_t>
+                static inline void addStruct( core_t &, long, const std::string & ) {}
+                template<typename core_t>
+                static inline auto addUnion( core_t & core, int, const std::string & str ) -> decltype( core.symtab.addUnion( str ), void() ) {
+                        if ( ! core.symtab.lookupUnion( str ) ) {
+                                core.symtab.addUnion( str );
+                        }
+                }
                 template<typename pass_t>
                 static inline void addUnion( pass_t &, long, const std::string & ) {}
+                template<typename core_t>
+                static inline void addUnion( core_t &, long, const std::string & ) {}
                 #undef SYMTAB_FUNC1
                 #undef SYMTAB_FUNC2
+        };
+};
+};
+        } // namespace symtab
+        // Some passes need to mutate TypeDecl and properly update their pointing TypeInstType.
+        // Detect the presence of a member name `subs` and call all members appropriately
+        namespace forall {
+                // Some simple scoping rules
+                template<typename core_t>
+                static inline auto enter( core_t & core, int, const ast::FunctionType * type )
+                -> decltype( core.subs, void() ) {
+                        if ( ! type->forall.empty() ) core.subs.beginScope();
+                }
+                template<typename core_t>
+                static inline auto enter( core_t &, long, const ast::FunctionType * ) {}
+                template<typename core_t>
+                static inline auto leave( core_t & core, int, const ast::FunctionType * type )
+                -> decltype( core.subs, void() ) {
+                        if ( ! type->forall.empty() ) { core.subs.endScope(); }
+                }
+                template<typename core_t>
+                static inline auto leave( core_t &, long, const ast::FunctionType * ) {}
+                // Replaces a TypeInstType's base TypeDecl according to the table
+                template<typename core_t>
+                static inline auto replace( core_t & core, int, const ast::TypeInstType *& inst )
+                -> decltype( core.subs, void() ) {
+                        inst = ast::mutate_field(
+                                inst, &ast::TypeInstType::base, core.subs.replace( inst->base ) );
+                }
+                template<typename core_t>
+                static inline auto replace( core_t &, long, const ast::TypeInstType *& ) {}
+        } // namespace forall
+        // Result extraction from a core. Three mutually exclusive shapes are supported and told
+        // apart by the tag parameter (presumably callers pass a `char` so that the overloads
+        // are preferred in the order char > int > long -- TODO confirm at the call site):
+        //   - `core.result()` is callable: return what it returns;
+        template<typename core_t>
+        static inline auto get_result( core_t & core, char ) -> decltype( core.result() ) {
+                return core.result();
+        }
+        //   - `core.result` is a data member: return it, using the member's declared type;
+        template<typename core_t>
+        static inline auto get_result( core_t & core, int ) -> decltype( core.result ) {
+                return core.result;
+        }
+        //   - neither: the core produces no result.
+        template<typename core_t>
+        static inline void get_result( core_t &, long ) {
+        }
+} // namespace __pass
+} // namespace ast

src/AST/Print.cpp

-              r3c64c668
+              r58fe85a
 #include "Type.hpp"
 #include "TypeSubstitution.hpp"
+#include "CompilationState.h"
 #include "Common/utility.h" // for group_iterate
 …
 template <typename C, typename... T>
+constexpr auto make_array(T&&... values) ->
+        array<C,sizeof...(T)>
+constexpr array<C,sizeof...(T)> make_array(T&&... values)
+{
         return array<C,sizeof...(T)>{
 …
         void print( const ast::Expr::InferUnion & inferred, unsigned level = 0 ) {
+                switch ( inferred.mode ) {
+                case ast::Expr::InferUnion::Empty: return;
+                case ast::Expr::InferUnion::Slots: {
+                        os << indent << "with " << inferred.data.resnSlots.size()
+                if (inferred.data.resnSlots && !inferred.data.resnSlots->empty()) {
+                        os << indent << "with " << inferred.data.resnSlots->size()
                            << " pending inference slots" << endl;
+                        return;
+                }
+                case ast::Expr::InferUnion::Params: {
+                }
+                if (inferred.data.inferParams && !inferred.data.inferParams->empty()) {
                         os << indent << "with inferred parameters " << level << ":" << endl;
                         ++indent;
                         for ( const auto & i : inferred.data.inferParams ) {
+                        for ( const auto & i : *inferred.data.inferParams ) {
                                 os << indent;
                                 short_print( Decl::fromId( i.second.decl ) );
+                                short_print( i.second.declptr );
                                 os << endl;
                                 print( i.second.expr->inferred, level+1 );
+                        }
                         --indent;
+                        return;
+                }
+                }
+        }
+        void print( const ast::ParameterizedType::ForallList & forall ) {
+                }
+        }
+        void print( const ast::FunctionType::ForallList & forall ) {
                 if ( forall.empty() ) return;
                 os << "forall" << endl;
                 ++indent;
                 printAll( forall );
+                os << indent;
+                --indent;
+        }
+        void print( const ast::FunctionType::AssertionList & assts ) {
+                if (assts.empty()) return;
+                os << "with assertions" << endl;
+                ++indent;
+                printAll(assts);
                 os << indent;
                 --indent;
 …
         void preprint( const ast::NamedTypeDecl * node ) {
+                if ( ! node->name.empty() ) os << node->name << ": ";
+                if ( ! node->name.empty() ) {
+                        os << node->name << ": ";
+                }
                 if ( ! short_mode && node->linkage != Linkage::Cforall ) {
 …
+                }
+                if ( ! node->params.empty() ) {
+                        os << endl << indent << "... with parameters" << endl;
+                        ++indent;
+                        printAll( node->params );
+                        --indent;
+                }
+                if ( ! short_mode && ! node->assertions.empty() ) {
+                if ( ! node->assertions.empty() ) {
                         os << endl << indent << "... with assertions" << endl;
                         ++indent;
 …
                 print( node->inferred );
+                if ( node->result ) {
+                        os << endl << indent << "... with resolved type:" << endl;
+                        ++indent;
+                        os << indent;
+                        node->result->accept( *this );
+                        --indent;
+                }
                 if ( node->env ) {
                         os << endl << indent << "... with environment:" << endl;
 …
+        }
         void preprint( const ast::ParameterizedType * node ) {
+        void preprint( const ast::FunctionType * node ) {
                 print( node->forall );
+                print( node->assertions );
                 print( node->qualifiers );
+        }
+        void preprint( const ast::ReferenceToType * node ) {
+                print( node->forall );
+        void preprint( const ast::BaseInstType * node ) {
                 print( node->attributes );
                 print( node->qualifiers );
 …
+        }
+        virtual const ast::Stmt * visit( const ast::SuspendStmt * node ) override final {
+                os << "Suspend Statement";
+                switch (node->type) {
+                        case ast::SuspendStmt::None     : os << " with implicit target"; break;
+                        case ast::SuspendStmt::Generator: os << " for generator"; break;
+                        case ast::SuspendStmt::Coroutine: os << " for coroutine"; break;
+                }
+                os << endl;
+                ++indent;
+                if(node->then) {
+                        os << indent << " with post statement :" << endl;
+                        safe_print( node->then );
+                }
+                ++indent;
+                return node;
+        }
         virtual const ast::Stmt * visit( const ast::WaitForStmt * node ) override final {
                 os << "Waitfor Statement" << endl;
 …
         virtual const ast::Expr * visit( const ast::CastExpr * node ) override final {
                 ++indent;
                 os << (node->isGenerated ? "Generated" : "Explicit") << " cast of:" << endl << indent;
+                os << (node->isGenerated ? "Generated" : "Explicit") << " Cast of:" << endl << indent;
                 safe_print( node->arg );
                 os << endl << indent-1 << "... to:";
 …
         virtual const ast::Type * visit( const ast::TypeInstType * node ) override final {
                 preprint( node );
+                os << "instance of type " << node->name
+                const auto & _name = deterministic_output && isUnboundType(node) ? "[unbound]" : node->typeString();
+                os << "instance of type " << _name
                    << " (" << (node->kind == ast::TypeDecl::Ftype ? "" : "not ") << "function type)";
                 print( node->params );
 …
                 os << indent << "Types:" << endl;
                 for ( const auto& i : *node ) {
                         os << indent+1 << i.first << " -> ";
+                        os << indent+1 << i.first.typeString() << " -> ";
                         indent += 2;
                         safe_print( i.second );
-                        indent -= 2;
-                        os << endl;
+                }
-                os << indent << "Non-types:" << endl;
-                for ( auto i = node->beginVar(); i != node->endVar(); ++i ) {
-                        os << indent+1 << i->first << " -> ";
-                        indent += 2;
-                        safe_print( i->second );
                         indent -= 2;
                         os << endl;

src/AST/Stmt.hpp

-              r3c64c668
+              r58fe85a
 // Must be included in *all* AST classes; should be #undef'd at the end of the file
+#define MUTATE_FRIEND template<typename node_t> friend node_t * mutate(const node_t * node);
+#define MUTATE_FRIEND \
+    template<typename node_t> friend node_t * mutate(const node_t * node); \
+        template<typename node_t> friend node_t * shallowCopy(const node_t * node);
 namespace ast {
 …
 };
+/// Suspend statement: an optional compound statement (`then`) plus the kind of suspend target
+class SuspendStmt final : public Stmt {
+public:
+        ptr<CompoundStmt> then; ///< optional block; may be null (the printer reports it as the "post statement")
+        enum Type { None, Coroutine, Generator } type = None; ///< suspend target; None is printed as "with implicit target"
+        SuspendStmt( const CodeLocation & loc, const CompoundStmt * then, Type type, std::vector<Label> && labels = {} )
+        : Stmt(loc, std::move(labels)), then(then), type(type) {}
+        const Stmt * accept( Visitor & v ) const override { return v.visit( this ); } // hands this node to the visitor's visit overload
+private:
+        SuspendStmt * clone() const override { return new SuspendStmt{ *this }; } // copy-constructs a new heap-allocated node
+        MUTATE_FRIEND // friend declarations for the mutate helpers named in the macro
+};
 /// Wait for concurrency statement `when (...) waitfor (... , ...) ... timeout(...) ... else ...`
 class WaitForStmt final : public Stmt {
 …
 class ImplicitCtorDtorStmt final : public Stmt {
 public:
         readonly<Stmt> callStmt;
+        ptr<Stmt> callStmt;
         ImplicitCtorDtorStmt( const CodeLocation & loc, const Stmt * callStmt,

src/AST/SymbolTable.cpp

-              r3c64c668
+              r58fe85a
+}
+SymbolTable::SpecialFunctionKind SymbolTable::getSpecialFunctionKind(const std::string & name) {
+        // Operator spellings that get a dedicated table, paired with their kind.
+        static const struct { const char * spelling; SpecialFunctionKind kind; } specials[] = {
+                { "?{}" , CTOR   },
+                { "^?{}", DTOR   },
+                { "?=?" , ASSIGN },
+        };
+        for ( const auto & entry : specials ) {
+                if ( name == entry.spelling ) return entry.kind;
+        }
+        // NUMBER_OF_KINDS doubles as the "not a special function" answer.
+        return NUMBER_OF_KINDS;
+}
 std::vector<SymbolTable::IdData> SymbolTable::lookupId( const std::string &id ) const {
+        static Stats::Counters::CounterGroup * name_lookup_stats = Stats::Counters::build<Stats::Counters::CounterGroup>("Name Lookup Stats");
+        static std::map<std::string, Stats::Counters::SimpleCounter *> lookups_by_name;
+        static std::map<std::string, Stats::Counters::SimpleCounter *> candidates_by_name;
+        SpecialFunctionKind kind = getSpecialFunctionKind(id);
+        if (kind != NUMBER_OF_KINDS) return specialLookupId(kind);
         ++*stats().lookup_calls;
         if ( ! idTable ) return {};
 …
                 out.push_back( decl.second );
+        }
+        if (Stats::Counters::enabled) {
+                if (! lookups_by_name.count(id)) {
+                        // leaks some strings, but it is because Counters do not hold them
+                        auto lookupCounterName = new std::string(id + "%count");
+                        auto candidatesCounterName = new std::string(id + "%candidate");
+                        lookups_by_name.emplace(id, new Stats::Counters::SimpleCounter(lookupCounterName->c_str(), name_lookup_stats));
+                        candidates_by_name.emplace(id, new Stats::Counters::SimpleCounter(candidatesCounterName->c_str(), name_lookup_stats));
+                }
+                (*lookups_by_name[id]) ++;
+                *candidates_by_name[id] += out.size();
+        }
+        return out;
+}
+std::vector<SymbolTable::IdData> SymbolTable::specialLookupId( SymbolTable::SpecialFunctionKind kind, const std::string & otypeKey ) const {
+        static Stats::Counters::CounterGroup * special_stats = Stats::Counters::build<Stats::Counters::CounterGroup>("Special Lookups");
+        static Stats::Counters::SimpleCounter * stat_counts[3] = {
+                Stats::Counters::build<Stats::Counters::SimpleCounter>("constructor - count", special_stats),
+                Stats::Counters::build<Stats::Counters::SimpleCounter>("destructor - count", special_stats),
+                Stats::Counters::build<Stats::Counters::SimpleCounter>("assignment - count", special_stats)
+        };
+        static Stats::Counters::SimpleCounter * stat_candidates[3] = {
+                Stats::Counters::build<Stats::Counters::SimpleCounter>("constructor - candidates", special_stats),
+                Stats::Counters::build<Stats::Counters::SimpleCounter>("destructor - candidates", special_stats),
+                Stats::Counters::build<Stats::Counters::SimpleCounter>("assignment - candidates", special_stats)
+        };
+        static Stats::Counters::SimpleCounter * num_lookup_with_key
+                = Stats::Counters::build<Stats::Counters::SimpleCounter>("keyed lookups", special_stats);
+        static Stats::Counters::SimpleCounter * num_lookup_without_key
+                = Stats::Counters::build<Stats::Counters::SimpleCounter>("unkeyed lookups", special_stats);
+        assert (kind != NUMBER_OF_KINDS);
+        ++*stats().lookup_calls;
+        if ( ! specialFunctionTable[kind] ) return {};
+        std::vector<IdData> out;
+        if (otypeKey.empty()) { // returns everything
+                ++*num_lookup_without_key;
+                for (auto & table : *specialFunctionTable[kind]) {
+                        for (auto & decl : *table.second) {
+                                out.push_back(decl.second);
+                        }
+                }
+        }
+        else {
+                ++*num_lookup_with_key;
+                ++*stats().map_lookups;
+                auto decls = specialFunctionTable[kind]->find(otypeKey);
+                if (decls == specialFunctionTable[kind]->end()) return {};
+                for (auto decl : *(decls->second)) {
+                        out.push_back(decl.second);
+                }
+        }
+        ++*stat_counts[kind];
+        *stat_candidates[kind] += out.size();
         return out;
+}
 …
                 if ( ! expr->result ) continue;
                 const Type * resTy = expr->result->stripReferences();
                 auto aggrType = dynamic_cast< const ReferenceToType * >( resTy );
+                auto aggrType = dynamic_cast< const BaseInstType * >( resTy );
                 assertf( aggrType, "WithStmt expr has non-aggregate type: %s",
                         toString( expr->result ).c_str() );
 …
+}
+void SymbolTable::addFunctionType( const FunctionType * ftype ) { // registers the declarations carried by a function type
+        addTypes( ftype->forall ); // forall list first
+        addIds( ftype->returns ); // then the return values
+        addIds( ftype->params ); // then the parameters
+}
+void SymbolTable::addFunction( const FunctionDecl * func ) {
+        // Registration order: type parameters, assertions, return values, parameters.
+        for ( auto & typeParam : func->type_params ) {
+                addType( typeParam );
+        }
+        for ( auto & assertion : func->assertions ) {
+                addId( assertion );
+        }
+        // The function type's forall list is deliberately not added here:
+        // addTypes( func->type->forall );
+        addIds( func->returns );
+        addIds( func->params );
+}
 void SymbolTable::lazyInitScope() {
 …
 namespace {
         /// gets the base type of the first parameter; decl must be a ctor/dtor/assignment function
         std::string getOtypeKey( const FunctionDecl * function ) {
                 const auto & params = function->type->params;
+        std::string getOtypeKey( const FunctionType * ftype, bool stripParams = true ) {
+                const auto & params = ftype->params;
                 assert( ! params.empty() );
                 // use base type of pointer, so that qualifiers on the pointer type aren't considered.
                 const Type * base = InitTweak::getPointerBase( params.front()->get_type() );
+                const Type * base = InitTweak::getPointerBase( params.front() );
                 assert( base );
+                return Mangle::mangle( base );
+                if (stripParams) {
+                        if (dynamic_cast<const PointerType *>(base)) return Mangle::Encoding::pointer;
+                        return Mangle::mangle( base, Mangle::Type | Mangle::NoGenericParams );
+                }
+                else
+                        return Mangle::mangle( base );
+        }
 …
                         const DeclWithType * decl, const std::string & otypeKey ) {
                 auto func = dynamic_cast< const FunctionDecl * >( decl );
                 if ( ! func || otypeKey != getOtypeKey( func ) ) return nullptr;
+                if ( ! func || otypeKey != getOtypeKey( func->type, false ) ) return nullptr;
                 return func;
+        }
 …
         bool dataIsUserDefinedFunc = ! function->linkage.is_overrideable;
         bool dataIsCopyFunc = InitTweak::isCopyFunction( function );
         std::string dataOtypeKey = getOtypeKey( function );
+        std::string dataOtypeKey = getOtypeKey( function->type, false ); // requires exact match to override autogen
         if ( dataIsUserDefinedFunc && dataIsCopyFunc ) {
 …
                 const DeclWithType * decl, SymbolTable::OnConflict handleConflicts, const Expr * baseExpr,
                 const Decl * deleter ) {
+        SpecialFunctionKind kind = getSpecialFunctionKind(decl->name);
+        if (kind == NUMBER_OF_KINDS) { // not a special decl
+                addId(decl, decl->name, idTable, handleConflicts, baseExpr, deleter);
+        }
+        else {
+                std::string key;
+                if (auto func = dynamic_cast<const FunctionDecl *>(decl)) {
+                        key = getOtypeKey(func->type);
+                }
+                else if (auto obj = dynamic_cast<const ObjectDecl *>(decl)) {
+                        key = getOtypeKey(obj->type.strict_as<PointerType>()->base.strict_as<FunctionType>());
+                }
+                else {
+                        assertf(false, "special decl with non-function type");
+                }
+                addId(decl, key, specialFunctionTable[kind], handleConflicts, baseExpr, deleter);
+        }
+}
+void SymbolTable::addId(
+                const DeclWithType * decl, const std::string & lookupKey, IdTable::Ptr & table, SymbolTable::OnConflict handleConflicts, const Expr * baseExpr,
+                const Decl * deleter ) {
         ++*stats().add_calls;
         const std::string &name = decl->name;
 …
         // ensure tables exist and add identifier
         MangleTable::Ptr mangleTable;
         if ( ! idTable ) {
                 idTable = IdTable::new_ptr();
+        if ( ! table ) {
+                table = IdTable::new_ptr();
                 mangleTable = MangleTable::new_ptr();
         } else {
                 ++*stats().map_lookups;
                 auto decls = idTable->find( name );
                 if ( decls == idTable->end() ) {
+                auto decls = table->find( lookupKey );
+                if ( decls == table->end() ) {
                         mangleTable = MangleTable::new_ptr();
                 } else {
 …
                                                 lazyInitScope();
                                                 *stats().map_mutations += 2;
                                                 idTable = idTable->set(
                                                         name,
+                                                table = table->set(
+                                                        lookupKey,
                                                         mangleTable->set(
                                                                 mangleName,
 …
         IdData data{ decl, baseExpr, deleter, scope };
         // Ensure that auto-generated ctor/dtor/assignment are deleted if necessary
+        if ( ! removeSpecialOverrides( data, mangleTable ) ) return;
+        if (table != idTable) { // adding to special table
+                if ( ! removeSpecialOverrides( data, mangleTable ) ) return;
+        }
         *stats().map_mutations += 2;
         idTable = idTable->set( name, mangleTable->set( mangleName, std::move(data) ) );
+        table = table->set( lookupKey, mangleTable->set( mangleName, std::move(data) ) );
+}
 …
                         if ( dwt->name == "" ) {
                                 const Type * t = dwt->get_type()->stripReferences();
                                 if ( auto rty = dynamic_cast<const ReferenceToType *>( t ) ) {
+                                if ( auto rty = dynamic_cast<const BaseInstType *>( t ) ) {
                                         if ( ! dynamic_cast<const StructInstType *>(rty)
                                                 && ! dynamic_cast<const UnionInstType *>(rty) ) continue;

src/AST/SymbolTable.hpp

-              r3c64c668
+              r58fe85a
 class SymbolTable final : public std::enable_shared_from_this<ast::SymbolTable> {
 public:
+        /// special functions stored in dedicated tables, with different lookup keys
+        enum SpecialFunctionKind {CTOR, DTOR, ASSIGN, NUMBER_OF_KINDS};
+        static SpecialFunctionKind getSpecialFunctionKind(const std::string & name);
         /// Stored information about a declaration
         struct IdData {
 …
         UnionTable::Ptr unionTable;    ///< union namespace
         TraitTable::Ptr traitTable;    ///< trait namespace
+        IdTable::Ptr specialFunctionTable[NUMBER_OF_KINDS];
+        // using SpecialFuncTable = PersistentMap< std::string, IdTable::Ptr >; // fname (ctor/dtor/assign) - otypekey
+        // SpecialFuncTable::Ptr specialFuncTable;
         using Ptr = std::shared_ptr<const SymbolTable>;
 …
         /// Gets all declarations with the given ID
         std::vector<IdData> lookupId( const std::string &id ) const;
+        /// Gets special functions associated with a type; if no key is given, returns everything
+        std::vector<IdData> specialLookupId( SpecialFunctionKind kind, const std::string & otypeKey = "" ) const;
         /// Gets the top-most type declaration with the given ID
         const NamedTypeDecl * lookupType( const std::string &id ) const;
 …
         /// convenience function for adding all of the declarations in a function type to the indexer
         void addFunctionType( const FunctionType * ftype );
+        void addFunction( const FunctionDecl * );
 private:
 …
                 const Decl * deleter = nullptr );
+        /// common code for addId when special decls are placed into separate tables
+        void addId(
+                const DeclWithType * decl, const std::string & lookupKey, IdTable::Ptr & idTable, OnConflict handleConflicts,
+                const Expr * baseExpr = nullptr, const Decl * deleter = nullptr);
         /// adds all of the members of the Aggregate (addWith helper)
         void addMembers( const AggregateDecl * aggr, const Expr * expr, OnConflict handleConflicts );

src/AST/Type.cpp

-              r3c64c668
+              r58fe85a
 // Author           : Aaron B. Moss
 // Created On       : Mon May 13 15:00:00 2019
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Sun Dec 15 16:56:28 2019
 // Update Count     : 4
+// Last Modified By : Andrew Beach
+// Last Modified On : Thu Jul 23 14:16:00 2020
+// Update Count     : 5
 //
 …
 #include "Decl.hpp"
 #include "Init.hpp"
+#include "Common/utility.h"      // for copy, move
 #include "InitTweak/InitTweak.h" // for getPointerBase
 #include "Tuples/Tuples.h"       // for isTtype
 …
 // --- FunctionType
 namespace {
         bool containsTtype( const std::vector<ptr<DeclWithType>> & l ) {
+        bool containsTtype( const std::vector<ptr<Type>> & l ) {
                 if ( ! l.empty() ) {
                         return Tuples::isTtype( l.back()->get_type() );
+                        return Tuples::isTtype( l.back() );
+                }
                 return false;
 …
+}
+// --- ReferenceToType
+std::vector<readonly<Decl>> ReferenceToType::lookup( const std::string& name ) const {
+std::vector<readonly<Decl>> BaseInstType::lookup( const std::string& name ) const {
         assertf( aggr(), "Must have aggregate to perform lookup" );
 …
+}
 // --- StructInstType
+// --- SueInstType (StructInstType, UnionInstType, EnumInstType)
+StructInstType::StructInstType( const StructDecl * b, CV::Qualifiers q,
+        std::vector<ptr<Attribute>>&& as )
+: ReferenceToType( b->name, q, std::move(as) ), base( b ) {}
+template<typename decl_t>
+SueInstType<decl_t>::SueInstType(
+        const decl_t * b, CV::Qualifiers q, std::vector<ptr<Attribute>>&& as )
+: BaseInstType( b->name, q, move(as) ), base( b ) {}
+bool StructInstType::isComplete() const { return base ? base->body : false; }
+template<typename decl_t>
+SueInstType<decl_t>::SueInstType(
+        const base_type * b, std::vector<ptr<Expr>> && params,
+        CV::Qualifiers q, std::vector<ptr<Attribute>> && as )
+: BaseInstType( b->name, std::move(params), q, std::move(as) ), base( b ) {}
+// --- UnionInstType
+template<typename decl_t>
+bool SueInstType<decl_t>::isComplete() const {
+        return base ? base->body : false;
+}
+UnionInstType::UnionInstType( const UnionDecl * b, CV::Qualifiers q,
+        std::vector<ptr<Attribute>>&& as )
+: ReferenceToType( b->name, q, std::move(as) ), base( b ) {}
+bool UnionInstType::isComplete() const { return base ? base->body : false; }
+// --- EnumInstType
+EnumInstType::EnumInstType( const EnumDecl * b, CV::Qualifiers q,
+        std::vector<ptr<Attribute>>&& as )
+: ReferenceToType( b->name, q, std::move(as) ), base( b ) {}
+bool EnumInstType::isComplete() const { return base ? base->body : false; }
+template class SueInstType<StructDecl>;
+template class SueInstType<UnionDecl>;
+template class SueInstType<EnumDecl>;
 // --- TraitInstType
+TraitInstType::TraitInstType( const TraitDecl * b, CV::Qualifiers q,
+        std::vector<ptr<Attribute>>&& as )
+: ReferenceToType( b->name, q, std::move(as) ), base( b ) {}
+// --- TypeInstType
+TraitInstType::TraitInstType(
+        const TraitDecl * b, CV::Qualifiers q, std::vector<ptr<Attribute>>&& as )
+: BaseInstType( b->name, q, move(as) ), base( b ) {}
 void TypeInstType::set_base( const TypeDecl * b ) {
 …
 TupleType::TupleType( std::vector<ptr<Type>> && ts, CV::Qualifiers q )
 : Type( q ), types( std::move(ts) ), members() {
+: Type( q ), types( move(ts) ), members() {
         // This constructor is awkward. `TupleType` needs to contain objects so that members can be
         // named, but members without initializer nodes end up getting constructors, which breaks
 …
         for ( const Type * ty : types ) {
                 members.emplace_back( new ObjectDecl{
                         CodeLocation{}, "", ty, new ListInit( CodeLocation{}, {}, {}, MaybeConstruct ),
+                        CodeLocation{}, "", ty, new ListInit( CodeLocation{}, {}, {}, NoConstruct ),
                         Storage::Classes{}, Linkage::Cforall } );
+        }
+}
+bool isUnboundType(const Type * type) { // true only for a TypeInstType whose formal_usage is positive
+        if (auto typeInst = dynamic_cast<const TypeInstType *>(type)) {
+                // xxx - look for a type name produced by renameTyVars.
+                // TODO: once the TypeInstType representation is updated, this should properly check
+                // whether the context id is filled. This is a temporary hack for now.
+                return typeInst->formal_usage > 0;
+        }
+        return false; // anything that is not a TypeInstType is never reported as unbound
+}

src/AST/Type.hpp

-              r3c64c668
+              r58fe85a
 // Author           : Aaron B. Moss
 // Created On       : Thu May 9 10:00:00 2019
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Wed Dec 11 21:56:46 2019
 // Update Count     : 5
+// Last Modified By : Andrew Beach
+// Last Modified On : Thu Jul 23 14:15:00 2020
+// Update Count     : 6
 //
 …
 // Must be included in *all* AST classes; should be #undef'd at the end of the file
+#define MUTATE_FRIEND template<typename node_t> friend node_t * mutate(const node_t * node);
+#define MUTATE_FRIEND \
+    template<typename node_t> friend node_t * mutate(const node_t * node); \
+        template<typename node_t> friend node_t * shallowCopy(const node_t * node);
 namespace ast {
+template< typename T > class Pass;
 class Type : public Node {
 …
         bool is_volatile() const { return qualifiers.is_volatile; }
         bool is_restrict() const { return qualifiers.is_restrict; }
-        bool is_lvalue() const { return qualifiers.is_lvalue; }
         bool is_mutex() const { return qualifiers.is_mutex; }
         bool is_atomic() const { return qualifiers.is_atomic; }
 …
         Type * set_volatile( bool v ) { qualifiers.is_volatile = v; return this; }
         Type * set_restrict( bool v ) { qualifiers.is_restrict = v; return this; }
-        Type * set_lvalue( bool v ) { qualifiers.is_lvalue = v; return this; }
         Type * set_mutex( bool v ) { qualifiers.is_mutex = v; return this; }
         Type * set_atomic( bool v ) { qualifiers.is_atomic = v; return this; }
 …
         static const char *typeNames[];
         BasicType( Kind k, CV::Qualifiers q = {}, std::vector<ptr<Attribute>> && as = {} )
+        BasicType( Kind k, CV::Qualifiers q = {}, std::vector<ptr<Attribute>> && as = {} )
         : Type(q, std::move(as)), kind(k) {}
 …
 };
-/// Base type for potentially forall-qualified types
-class ParameterizedType : public Type {
-public:
-        using ForallList = std::vector<ptr<TypeDecl>>;
-        ForallList forall;
-        ParameterizedType( ForallList&& fs = {}, CV::Qualifiers q = {},
-                std::vector<ptr<Attribute>> && as = {} )
-        : Type(q, std::move(as)), forall(std::move(fs)) {}
-        ParameterizedType( CV::Qualifiers q, std::vector<ptr<Attribute>> && as = {} )
-        : Type(q, std::move(as)), forall() {}
-private:
-        virtual ParameterizedType * clone() const override = 0;
-        MUTATE_FRIEND
-};
 /// Function variable arguments flag
 enum ArgumentFlag { FixedArgs, VariableArgs };
 /// Type of a function `[R1, R2](*)(P1, P2, P3)`
+class FunctionType final : public ParameterizedType {
+public:
+        std::vector<ptr<DeclWithType>> returns;
+        std::vector<ptr<DeclWithType>> params;
+class FunctionType final : public Type {
+public:
+        using ForallList = std::vector<ptr<TypeInstType>>;
+        using AssertionList = std::vector<ptr<VariableExpr>>;
+        ForallList forall;
+        AssertionList assertions;
+        std::vector<ptr<Type>> returns;
+        std::vector<ptr<Type>> params;
         /// Does the function accept a variable number of arguments following the arguments specified
 …
         FunctionType( ArgumentFlag va = FixedArgs, CV::Qualifiers q = {} )
+        : ParameterizedType(q), returns(), params(), isVarArgs(va) {}
+        : Type(q), returns(), params(), isVarArgs(va) {}
+        FunctionType( const FunctionType & o ) = default;
         /// true if either the parameters or return values contain a tttype
 …
 /// base class for types that refer to types declared elsewhere (aggregates and typedefs)
 class ReferenceToType : public ParameterizedType {
+class BaseInstType : public Type {
 public:
         std::vector<ptr<Expr>> params;
 …
         bool hoistType = false;
+        ReferenceToType( const std::string& n, CV::Qualifiers q = {},
+                std::vector<ptr<Attribute>> && as = {} )
+        : ParameterizedType(q, std::move(as)), params(), name(n) {}
+        BaseInstType(
+                const std::string& n, CV::Qualifiers q = {}, std::vector<ptr<Attribute>> && as = {} )
+        : Type(q, std::move(as)), params(), name(n) {}
+        BaseInstType(
+                const std::string& n, std::vector<ptr<Expr>> && params,
+                CV::Qualifiers q = {}, std::vector<ptr<Attribute>> && as = {} )
+        : Type(q, std::move(as)), params(std::move(params)), name(n) {}
+        BaseInstType( const BaseInstType & o ) = default;
         /// Gets aggregate declaration this type refers to
 …
 private:
+        virtual ReferenceToType * clone() const override = 0;
+        MUTATE_FRIEND
+};
+/// instance of struct type
+class StructInstType final : public ReferenceToType {
+public:
+        readonly<StructDecl> base;
+        StructInstType( const std::string& n, CV::Qualifiers q = {},
+                std::vector<ptr<Attribute>> && as = {} )
+        : ReferenceToType( n, q, std::move(as) ), base() {}
+        StructInstType( const StructDecl * b, CV::Qualifiers q = {},
+                std::vector<ptr<Attribute>> && as = {} );
+        virtual BaseInstType * clone() const override = 0;
+        MUTATE_FRIEND
+};
+// Common implementation for the SUE instance types. Not to be used directly.
+template<typename decl_t>
+class SueInstType final : public BaseInstType {
+public:
+        using base_type = decl_t;
+        readonly<decl_t> base;
+        SueInstType(
+                const std::string& n, CV::Qualifiers q = {}, std::vector<ptr<Attribute>> && as = {} )
+        : BaseInstType( n, q, std::move(as) ), base() {}
+        SueInstType(
+                const base_type * b, CV::Qualifiers q = {}, std::vector<ptr<Attribute>> && as = {} );
+        SueInstType(
+                const base_type * b, std::vector<ptr<Expr>> && params,
+                CV::Qualifiers q = {}, std::vector<ptr<Attribute>> && as = {} );
         bool isComplete() const override;
+        const StructDecl * aggr() const override { return base; }
+        const Type * accept( Visitor & v ) const override { return v.visit( this ); }
+private:
+        StructInstType * clone() const override { return new StructInstType{ *this }; }
+        MUTATE_FRIEND
+};
+/// instance of union type
+class UnionInstType final : public ReferenceToType {
+public:
+        readonly<UnionDecl> base;
+        UnionInstType( const std::string& n, CV::Qualifiers q = {},
+                std::vector<ptr<Attribute>> && as = {} )
+        : ReferenceToType( n, q, std::move(as) ), base() {}
+        UnionInstType( const UnionDecl * b, CV::Qualifiers q = {},
+                std::vector<ptr<Attribute>> && as = {} );
+        bool isComplete() const override;
+        const UnionDecl * aggr() const override { return base; }
+        const Type * accept( Visitor & v ) const override { return v.visit( this ); }
+private:
+        UnionInstType * clone() const override { return new UnionInstType{ *this }; }
+        MUTATE_FRIEND
+};
+/// instance of enum type
+class EnumInstType final : public ReferenceToType {
+public:
+        readonly<EnumDecl> base;
+        EnumInstType( const std::string& n, CV::Qualifiers q = {},
+                std::vector<ptr<Attribute>> && as = {} )
+        : ReferenceToType( n, q, std::move(as) ), base() {}
+        EnumInstType( const EnumDecl * b, CV::Qualifiers q = {},
+                std::vector<ptr<Attribute>> && as = {} );
+        bool isComplete() const override;
+        const EnumDecl * aggr() const override { return base; }
+        const Type * accept( Visitor & v ) const override { return v.visit( this ); }
+private:
+        EnumInstType * clone() const override { return new EnumInstType{ *this }; }
+        MUTATE_FRIEND
+};
+/// instance of trait type
+class TraitInstType final : public ReferenceToType {
+        const decl_t * aggr() const override { return base; }
+        const Type * accept( Visitor & v ) const override { return v.visit( this ); }
+private:
+        SueInstType<decl_t> * clone() const override { return new SueInstType<decl_t>{ *this }; }
+        MUTATE_FRIEND
+};
+/// An instance of a struct type.
+using StructInstType = SueInstType<StructDecl>;
+/// An instance of a union type.
+using UnionInstType = SueInstType<UnionDecl>;
+/// An instance of an enum type.
+using EnumInstType = SueInstType<EnumDecl>;
+/// An instance of a trait type.
+class TraitInstType final : public BaseInstType {
 public:
         readonly<TraitDecl> base;
+        TraitInstType( const std::string& n, CV::Qualifiers q = {},
+                std::vector<ptr<Attribute>> && as = {} )
+        : ReferenceToType( n, q, std::move(as) ), base() {}
+        TraitInstType( const TraitDecl * b, CV::Qualifiers q = {},
+                std::vector<ptr<Attribute>> && as = {} );
+        TraitInstType(
+                const std::string& n, CV::Qualifiers q = {}, std::vector<ptr<Attribute>> && as = {} )
+        : BaseInstType( n, q, std::move(as) ), base() {}
+        TraitInstType(
+                const TraitDecl * b, CV::Qualifiers q = {}, std::vector<ptr<Attribute>> && as = {} );
         // not meaningful for TraitInstType
 …
 /// instance of named type alias (typedef or variable)
 class TypeInstType final : public ReferenceToType {
+class TypeInstType final : public BaseInstType {
 public:
         readonly<TypeDecl> base;
+        // Previously handled by renameTyVars; integer fields are now used directly instead of synthesized strings.
+        // A nonzero value of formal_usage indicates a formal type (only used in function types).
+        // A zero value of formal_usage indicates an actual type (referenced inside the bodies of parametric structs and functions).
         TypeDecl::Kind kind;
+        TypeInstType( const std::string& n, const TypeDecl * b, CV::Qualifiers q = {},
+        int formal_usage = 0;
+        int expr_id = 0;
+        // compact representation used for map lookups.
+        struct TypeEnvKey { // key made of (base declaration pointer, formal_usage, expr_id)
+                const TypeDecl * base;
+                int formal_usage;
+                int expr_id;
+                TypeEnvKey() = default; // NOTE(review): no member initializers are given here — confirm every default-constructed key is assigned before use
+                TypeEnvKey(const TypeDecl * base, int formal_usage = 0, int expr_id = 0): base(base), formal_usage(formal_usage), expr_id(expr_id) {}
+                TypeEnvKey(const TypeInstType & inst): base(inst.base), formal_usage(inst.formal_usage), expr_id(inst.expr_id) {} // not explicit, so a TypeInstType converts to a key implicitly
+                std::string typeString() const { return std::string("_") + std::to_string(formal_usage) + "_" + std::to_string(expr_id) + "_" + base->name; } // "_<formal_usage>_<expr_id>_<base name>"; dereferences base
+                bool operator==(const TypeEnvKey & other) const { return base == other.base && formal_usage == other.formal_usage && expr_id == other.expr_id; } // member-wise equality; base is compared by pointer
+        };
+        bool operator==(const TypeInstType & other) const { return base == other.base && formal_usage == other.formal_usage && expr_id == other.expr_id; }
+        TypeInstType(
+                const std::string& n, const TypeDecl * b, CV::Qualifiers q = {},
                 std::vector<ptr<Attribute>> && as = {} )
         : ReferenceToType( n, q, std::move(as) ), base( b ), kind( b->kind ) {}
+        : BaseInstType( n, q, std::move(as) ), base( b ), kind( b->kind ) {}
         TypeInstType( const std::string& n, TypeDecl::Kind k, CV::Qualifiers q = {},
                 std::vector<ptr<Attribute>> && as = {} )
+        : ReferenceToType( n, q, std::move(as) ), base(), kind( k ) {}
+        : BaseInstType( n, q, std::move(as) ), base(), kind( k ) {}
+        TypeInstType( const TypeInstType & o ) = default;
+        TypeInstType( const TypeEnvKey & key )
+        : BaseInstType(key.base->name), base(key.base), kind(key.base->kind), formal_usage(key.formal_usage), expr_id(key.expr_id) {}
         /// sets `base`, updating `kind` correctly
 …
         const Type * accept( Visitor & v ) const override { return v.visit( this ); }
+        std::string typeString() const { // printable name; when formal_usage > 0 it is prefixed with "_<formal_usage>_<expr_id>_"
+                if (formal_usage > 0) return std::string("_") + std::to_string(formal_usage) + "_" + std::to_string(expr_id) + "_" + name;
+                else return name; // formal_usage <= 0: the plain name is returned unchanged
+        }
 private:
         TypeInstType * clone() const override { return new TypeInstType{ *this }; }
 …
 };
+bool isUnboundType(const Type * type);
+}
+namespace std {
+        template<>
+        struct hash<typename ast::TypeInstType::TypeEnvKey> { // hash specialization so TypeEnvKey can key unordered containers
+                size_t operator() (const ast::TypeInstType::TypeEnvKey & x) const {
+                        const size_t p = 1000007; // multiplier used to mix the three fields together
+                        size_t res = reinterpret_cast<size_t>(x.base); // start from the address of the base declaration
+                        res = p * res + x.formal_usage;
+                        res = p * res + x.expr_id;
+                        return res; // combines exactly the fields that TypeEnvKey::operator== compares
+                }
+        };
+}

src/AST/TypeEnvironment.cpp

-              r3c64c668
+              r58fe85a
 #include "ResolvExpr/Unify.h"      // for unifyInexact
 #include "Tuples/Tuples.h"         // for isTtype
+#include "CompilationState.h"
 using ResolvExpr::WidenMode;
 …
         for ( const auto & i : open ) {
                 if ( first ) { first = false; } else { out << ' '; }
                 out << i.first << "(" << i.second << ")";
+                out << i.first.typeString() << "(" << i.second << ")";
+        }
+}
 void print( std::ostream & out, const EqvClass & clz, Indenter indent ) {
+        out << "( ";
+        std::copy( clz.vars.begin(), clz.vars.end(), std::ostream_iterator< std::string >( out, " " ) );
+        out << "(";
+        bool first = true;
+        for(const auto & var : clz.vars) {
+                if(first) first = false;
+                else out << " ";
+                if( deterministic_output ) out << "[unbound]";
+                else out << "_" << var.formal_usage << "_" << var.expr_id << "_";
+                out << var.base->name;
+        }
         out << ")";
         if ( clz.bound ) {
                 out << " -> ";
 …
+}
 const EqvClass * TypeEnvironment::lookup( const std::string & var ) const {
+const EqvClass * TypeEnvironment::lookup( const TypeInstType::TypeEnvKey & var ) const {
         for ( ClassList::const_iterator i = env.begin(); i != env.end(); ++i ) {
                 if ( i->vars.find( var ) != i->vars.end() ) return &*i;
 …
+                                }
+                        }
                         i = next;  // go to next node even if this removed
+                }
 …
+}
 void TypeEnvironment::add( const ParameterizedType::ForallList & tyDecls ) {
         for ( const TypeDecl * tyDecl : tyDecls ) {
+void TypeEnvironment::add( const FunctionType::ForallList & tyDecls ) {
+        for ( auto & tyDecl : tyDecls ) {
                 env.emplace_back( tyDecl );
+        }
 …
 void TypeEnvironment::writeToSubstitution( TypeSubstitution & sub ) const {
         for ( const auto & clz : env ) {
+                std::string clzRep;
+                TypeInstType::TypeEnvKey clzRep;
+                bool first = true;
                 for ( const auto & var : clz.vars ) {
                         if ( clz.bound ) {
                                 sub.add( var, clz.bound );
                         } else if ( clzRep.empty() ) {
+                        } else if ( first ) {
                                 clzRep = var;
+                                first = false;
                         } else {
                                 sub.add( var, new TypeInstType{ clzRep, clz.data.kind } );
+                                sub.add( var, new TypeInstType{ clzRep } );
+                        }
+                }
 …
         struct Occurs : public ast::WithVisitorRef<Occurs> {
                 bool result;
                 std::set< std::string > vars;
+                std::unordered_set< TypeInstType::TypeEnvKey > vars;
                 const TypeEnvironment & tenv;
                 Occurs( const std::string & var, const TypeEnvironment & env )
+                Occurs( const TypeInstType::TypeEnvKey & var, const TypeEnvironment & env )
                 : result( false ), vars(), tenv( env ) {
                         if ( const EqvClass * clz = tenv.lookup( var ) ) {
 …
                 void previsit( const TypeInstType * typeInst ) {
                         if ( vars.count( typeInst->name ) ) {
+                        if ( vars.count( *typeInst ) ) {
                                 result = true;
                         } else if ( const EqvClass * clz = tenv.lookup( typeInst->name ) ) {
+                        } else if ( const EqvClass * clz = tenv.lookup( *typeInst ) ) {
                                 if ( clz->bound ) {
                                         clz->bound->accept( *visitor );
 …
         /// true if `var` occurs in `ty` under `env`
         bool occurs( const Type * ty, const std::string & var, const TypeEnvironment & env ) {
+        bool occurs( const Type * ty, const TypeInstType::TypeEnvKey & var, const TypeEnvironment & env ) {
                 Pass<Occurs> occur{ var, env };
                 maybe_accept( ty, occur );
                 return occur.pass.result;
+        }
+}
 bool TypeEnvironment::combine(
+                return occur.core.result;
+        }
+}
+bool TypeEnvironment::combine(
                 const TypeEnvironment & o, OpenVarSet & open, const SymbolTable & symtab ) {
         // short-circuit easy cases
 …
                                 auto st = internal_lookup( *vt );
                                 if ( st == env.end() ) {
                                         // unbound, safe to add if occurs
+                                        // unbound, safe to add if it passes the occurs check
                                         if ( r.bound && occurs( r.bound, *vt, *this ) ) return false;
                                         r.vars.emplace( *vt );
 …
+}
 bool TypeEnvironment::bindVar(
                 const TypeInstType * typeInst, const Type * bindTo, const TypeDecl::Data & data,
                 AssertionSet & need, AssertionSet & have, const OpenVarSet & open, WidenMode widen,
                 const SymbolTable & symtab
+bool TypeEnvironment::bindVar(
+                const TypeInstType * typeInst, const Type * bindTo, const TypeDecl::Data & data,
+                AssertionSet & need, AssertionSet & have, const OpenVarSet & open, WidenMode widen,
+                const SymbolTable & symtab
 ) {
         // remove references from bound type, so that type variables can only bind to value types
         ptr<Type> target = bindTo->stripReferences();
         auto tyvar = open.find( typeInst->name );
+        auto tyvar = open.find( *typeInst );
         assert( tyvar != open.end() );
         if ( ! tyVarCompatible( tyvar->second, target ) ) return false;
         if ( occurs( target, typeInst->name, *this ) ) return false;
         auto it = internal_lookup( typeInst->name );
+        if ( occurs( target, *typeInst, *this ) ) return false;
+        auto it = internal_lookup( *typeInst );
         if ( it != env.end() ) {
                 if ( it->bound ) {
 …
                         ptr<Type> newType = it->bound;
                         reset_qualifiers( newType, typeInst->qualifiers );
                         if ( unifyInexact(
                                         newType, target, *this, need, have, open,
+                        if ( unifyInexact(
+                                        newType, target, *this, need, have, open,
                                         widen & WidenMode{ it->allowWidening, true }, symtab, common ) ) {
                                 if ( common ) {
 …
+                }
         } else {
                 env.emplace_back(
                         typeInst->name, target, widen.first && widen.second, data );
+                env.emplace_back(
+                        *typeInst, target, widen.first && widen.second, data );
+        }
         return true;
+}
 bool TypeEnvironment::bindVarToVar(
                 const TypeInstType * var1, const TypeInstType * var2, TypeDecl::Data && data,
                 AssertionSet & need, AssertionSet & have, const OpenVarSet & open,
                 WidenMode widen, const SymbolTable & symtab
+bool TypeEnvironment::bindVarToVar(
+                const TypeInstType * var1, const TypeInstType * var2, TypeDecl::Data && data,
+                AssertionSet & need, AssertionSet & have, const OpenVarSet & open,
+                WidenMode widen, const SymbolTable & symtab
 ) {
         auto c1 = internal_lookup( var1->name );
         auto c2 = internal_lookup( var2->name );
+        auto c1 = internal_lookup( *var1 );
+        auto c2 = internal_lookup( *var2 );
         // exit early if variables already bound together
         if ( c1 != env.end() && c1 == c2 ) {
 …
         if ( c1 != env.end() ) {
                 if ( c1->bound ) {
                         if ( occurs( c1->bound, var2->name, *this ) ) return false;
+                        if ( occurs( c1->bound, *var2, *this ) ) return false;
                         type1 = c1->bound;
+                }
 …
         if ( c2 != env.end() ) {
                 if ( c2->bound ) {
                         if ( occurs( c2->bound, var1->name, *this ) ) return false;
+                        if ( occurs( c2->bound, *var1, *this ) ) return false;
                         type2 = c2->bound;
+                }
 …
         } else if ( c1 != env.end() ) {
                 // var2 unbound, add to env[c1]
                 c1->vars.emplace( var2->name );
+                c1->vars.emplace( *var2 );
                 c1->allowWidening = widen1;
                 c1->data.isComplete |= data.isComplete;
         } else if ( c2 != env.end() ) {
                 // var1 unbound, add to env[c2]
                 c2->vars.emplace( var1->name );
+                c2->vars.emplace( *var1 );
                 c2->allowWidening = widen2;
                 c2->data.isComplete |= data.isComplete;
         } else {
                 // neither var bound, create new class
                 env.emplace_back( var1->name, var2->name, widen1 && widen2, data );
+                env.emplace_back( *var1, *var2, widen1 && widen2, data );
+        }
 …
+}
 bool TypeEnvironment::mergeBound(
+bool TypeEnvironment::mergeBound(
                 EqvClass & to, const EqvClass & from, OpenVarSet & open, const SymbolTable & symtab ) {
         if ( from.bound ) {
 …
                         AssertionSet need, have;
                         if ( unifyInexact(
+                        if ( unifyInexact(
                                         toType, fromType, *this, need, have, open, widen, symtab, common ) ) {
                                 // unifies, set common type if necessary
 …
+}
 bool TypeEnvironment::mergeClasses(
+bool TypeEnvironment::mergeClasses(
         ClassList::iterator to, ClassList::iterator from, OpenVarSet & open, const SymbolTable & symtab
 ) {
 …
+}
 TypeEnvironment::ClassList::iterator TypeEnvironment::internal_lookup( const std::string & var ) {
+TypeEnvironment::ClassList::iterator TypeEnvironment::internal_lookup( const TypeInstType::TypeEnvKey & var ) {
         for ( ClassList::iterator i = env.begin(); i != env.end(); ++i ) {
                 if ( i->vars.count( var ) ) return i;

src/AST/TypeEnvironment.hpp

-              r3c64c668
+              r58fe85a
 /// Adding this comparison operator significantly improves assertion satisfaction run time for
 /// some cases. The current satisfaction algorithm's speed partially depends on the order of
 /// assertions. Assertions which have fewer possible matches should appear before assertions
 /// which have more possible matches. This seems to imply that this could be further improved
 /// by providing an indexer as an additional argument and ordering based on the number of
+/// assertions. Assertions which have fewer possible matches should appear before assertions
+/// which have more possible matches. This seems to imply that this could be further improved
+/// by providing an indexer as an additional argument and ordering based on the number of
 /// matches of the same kind (object, function) for the names of the declarations.
 ///
 /// I've seen a TU go from 54 minutes to 1 minute 34 seconds with the addition of this
+/// I've seen a TU go from 54 minutes to 1 minute 34 seconds with the addition of this
 /// comparator.
 ///
 /// Note: since this compares pointers for position, minor changes in the source file that
 /// affect memory layout can alter compilation time in unpredictable ways. For example, the
 /// placement of a line directive can reorder type pointers with respect to each other so that
 /// assertions are seen in different orders, causing a potentially different number of
 /// unification calls when resolving assertions. I've seen a TU go from 36 seconds to 27
 /// seconds by reordering line directives alone, so it would be nice to fix this comparison so
 /// that assertions compare more consistently. I've tried to modify this to compare on mangle
 /// name instead of type as the second comparator, but this causes some assertions to never be
+/// Note: since this compares pointers for position, minor changes in the source file that
+/// affect memory layout can alter compilation time in unpredictable ways. For example, the
+/// placement of a line directive can reorder type pointers with respect to each other so that
+/// assertions are seen in different orders, causing a potentially different number of
+/// unification calls when resolving assertions. I've seen a TU go from 36 seconds to 27
+/// seconds by reordering line directives alone, so it would be nice to fix this comparison so
+/// that assertions compare more consistently. I've tried to modify this to compare on mangle
+/// name instead of type as the second comparator, but this causes some assertions to never be
 /// recorded. More investigation is needed.
 struct AssertCompare {
         bool operator()( const DeclWithType * d1, const DeclWithType * d2 ) const {
                 int cmp = d1->name.compare( d2->name );
                 return cmp < 0 || ( cmp == 0 && d1->get_type() < d2->get_type() );
+        bool operator()( const VariableExpr * d1, const VariableExpr * d2 ) const {
+                int cmp = d1->var->name.compare( d2->var->name );
+                return cmp < 0 || ( cmp == 0 && d1->result < d2->result );
+        }
 };
 …
 /// Set of assertions pending satisfaction
 using AssertionSet = std::map< readonly<DeclWithType>, AssertionSetValue, AssertCompare >;
+using AssertionSet = std::map< const VariableExpr *, AssertionSetValue, AssertCompare >;
 /// Set of open variables
 using OpenVarSet = std::unordered_map< std::string, TypeDecl::Data >;
+using OpenVarSet = std::unordered_map< TypeInstType::TypeEnvKey, TypeDecl::Data >;
 /// Merges one set of open vars into another
 …
 void print( std::ostream &, const OpenVarSet &, Indenter indent = {} );
 /// Represents an equivalence class of bound type variables, optionally with the concrete type
+/// Represents an equivalence class of bound type variables, optionally with the concrete type
 /// they bind to.
 struct EqvClass {
         std::set< std::string > vars;
+        std::unordered_set< TypeInstType::TypeEnvKey > vars;
         ptr<Type> bound;
         bool allowWidening;
 …
         EqvClass() : vars(), bound(), allowWidening( true ), data() {}
         /// Copy-with-bound constructor
         EqvClass( const EqvClass & o, const Type * b )
+        EqvClass( const EqvClass & o, const Type * b )
         : vars( o.vars ), bound( b ), allowWidening( o.allowWidening ), data( o.data ) {}
         /// Singleton class constructor from TypeDecl
         EqvClass( const TypeDecl * decl )
         : vars{ decl->name }, bound(), allowWidening( true ), data( decl ) {}
+        EqvClass( const TypeInstType * inst )
+        : vars{ *inst }, bound(), allowWidening( true ), data( inst->base ) {}
         /// Singleton class constructor from substitution
         EqvClass( const std::string & v, const Type * b )
+        EqvClass( const TypeInstType::TypeEnvKey & v, const Type * b )
         : vars{ v }, bound( b ), allowWidening( false ), data( TypeDecl::Dtype, false ) {}
         /// Single-var constructor (strips qualifiers from bound type)
         EqvClass( const std::string & v, const Type * b, bool w, const TypeDecl::Data & d )
+        EqvClass( const TypeInstType::TypeEnvKey & v, const Type * b, bool w, const TypeDecl::Data & d )
         : vars{ v }, bound( b ), allowWidening( w ), data( d ) {
                 reset_qualifiers( bound );
 …
         /// Double-var constructor
         EqvClass( const std::string & v, const std::string & u, bool w, const TypeDecl::Data & d )
+        EqvClass( const TypeInstType::TypeEnvKey & v, const TypeInstType::TypeEnvKey & u, bool w, const TypeDecl::Data & d )
         : vars{ v, u }, bound(), allowWidening( w ), data( d ) {}
 …
 public:
         /// Finds the equivalence class containing a variable; nullptr for none such
         const EqvClass * lookup( const std::string & var ) const;
+        const EqvClass * lookup( const TypeInstType::TypeEnvKey & var ) const;
         /// Add a new equivalence class for each type variable
         void add( const ParameterizedType::ForallList & tyDecls );
+        void add( const FunctionType::ForallList & tyDecls );
         /// Add a new equivalence class for each branch of the substitution, checking for conflicts
 …
         void writeToSubstitution( TypeSubstitution & sub ) const;
         template< typename node_t, enum Node::ref_type ref_t >
         int apply( ptr_base< node_t, ref_t > & type ) const {
+        template< typename node_t >
+        auto apply( node_t && type ) const {
                 TypeSubstitution sub;
                 writeToSubstitution( sub );
                 return sub.apply( type );
+        }
         template< typename node_t, enum Node::ref_type ref_t >
         int applyFree( ptr_base< node_t, ref_t > & type ) const {
+                return sub.apply( std::forward<node_t>(type) );
+        }
+        template< typename node_t >
+        auto applyFree( node_t && type ) const {
                 TypeSubstitution sub;
                 writeToSubstitution( sub );
                 return sub.applyFree( type );
+                return sub.applyFree( std::forward<node_t>(type) );
+        }
 …
         void addActual( const TypeEnvironment & actualEnv, OpenVarSet & openVars );
         /// Binds the type class represented by `typeInst` to the type `bindTo`; will add the class if
+        /// Binds the type class represented by `typeInst` to the type `bindTo`; will add the class if
         /// needed. Returns false on failure.
         bool bindVar(
                 const TypeInstType * typeInst, const Type * bindTo, const TypeDecl::Data & data,
                 AssertionSet & need, AssertionSet & have, const OpenVarSet & openVars,
+        bool bindVar(
+                const TypeInstType * typeInst, const Type * bindTo, const TypeDecl::Data & data,
+                AssertionSet & need, AssertionSet & have, const OpenVarSet & openVars,
                 ResolvExpr::WidenMode widen, const SymbolTable & symtab );
         /// Binds the type classes represented by `var1` and `var2` together; will add one or both
+        /// Binds the type classes represented by `var1` and `var2` together; will add one or both
         /// classes if needed. Returns false on failure.
         bool bindVarToVar(
                 const TypeInstType * var1, const TypeInstType * var2, TypeDecl::Data && data,
                 AssertionSet & need, AssertionSet & have, const OpenVarSet & openVars,
+        bool bindVarToVar(
+                const TypeInstType * var1, const TypeInstType * var2, TypeDecl::Data && data,
+                AssertionSet & need, AssertionSet & have, const OpenVarSet & openVars,
                 ResolvExpr::WidenMode widen, const SymbolTable & symtab );
 …
         /// Unifies the type bound of `to` with the type bound of `from`, returning false if fails
         bool mergeBound(
+        bool mergeBound(
                 EqvClass & to, const EqvClass & from, OpenVarSet & openVars, const SymbolTable & symtab );
         /// Merges two type classes from local environment, returning false if fails
         bool mergeClasses(
                 ClassList::iterator to, ClassList::iterator from, OpenVarSet & openVars,
+        bool mergeClasses(
+                ClassList::iterator to, ClassList::iterator from, OpenVarSet & openVars,
                 const SymbolTable & symtab );
         /// Private lookup API; returns array index of string, or env.size() for not found
         ClassList::iterator internal_lookup( const std::string & );
+        ClassList::iterator internal_lookup( const TypeInstType::TypeEnvKey & );
 };

src/AST/TypeSubstitution.cpp

-              r3c64c668
+              r58fe85a
 namespace ast {
+// size_t TypeSubstitution::Substituter::traceId = Stats::Heap::new_stacktrace_id("TypeSubstitution");
 TypeSubstitution::TypeSubstitution() {
+}
 …
 void TypeSubstitution::initialize( const TypeSubstitution &src, TypeSubstitution &dest ) {
         dest.typeEnv.clear();
-        dest.varEnv.clear();
         dest.add( src );
+}
 …
                 typeEnv[ i->first ] = i->second;
         } // for
-        for ( VarEnvType::const_iterator i = other.varEnv.begin(); i != other.varEnv.end(); ++i ) {
-                varEnv[ i->first ] = i->second;
-        } // for
+}
 void TypeSubstitution::add( std::string formalType, const Type *actualType ) {
         typeEnv[ formalType ] = actualType;
+void TypeSubstitution::add( const TypeInstType * formalType, const Type *actualType ) {
+        typeEnv[ *formalType ] = actualType;
+}
 void TypeSubstitution::addVar( std::string formalExpr, const Expr *actualExpr ) {
         varEnv[ formalExpr ] = actualExpr;
+void TypeSubstitution::add( const TypeInstType::TypeEnvKey & key, const Type * actualType) {
+        typeEnv[ key ] = actualType;
+}
 void TypeSubstitution::remove( std::string formalType ) {
         TypeEnvType::iterator i = typeEnv.find( formalType );
+void TypeSubstitution::remove( const TypeInstType * formalType ) {
+        TypeEnvType::iterator i = typeEnv.find( *formalType );
         if ( i != typeEnv.end() ) {
                 typeEnv.erase( formalType );
+                typeEnv.erase( *formalType );
         } // if
+}
 const Type *TypeSubstitution::lookup( std::string formalType ) const {
         TypeEnvType::const_iterator i = typeEnv.find( formalType );
+const Type *TypeSubstitution::lookup( const TypeInstType * formalType ) const {
+        TypeEnvType::const_iterator i = typeEnv.find( *formalType );
         // break on not in substitution set
 …
         // attempt to transitively follow TypeInstType links.
         while ( const TypeInstType *actualType = i->second.as<TypeInstType>()) {
-                const std::string& typeName = actualType->name;
                 // break cycles in the transitive follow
                 if ( formalType == typeName ) break;
+                if ( *formalType == *actualType ) break;
                 // Look for the type this maps to, returning previous mapping if none-such
                 i = typeEnv.find( typeName );
+                i = typeEnv.find( *actualType );
                 if ( i == typeEnv.end() ) return actualType;
+        }
 …
 bool TypeSubstitution::empty() const {
         return typeEnv.empty() && varEnv.empty();
+        return typeEnv.empty();
+}
 namespace {
         struct EnvTrimmer {
                 ptr<TypeSubstitution> env;
+                const TypeSubstitution * env;
                 TypeSubstitution * newEnv;
                 EnvTrimmer( const TypeSubstitution * env, TypeSubstitution * newEnv ) : env( env ), newEnv( newEnv ){}
                 void previsit( TypeDecl * tyDecl ) {
+                void previsit( FunctionType * ftype ) {
                         // transfer known bindings for seen type variables
+                        if ( const Type * t = env->lookup( tyDecl->name ) ) {
+                                newEnv->add( tyDecl->name, t );
+                        for (auto & formal : ftype->forall) {
+                                if ( const Type * t = env->lookup( formal ) ) {
+                                        newEnv->add( formal, t );
+                                }
+                        }
+                }
 …
         if ( env ) {
                 TypeSubstitution * newEnv = new TypeSubstitution();
-#if TIME_TO_CONVERT_PASSES
                 Pass<EnvTrimmer> trimmer( env, newEnv );
                 expr->accept( trimmer );
-#else
-                (void)expr;
-                (void)env;
-#endif
                 return newEnv;
+        }
 …
 void TypeSubstitution::normalize() {
+#if TIME_TO_CONVERT_PASSES
+        PassVisitor<Substituter> sub( *this, true );
+        Pass<Substituter> sub( *this, true );
         do {
                 sub.pass.subCount = 0;
                 sub.pass.freeOnly = true;
+                sub.core.subCount = 0;
+                sub.core.freeOnly = true;
                 for ( TypeEnvType::iterator i = typeEnv.begin(); i != typeEnv.end(); ++i ) {
                         i->second = i->second->acceptMutator( sub );
+                        i->second = i->second->accept( sub );
+                }
+        } while ( sub.pass.subCount );
+#endif
+        } while ( sub.core.subCount );
+}
+#if TIME_TO_CONVERT_PASSES
+Type * TypeSubstitution::Substituter::postmutate( TypeInstType *inst ) {
+        BoundVarsType::const_iterator bound = boundVars.find( inst->name );
+const Type * TypeSubstitution::Substituter::postvisit( const TypeInstType *inst ) {
+        BoundVarsType::const_iterator bound = boundVars.find( *inst );
         if ( bound != boundVars.end() ) return inst;
         TypeEnvType::const_iterator i = sub.typeEnv.find( inst->name );
+        TypeEnvType::const_iterator i = sub.typeEnv.find( *inst );
         if ( i == sub.typeEnv.end() ) {
                 return inst;
 …
                 // Note: this does not prevent cycles in the general case, so it may be necessary to do something more sophisticated here.
                 // TODO: investigate preventing type variables from being bound to themselves in the first place.
                 if ( TypeInstType * replacement = i->second.as<TypeInstType>() ) {
                         if ( inst->name == replacement->name ) {
+                if ( const TypeInstType * replacement = i->second.as<TypeInstType>() ) {
+                        if ( *inst == *replacement ) {
                                 return inst;
+                        }
 …
                 // std::cerr << "found " << inst->name << ", replacing with " << i->second << std::endl;
                 subCount++;
+                Type * newtype = i->second->clone();
+                newtype->get_qualifiers() |= inst->get_qualifiers();
+                delete inst;
+                // Note: need to recursively apply substitution to the new type because normalize does not substitute bound vars, but bound vars must be substituted when not in freeOnly mode.
+                return newtype->acceptMutator( *visitor );
+                ptr<Type> newType = i->second; // force clone if needed
+                add_qualifiers( newType, inst->qualifiers );
+                // Note: need to recursively apply substitution to the new type because normalize does not
+                // substitute bound vars, but bound vars must be substituted when not in freeOnly mode.
+                newType = newType->accept( *visitor );
+                return newType.release();
         } // if
+}
+Expression * TypeSubstitution::Substituter::postmutate( NameExpr * nameExpr ) {
+        VarEnvType::const_iterator i = sub.varEnv.find( nameExpr->name );
+        if ( i == sub.varEnv.end() ) {
+                return nameExpr;
+        } else {
+                subCount++;
+                delete nameExpr;
+                return i->second->clone();
+        } // if
+}
+void TypeSubstitution::Substituter::premutate( Type * type ) {
+void TypeSubstitution::Substituter::previsit( const FunctionType * ptype ) {
         GuardValue( boundVars );
         // bind type variables from forall-qualifiers
         if ( freeOnly ) {
                 for ( Type::ForallList::const_iterator tyvar = type->forall.begin(); tyvar != type->forall.end(); ++tyvar ) {
                         boundVars.insert( (*tyvar)->name );
+                for ( auto & tyvar : ptype->forall ) {
+                                boundVars.insert( *tyvar );
                 } // for
         } // if
+}
+template< typename TypeClass >
 void TypeSubstitution::Substituter::handleAggregateType( TypeClass * type ) {
+/*
+void TypeSubstitution::Substituter::handleAggregateType( const BaseInstType * type ) {
         GuardValue( boundVars );
         // bind type variables from forall-qualifiers
         if ( freeOnly ) {
-                for ( Type::ForallList::const_iterator tyvar = type->forall.begin(); tyvar != type->forall.end(); ++tyvar ) {
-                        boundVars.insert( (*tyvar)->name );
-                } // for
                 // bind type variables from generic type instantiations
+                std::list< TypeDecl* > *baseParameters = type->get_baseParameters();
+                if ( baseParameters && ! type->parameters.empty() ) {
+                        for ( std::list< TypeDecl* >::const_iterator tyvar = baseParameters->begin(); tyvar != baseParameters->end(); ++tyvar ) {
+                                boundVars.insert( (*tyvar)->name );
+                        } // for
+                } // if
+                if ( auto decl = type->aggr() ) {
+                        if ( ! type->params.empty() ) {
+                                for ( const TypeDecl * tyvar : decl->params ) {
+                                        boundVars.insert( *tyvar );
+                                } // for
+                        } // if
+                }
         } // if
+}
 void TypeSubstitution::Substituter::premutate( StructInstType * aggregateUseType ) {
+void TypeSubstitution::Substituter::previsit( const StructInstType * aggregateUseType ) {
         handleAggregateType( aggregateUseType );
+}
 void TypeSubstitution::Substituter::premutate( UnionInstType *aggregateUseType ) {
+void TypeSubstitution::Substituter::previsit( const UnionInstType *aggregateUseType ) {
         handleAggregateType( aggregateUseType );
+}
+#endif
+*/
 } // namespace ast

src/AST/TypeSubstitution.hpp

-              r3c64c668
+              r58fe85a
         TypeSubstitution &operator=( const TypeSubstitution &other );
+        template< typename SynTreeClass > int apply( const SynTreeClass *& input ) const;
+        template< typename SynTreeClass > int applyFree( const SynTreeClass *& input ) const;
+        template< typename SynTreeClass >
+        struct ApplyResult {
+                ast::ptr<SynTreeClass> node;
+                int count;
+        };
+        template< typename SynTreeClass > ApplyResult<SynTreeClass> apply( const SynTreeClass * input ) const;
+        template< typename SynTreeClass > ApplyResult<SynTreeClass> applyFree( const SynTreeClass * input ) const;
         template< typename node_t, enum Node::ref_type ref_t >
         int apply( ptr_base< node_t, ref_t > & input ) const {
                 const node_t * p = input.get();
                 int ret = apply(p);
                 input = p;
                 return ret;
+                auto ret = apply(p);
+                input = ret.node;
+                return ret.count;
+        }
 …
         int applyFree( ptr_base< node_t, ref_t > & input ) const {
                 const node_t * p = input.get();
                 int ret = applyFree(p);
                 input = p;
                 return ret;
+                auto ret = applyFree(p);
+                input = ret.node;
+                return ret.count;
+        }
+        void add( std::string formalType, const Type *actualType );
+        void add( const TypeInstType * formalType, const Type *actualType );
+        void add( const TypeInstType::TypeEnvKey & key, const Type *actualType );
         void add( const TypeSubstitution &other );
         void remove( std::string formalType );
         const Type *lookup( std::string formalType ) const;
+        void remove( const TypeInstType * formalType );
+        const Type *lookup( const TypeInstType * formalType ) const;
         bool empty() const;
-        void addVar( std::string formalExpr, const Expr *actualExpr );
         template< typename FormalIterator, typename ActualIterator >
 …
         void initialize( const TypeSubstitution &src, TypeSubstitution &dest );
         template<typename pass_type>
+        template<typename core_t>
         friend class Pass;
+        typedef std::unordered_map< std::string, ptr<Type> > TypeEnvType;
+        typedef std::unordered_map< std::string, ptr<Expr> > VarEnvType;
+        typedef std::unordered_map< TypeInstType::TypeEnvKey, ptr<Type> > TypeEnvType;
         TypeEnvType typeEnv;
-        VarEnvType varEnv;
   public:
 …
         auto   end() const -> decltype( typeEnv.  end() ) { return typeEnv.  end(); }
-        auto beginVar()       -> decltype( varEnv.begin() ) { return varEnv.begin(); }
-        auto   endVar()       -> decltype( varEnv.  end() ) { return varEnv.  end(); }
-        auto beginVar() const -> decltype( varEnv.begin() ) { return varEnv.begin(); }
-        auto   endVar() const -> decltype( varEnv.  end() ) { return varEnv.  end(); }
 };
+// this is the only place where type parameters outside a function formal may be substituted.
 template< typename FormalIterator, typename ActualIterator >
 void TypeSubstitution::add( FormalIterator formalBegin, FormalIterator formalEnd, ActualIterator actualBegin ) {
 …
                         if ( const TypeExpr *actual = actualIt->template as<TypeExpr>() ) {
                                 if ( formal->name != "" ) {
                                         typeEnv[ formal->name ] = actual->type;
+                                        typeEnv[ formal ] = actual->type;
                                 } // if
                         } else {
 …
                         } // if
                 } else {
+                        // TODO: type check the formal and actual parameters
+                        if ( (*formalIt)->name != "" ) {
+                                varEnv[ (*formalIt)->name ] = *actualIt;
+                        } // if
                 } // if
         } // for
+}
 template< typename FormalIterator, typename ActualIterator >
 …
+}
 } // namespace ast
 …
 // PassVisitor are defined before PassVisitor implementation accesses TypeSubstitution internals.
 #include "Pass.hpp"
+#include "Copy.hpp"
 namespace ast {
 // definition must happen after PassVisitor is included so that WithGuards can be used
+struct TypeSubstitution::Substituter : public WithGuards, public WithVisitorRef<Substituter> {
+struct TypeSubstitution::Substituter : public WithGuards, public WithVisitorRef<Substituter>, public PureVisitor {
+                static size_t traceId;
                 Substituter( const TypeSubstitution & sub, bool freeOnly ) : sub( sub ), freeOnly( freeOnly ) {}
+#if TIME_TO_CONVERT_PASSES
+                Type * postmutate( TypeInstType * aggregateUseType );
+                Expression * postmutate( NameExpr * nameExpr );
+                const Type * postvisit( const TypeInstType * aggregateUseType );
                 /// Records type variable bindings from forall-statements
                 void premutate( Type * type );
+                void previsit( const FunctionType * type );
                 /// Records type variable bindings from forall-statements and instantiations of generic types
                 template< typename TypeClass > void handleAggregateType( TypeClass * type );
+                // void handleAggregateType( const BaseInstType * type );
+                void premutate( StructInstType * aggregateUseType );
+                void premutate( UnionInstType * aggregateUseType );
+#endif
+                // void previsit( const StructInstType * aggregateUseType );
+                // void previsit( const UnionInstType * aggregateUseType );
                 const TypeSubstitution & sub;
                 int subCount = 0;
                 bool freeOnly;
                 typedef std::unordered_set< std::string > BoundVarsType;
+                typedef std::unordered_set< TypeInstType::TypeEnvKey > BoundVarsType;
                 BoundVarsType boundVars;
 …
 template< typename SynTreeClass >
 int TypeSubstitution::apply( const SynTreeClass *& input ) const {
+TypeSubstitution::ApplyResult<SynTreeClass> TypeSubstitution::apply( const SynTreeClass * input ) const {
         assert( input );
         Pass<Substituter> sub( *this, false );
         input = strict_dynamic_cast< const SynTreeClass * >( input->accept( sub ) );
+///     std::cerr << "substitution result is: ";
+///     newType->print( std::cerr );
+///     std::cerr << std::endl;
+        return sub.pass.subCount;
+        return { input, sub.core.subCount };
+}
 template< typename SynTreeClass >
 int TypeSubstitution::applyFree( const SynTreeClass *& input ) const {
+TypeSubstitution::ApplyResult<SynTreeClass> TypeSubstitution::applyFree( const SynTreeClass * input ) const {
         assert( input );
         Pass<Substituter> sub( *this, true );
         input = strict_dynamic_cast< const SynTreeClass * >( input->accept( sub ) );
+///     std::cerr << "substitution result is: ";
+///     newType->print( std::cerr );
+///     std::cerr << std::endl;
+        return sub.pass.subCount;
+        return { input, sub.core.subCount };
+}

src/AST/Visitor.hpp

r3c64c668	r58fe85a
47	47	virtual const ast::Stmt * visit( const ast::CatchStmt * ) = 0;
48	48	virtual const ast::Stmt * visit( const ast::FinallyStmt * ) = 0;
	49	virtual const ast::Stmt * visit( const ast::SuspendStmt * ) = 0;
49	50	virtual const ast::Stmt * visit( const ast::WaitForStmt * ) = 0;
50	51	virtual const ast::Decl * visit( const ast::WithStmt * ) = 0;

src/AST/module.mk

-              r3c64c668
+              r58fe85a
 SRC_AST = \
         AST/AssertAcyclic.cpp \
+        AST/AssertAcyclic.hpp \
         AST/Attribute.cpp \
+        AST/Attribute.hpp \
+        AST/Bitfield.hpp \
+        AST/Chain.hpp \
         AST/Convert.cpp \
+        AST/Convert.hpp \
+        AST/Copy.hpp \
+        AST/CVQualifiers.hpp \
         AST/Decl.cpp \
+        AST/Decl.hpp \
         AST/DeclReplacer.cpp \
+        AST/DeclReplacer.hpp \
+        AST/Eval.hpp \
         AST/Expr.cpp \
+        AST/Expr.hpp \
+        AST/FunctionSpec.hpp \
+        AST/Fwd.hpp \
         AST/GenericSubstitution.cpp \
+        AST/GenericSubstitution.hpp \
         AST/Init.cpp \
+        AST/Init.hpp \
+        AST/Label.hpp \
         AST/LinkageSpec.cpp \
+        AST/LinkageSpec.hpp \
         AST/Node.cpp \
+        AST/Node.hpp \
+        AST/ParseNode.hpp \
         AST/Pass.cpp \
+        AST/Pass.hpp \
+        AST/Pass.impl.hpp \
+        AST/Pass.proto.hpp \
         AST/Print.cpp \
+        AST/Print.hpp \
         AST/Stmt.cpp \
+        AST/Stmt.hpp \
+        AST/StorageClasses.hpp \
         AST/SymbolTable.cpp \
+        AST/SymbolTable.hpp \
+        AST/TranslationUnit.hpp \
         AST/Type.cpp \
+        AST/Type.hpp \
         AST/TypeEnvironment.cpp \
+        AST/TypeSubstitution.cpp
+        AST/TypeEnvironment.hpp \
+        AST/TypeSubstitution.cpp \
+        AST/TypeSubstitution.hpp \
+        AST/Visitor.hpp
 SRC += $(SRC_AST)

src/AST/porting.md

-              r3c64c668
+              r58fe85a
   * Base nodes now override `const Node * accept( Visitor & v ) const = 0` with, e.g. `const Stmt * accept( Visitor & v ) const override = 0`
 * `PassVisitor` is replaced with `ast::Pass`
+  * Most one-shot uses can use `ast::Pass::run` and `ast::Pass::read`.
+`WithConstTypeSubstitution`
+* `env` => `typeSubs`
 ## Structural Changes ##
 …
       template<typename node_t>
       friend node_t * mutate(const node_t * node);
+      template<typename node_t>
+      friend node_t * shallowCopy(const node_t * node);
+    or equivalent.
+* You should use the `mutate` function where possible as it avoids extra copies.
+  * If you must copy, use `shallowCopy` or `deepCopy` as required.
 All leaves of the `Node` inheritance tree are now declared `final`
 …
   * allows `newObject` as just default settings
+`FunctionDecl`
+* `params` and `returns` added.
+  * Contain the declarations of the parameters and return variables.
+  * Types should match (even be shared with) the fields of `type`.
 `NamedTypeDecl`
 * `parameters` => `params`
 …
 `AggregateDecl`
 * `parameters` => `params`
+`StructDecl`
+* `makeInst` replaced by better constructor on `StructInstType`.
 `Expr`
 …
 * **TODO** move `kind`, `typeNames` into code generator
 `ReferenceToType`
+`ReferenceToType` => `BaseInstType`
 * deleted `get_baseParameters()` from children
   * replace with `aggr() ? aggr()->params : nullptr`
 …
 * `returnVals` => `returns`
 * `parameters` => `params`
+  * Both now just point at types.
 * `bool isVarArgs;` => `enum ArgumentFlag { FixedArgs, VariableArgs }; ArgumentFlag isVarArgs;`
+`SueInstType`
+* Template class, with specializations and using to implement some other types:
+  * `StructInstType`, `UnionInstType` & `EnumInstType`
 `TypeInstType`

src/CodeGen/CodeGenerator.cc

r3c64c668	r58fe85a
120	120	// GCC builtins should always be printed unmangled
121	121	if ( options.pretty \|\| decl->linkage.is_gcc_builtin ) return decl->name;
122		if ( decl->mangleName != "" ) {
	122	if ( LinkageSpec::isMangled(decl->linkage) && decl->mangleName != "" ) {
123	123	// need to incorporate scope level in order to differentiate names for destructors
124	124	return decl->get_scopedMangleName();

src/CodeGen/FixMain.cc

-              r3c64c668
+              r58fe85a
 #include "SynTree/Declaration.h"   // for FunctionDecl, operator<<
 #include "SynTree/Type.h"          // for FunctionType
+#include "SymTab/Mangler.h"
 namespace CodeGen {
 …
                 if( main_signature ) {
                         os << "static inline int invoke_main(int argc, char* argv[], char* envp[]) { (void)argc; (void)argv; (void)envp; return ";
+                        main_signature->mangleName = SymTab::Mangler::mangle(main_signature.get());
                         os << main_signature->get_scopedMangleName() << "(";

src/CodeGen/FixNames.cc

-              r3c64c668
+              r58fe85a
 #include "SynTree/Type.h"          // for Type, BasicType, Type::Qualifiers
 #include "SynTree/Visitor.h"       // for Visitor, acceptAll
+#include "CompilationState.h"
 namespace CodeGen {
 …
                 if ( dwt->get_name() != "" ) {
                         if ( LinkageSpec::isMangled( dwt->get_linkage() ) ) {
+                                dwt->set_mangleName( SymTab::Mangler::mangle( dwt ) );
+                                if (!useNewAST) {
+                                        dwt->set_mangleName( SymTab::Mangler::mangle( dwt ) );
+                                }
                                 dwt->set_scopeLevel( scopeLevel );
                         } // if

src/CodeGen/module.mk

-              r3c64c668
+              r58fe85a
 SRC_CODEGEN = \
         CodeGen/CodeGenerator.cc \
+        CodeGen/CodeGenerator.h \
         CodeGen/FixMain.cc \
+        CodeGen/FixMain.h \
         CodeGen/GenType.cc \
+        CodeGen/OperatorTable.cc
+        CodeGen/GenType.h \
+        CodeGen/OperatorTable.cc \
+        CodeGen/OperatorTable.h \
+        CodeGen/Options.h
 SRC += $(SRC_CODEGEN) CodeGen/Generate.cc CodeGen/FixNames.cc
+SRC += $(SRC_CODEGEN) CodeGen/Generate.cc CodeGen/Generate.h CodeGen/FixNames.cc CodeGen/FixNames.h
 SRCDEMANGLE += $(SRC_CODEGEN)

src/CodeTools/TrackLoc.cc

-              r3c64c668
+              r58fe85a
 // Created On       : Tues May 2 15:46:00 2017
 // Last Modified By : Andrew Beach
 // Last Modified On : Wed May 3 14:43:00 2017
 // Update Count     : 0
+// Last Modified On : Fri Nov 27 18:00:00 2020
+// Update Count     : 1
 //
 …
 #include <string>                    // for operator<<, string
 #include <typeindex>                 // for type_index
+#include <vector>                    // for vector
 #include "Common/PassVisitor.h"      // for PassVisitor
 …
                 CodeLocation *lastNode;
                 std::stack< CodeLocation * > parents;
+                std::stack< CodeLocation *, std::vector< CodeLocation * > > parents;
         public:
                 LocationPrinter(size_t printLevel) :

src/CodeTools/module.mk

-              r3c64c668
+              r58fe85a
 ###############################################################################
+SRC += CodeTools/DeclStats.cc \
+SRC += \
+        CodeTools/DeclStats.cc \
+        CodeTools/DeclStats.h \
         CodeTools/ResolvProtoDump.cc \
+        CodeTools/TrackLoc.cc
+        CodeTools/ResolvProtoDump.h \
+        CodeTools/TrackLoc.cc \
+        CodeTools/TrackLoc.h

src/Common/CodeLocation.h

-              r3c64c668
+              r58fe85a
+        }
+        bool followedBy( CodeLocation const & other, int seperation ) {
+        bool startsBefore( CodeLocation const & other ) const {
+                if( filename < other.filename ) return true;
+                if( filename > other.filename ) return false;
+                if( first_line < other.first_line ) return true;
+                if( first_line > other.first_line ) return false;
+                if( last_line < other.last_line ) return true;
+                return false;
+        }
+        bool followedBy( CodeLocation const & other, int seperation ) const {
                 return (first_line + seperation == other.first_line &&
                         filename == other.filename);
+        }
         bool operator==( CodeLocation const & other ) {
+        bool operator==( CodeLocation const & other ) const {
                 return followedBy( other, 0 );
+        }
         bool operator!=( CodeLocation const & other ) {
+        bool operator!=( CodeLocation const & other ) const {
                 return !(*this == other);
+        }

src/Common/Eval.cc

r3c64c668	r58fe85a
168	168	if (expr) {
169	169	expr->accept(ev);
170		return std::make_pair(ev.~~pass.value, ev.pass~~.valid);
	170	return std::make_pair(ev.core.value, ev.core.valid);
171	171	} else {
172	172	return std::make_pair(0, false);

src/Common/PassVisitor.h

-              r3c64c668
+              r58fe85a
         virtual void visit( FinallyStmt * finallyStmt ) override final;
         virtual void visit( const FinallyStmt * finallyStmt ) override final;
+        virtual void visit( SuspendStmt * suspendStmt ) override final;
+        virtual void visit( const SuspendStmt * suspendStmt ) override final;
         virtual void visit( WaitForStmt * waitforStmt ) override final;
         virtual void visit( const WaitForStmt * waitforStmt ) override final;
 …
         virtual Statement * mutate( CatchStmt * catchStmt ) override final;
         virtual Statement * mutate( FinallyStmt * finallyStmt ) override final;
+        virtual Statement * mutate( SuspendStmt * suspendStmt ) override final;
         virtual Statement * mutate( WaitForStmt * waitforStmt ) override final;
         virtual Declaration * mutate( WithStmt * withStmt ) override final;
 …
         virtual TypeSubstitution * mutate( TypeSubstitution * sub ) final;
+        bool isInFunction() const { // read-only accessor for the private inFunction flag
+                return inFunction;
+        }
 private:
         bool inFunction = false;
+        bool atFunctionTop = false;
         template<typename pass_t> friend void acceptAll( std::list< Declaration* > &decls, PassVisitor< pass_t >& visitor );
 …
 public:
         PassVisitor<pass_type> * const visitor = nullptr;
+        bool isInFunction() const { // forwards the query to the PassVisitor held in `visitor`
+                return visitor->isInFunction(); // NOTE(review): `visitor` defaults to nullptr; presumably it is set before this is called - confirm
+        }
 };

src/Common/PassVisitor.impl.h

-              r3c64c668
+              r58fe85a
                         indexerAddId( &func );
                         maybeAccept_impl( node->type, *this );
+                        // function body needs to have the same scope as parameters - CompoundStmt will not enter
+                        // a new scope if inFunction is true
+                        // First remember that we are now within a function.
                         ValueGuard< bool > oldInFunction( inFunction );
                         inFunction = true;
+                        // The function body needs to have the same scope as parameters.
+                        // A CompoundStmt will not enter a new scope if atFunctionTop is true.
+                        ValueGuard< bool > oldAtFunctionTop( atFunctionTop );
+                        atFunctionTop = true;
                         maybeAccept_impl( node->statements, *this );
                         maybeAccept_impl( node->attributes, *this );
 …
                         indexerAddId( &func );
                         maybeAccept_impl( node->type, *this );
+                        // function body needs to have the same scope as parameters - CompoundStmt will not enter
+                        // a new scope if inFunction is true
+                        // First remember that we are now within a function.
                         ValueGuard< bool > oldInFunction( inFunction );
                         inFunction = true;
+                        // The function body needs to have the same scope as parameters.
+                        // A CompoundStmt will not enter a new scope if atFunctionTop is true.
+                        ValueGuard< bool > oldAtFunctionTop( atFunctionTop );
+                        atFunctionTop = true;
                         maybeAccept_impl( node->statements, *this );
                         maybeAccept_impl( node->attributes, *this );
 …
                         indexerAddId( &func );
                         maybeMutate_impl( node->type, *this );
+                        // function body needs to have the same scope as parameters - CompoundStmt will not enter
+                        // a new scope if inFunction is true
+                        // First remember that we are now within a function.
                         ValueGuard< bool > oldInFunction( inFunction );
                         inFunction = true;
+                        // The function body needs to have the same scope as parameters.
+                        // A CompoundStmt will not enter a new scope if atFunctionTop is true.
+                        ValueGuard< bool > oldAtFunctionTop( atFunctionTop );
+                        atFunctionTop = true;
                         maybeMutate_impl( node->statements, *this );
                         maybeMutate_impl( node->attributes, *this );
 …
+        {
                 auto guard = makeFuncGuard( [this]() { indexerScopeEnter(); }, [this]() { indexerScopeLeave(); } );
-                maybeAccept_impl( node->parameters, *this );
                 maybeAccept_impl( node->base      , *this );
+        }
 …
+        {
                 auto guard = makeFuncGuard( [this]() { indexerScopeEnter(); }, [this]() { indexerScopeLeave(); } );
-                maybeAccept_impl( node->parameters, *this );
                 maybeAccept_impl( node->base      , *this );
+        }
 …
+        {
                 auto guard = makeFuncGuard( [this]() { indexerScopeEnter(); }, [this]() { indexerScopeLeave(); } );
-                maybeMutate_impl( node->parameters, *this );
                 maybeMutate_impl( node->base      , *this );
+        }
 …
+        {
                 auto guard = makeFuncGuard( [this]() { indexerScopeEnter(); }, [this]() { indexerScopeLeave(); } );
-                maybeAccept_impl( node->parameters, *this );
                 maybeAccept_impl( node->base      , *this );
+        }
 …
+        {
                 auto guard = makeFuncGuard( [this]() { indexerScopeEnter(); }, [this]() { indexerScopeLeave(); } );
-                maybeAccept_impl( node->parameters, *this );
                 maybeAccept_impl( node->base      , *this );
+        }
 …
+        {
                 auto guard = makeFuncGuard( [this]() { indexerScopeEnter(); }, [this]() { indexerScopeLeave(); } );
-                maybeMutate_impl( node->parameters, *this );
                 maybeMutate_impl( node->base      , *this );
+        }
 …
         VISIT_START( node );
+        {
                 // do not enter a new scope if inFunction is true - needs to check old state before the assignment
                 ValueGuard< bool > oldInFunction( inFunction );
                 auto guard1 = makeFuncGuard( [this, &oldInFunction]() { if ( ! oldInFunction.old ) indexerScopeEnter(); }, [this, &oldInFunction]() { if ( ! oldInFunction.old ) indexerScopeLeave(); } );
+                // Do not enter a new scope if atFunctionTop is true, don't leave one either.
+                ValueGuard< bool > oldAtFunctionTop( atFunctionTop );
+                auto guard1 = makeFuncGuard( [this, go = !atFunctionTop]() { if ( go ) indexerScopeEnter(); }, [this, go = !atFunctionTop]() { if ( go ) indexerScopeLeave(); } );
                 auto guard2 = makeFuncGuard( [this]() { call_beginScope();   }, [this]() { call_endScope();     } );
                 inFunction = false;
+                atFunctionTop = false;
                 visitStatementList( node->kids );
+        }
 …
         VISIT_START( node );
+        {
                 // do not enter a new scope if inFunction is true - needs to check old state before the assignment
                 ValueGuard< bool > oldInFunction( inFunction );
                 auto guard1 = makeFuncGuard( [this, &oldInFunction]() { if ( ! oldInFunction.old ) indexerScopeEnter(); }, [this, &oldInFunction]() { if ( ! oldInFunction.old ) indexerScopeLeave(); } );
+                // Do not enter a new scope if atFunctionTop is true, don't leave one either.
+                ValueGuard< bool > oldAtFunctionTop( atFunctionTop );
+                auto guard1 = makeFuncGuard( [this, go = !atFunctionTop]() { if ( go ) indexerScopeEnter(); }, [this, go = !atFunctionTop]() { if ( go ) indexerScopeLeave(); } );
                 auto guard2 = makeFuncGuard( [this]() { call_beginScope();   }, [this]() { call_endScope();     } );
                 inFunction = false;
+                atFunctionTop = false;
                 visitStatementList( node->kids );
+        }
 …
         MUTATE_START( node );
+        {
                 // do not enter a new scope if inFunction is true - needs to check old state before the assignment
                 ValueGuard< bool > oldInFunction( inFunction );
                 auto guard1 = makeFuncGuard( [this, &oldInFunction]() { if ( ! oldInFunction.old ) indexerScopeEnter(); }, [this, &oldInFunction]() { if ( ! oldInFunction.old ) indexerScopeLeave(); } );
+                // Do not enter a new scope if atFunctionTop is true, don't leave one either.
+                ValueGuard< bool > oldAtFunctionTop( atFunctionTop );
+                auto guard1 = makeFuncGuard( [this, go = !atFunctionTop]() { if ( go ) indexerScopeEnter(); }, [this, go = !atFunctionTop]() { if ( go ) indexerScopeLeave(); } );
                 auto guard2 = makeFuncGuard( [this]() { call_beginScope();   }, [this]() { call_endScope();     } );
                 inFunction = false;
+                atFunctionTop = false;
                 mutateStatementList( node->kids );
+        }
 …
         maybeMutate_impl( node->block, *this );
+        MUTATE_END( Statement, node );
+}
+//--------------------------------------------------------------------------
+// SuspendStmt
+template< typename pass_type > // visit overload for a mutable SuspendStmt node
+void PassVisitor< pass_type >::visit( SuspendStmt * node ) {
+        VISIT_START( node );
+        maybeAccept_impl( node->then  , *this ); // `then` is the only member of the node handed on to this visitor
+        VISIT_END( node );
+}
+template< typename pass_type > // visit overload for a const SuspendStmt node
+void PassVisitor< pass_type >::visit( const SuspendStmt * node ) {
+        VISIT_START( node );
+        maybeAccept_impl( node->then  , *this ); // `then` is the only member of the node handed on to this visitor
+        VISIT_END( node );
+}
+template< typename pass_type >
+Statement * PassVisitor< pass_type >::mutate( SuspendStmt * node ) {
+        MUTATE_START( node );
+        maybeMutate_impl( node->then  , *this );
         MUTATE_END( Statement, node );

src/Common/PassVisitor.proto.h

r3c64c668	r58fe85a
38	38	};
39	39
40		std::stack< cleanup_t > cleanups;
	40	std::stack< cleanup_t, std::vector< cleanup_t > > cleanups;
41	41	};
42	42

src/Common/ScopedMap.h

-              r3c64c668
+              r58fe85a
                 reference operator* () { return *it; }
                 pointer operator-> () { return it.operator->(); }
+                pointer operator-> () const { return it.operator->(); }
                 iterator& operator++ () {
 …
         /// Gets the note at the given scope
+        Note& getNote() { return scopes.back().note; } // note of the last entry in `scopes`; back() is unchecked, so presumably at least one scope must exist - confirm
+        const Note& getNote() const { return scopes.back().note; } // const overload of the accessor above
         Note& getNote( size_type i ) { return scopes[i].note; }
         const Note& getNote( size_type i ) const { return scopes[i].note; }

src/Common/SemanticError.cc

-              r3c64c668
+              r58fe85a
 void SemanticErrorException::print() {
         using std::to_string;
+        errors.sort([](const error & lhs, const error & rhs) -> bool {
+                if(lhs.location.startsBefore(rhs.location)) return true;
+                if(rhs.location.startsBefore(lhs.location)) return false;
+                return lhs.description < rhs.description;
+        });
         for( auto err : errors ) {
                 std::cerr << ErrorHelpers::bold() << err.location << ErrorHelpers::error_str() << ErrorHelpers::reset_font() << err.description << std::endl;

src/Common/Stats/Heap.cc

-              r3c64c668
+              r58fe85a
                 const size_t passes_size = sizeof(passes) / sizeof(passes[0]);
                 size_t       passes_cnt = 1;
+                StatBlock    stacktrace_stats[100];
+                size_t       stacktrace_stats_count = 0;
+                bool         stacktrace_stats_enabled = true;
+                size_t       trace[1000];
+                const size_t stacktrace_max_depth = sizeof(trace) / sizeof(size_t);
+                size_t       stacktrace_depth;
+                // Registers a new named stack-trace counter.
+                // Stores `name` in the next free slot of stacktrace_stats and returns that slot's index,
+                // which is the id later passed to stacktrace_push.
+                size_t new_stacktrace_id(const char * const name) {
+                        // stacktrace_stats is a fixed-size array: check capacity before writing, as stacktrace_push does for trace.
+                        assertf(stacktrace_stats_count < sizeof(stacktrace_stats) / sizeof(stacktrace_stats[0]), "Too many stack trace ids: increase size of array in Heap.cc");
+                        stacktrace_stats[stacktrace_stats_count].name = name;
+                        return stacktrace_stats_count++;
+                }
+                void stacktrace_push(size_t id) { // record `id` at the new top of the trace stack
+                        ++stacktrace_depth; // incremented before the write: trace[0] is never written and depth 0 means "empty"
+                        assertf(stacktrace_depth < stacktrace_max_depth, "Stack trace too deep: increase size of array in Heap.cc");
+                        trace[stacktrace_depth] = id;
+                }
+                void stacktrace_pop() { // drop the top entry of the trace stack
+                        assertf(stacktrace_depth > 0, "Invalid stack tracing operation: trace is empty");
+                        --stacktrace_depth; // only the depth changes; the old id is left in trace[] until the next push overwrites it
+                }
                 void newPass( const char * const name ) {
 …
                         for(size_t i = 0; i < passes_cnt; i++) {
                                 print(passes[i], nc, total_mallocs, total_frees, overall_peak);
+                        }
+                        print('-', nct);
+                        std::cerr << std::setw(nc) << "Trace";
+                        std::cerr << " |       Malloc Count |         Free Count |        Peak Allocs |" << std::endl;
+                        print('-', nct);
+                        for (size_t i = 0; i < stacktrace_stats_count; i++) {
+                                print(stacktrace_stats[i], nc, total_mallocs, total_frees, overall_peak);
+                        }
                         print('-', nct);
 …
                                                 = std::max(passes[passes_cnt - 1].peak_allocs, passes[passes_cnt - 1].n_allocs);
+                                }
+                                if ( stacktrace_stats_enabled && stacktrace_depth > 0) {
+                                        stacktrace_stats[trace[stacktrace_depth]].mallocs++;
+                                }
                                 return __malloc( size );
+                        }
 …
                                         passes[passes_cnt - 1].frees++;
                                         passes[passes_cnt - 1].n_allocs--;
+                                }
+                                if ( stacktrace_stats_enabled && stacktrace_depth > 0) {
+                                        stacktrace_stats[trace[stacktrace_depth]].frees++;
+                                }
                                 return __free( ptr );
 …
                                                 = std::max(passes[passes_cnt - 1].peak_allocs, passes[passes_cnt - 1].n_allocs);
+                                }
+                                if ( stacktrace_stats_enabled && stacktrace_depth > 0) {
+                                        stacktrace_stats[trace[stacktrace_depth]].mallocs++;
+                                }
                                 return __calloc( nelem, size );
+                        }
 …
                                         passes[passes_cnt - 1].frees++;
                                 } // if
+                                if ( stacktrace_stats_enabled && stacktrace_depth > 0) {
+                                        stacktrace_stats[trace[stacktrace_depth]].mallocs++;
+                                        stacktrace_stats[trace[stacktrace_depth]].frees++;
+                                }
                                 return s;
+                        }

src/Common/Stats/Heap.h

r3c64c668	r58fe85a
20	20	void newPass( const char * const name );
21	21	void print();
	22
	23	size_t new_stacktrace_id(const char * const name);
	24	void stacktrace_push(size_t id);
	25	void stacktrace_pop();
22	26	}
23	27	}

src/Common/Stats/Stats.cc

-              r3c64c668
+              r58fe85a
+        }
+        namespace ResolveTime {
+                bool enabled = false;
+        }
         struct {
                 const char * const opt;
 …
                 { "heap"    , Heap::enabled },
                 { "time"    , Time::enabled },
+                { "resolve" , ResolveTime::enabled },
         };

src/Common/module.mk

-              r3c64c668
+              r58fe85a
 SRC_COMMON = \
       Common/Assert.cc \
+      Common/CodeLocation.h \
+      Common/CodeLocationTools.hpp \
+      Common/CodeLocationTools.cpp \
+      Common/CompilerError.h \
+      Common/Debug.h \
+      Common/ErrorObjects.h \
       Common/Eval.cc \
+      Common/Examine.cc \
+      Common/Examine.h \
+      Common/FilterCombos.h \
+      Common/Indenter.h \
       Common/PassVisitor.cc \
+      Common/PassVisitor.h \
+      Common/PassVisitor.impl.h \
+      Common/PassVisitor.proto.h \
+      Common/PersistentMap.h \
+      Common/ScopedMap.h \
       Common/SemanticError.cc \
+      Common/SemanticError.h \
+      Common/Stats.h \
+      Common/Stats/Base.h \
       Common/Stats/Counter.cc \
+      Common/Stats/Counter.h \
       Common/Stats/Heap.cc \
+      Common/Stats/Heap.h \
+      Common/Stats/ResolveTime.cc \
+      Common/Stats/ResolveTime.h \
       Common/Stats/Stats.cc \
       Common/Stats/Time.cc \
+      Common/UniqueName.cc
+      Common/Stats/Time.h \
+      Common/UnimplementedError.h \
+      Common/UniqueName.cc \
+      Common/UniqueName.h \
+      Common/utility.h \
+      Common/VectorMap.h
 SRC += $(SRC_COMMON) Common/DebugMalloc.cc

src/Common/utility.h

-              r3c64c668
+              r58fe85a
         reverse_iterate_t( T & ref ) : ref(ref) {}
+        typedef typename T::reverse_iterator iterator;
+        iterator begin() { return ref.rbegin(); }
+        iterator end() { return ref.rend(); }
+        // this does NOT work on const T!!!
+        // typedef typename T::reverse_iterator iterator;
+        auto begin() { return ref.rbegin(); }
+        auto end() { return ref.rend(); }
 };

src/CompilationState.cc

-              r3c64c668
+              r58fe85a
 //
+#include "config.h"
 int
         astp = false,
 …
         nopreludep = false,
         genproto = false,
+        deterministic_output = false,
+        useNewAST = CFA_USE_NEW_AST,
         nomainp = false,
         parsep = false,

src/CompilationState.h

r3c64c668	r58fe85a
28	28	nopreludep,
29	29	genproto,
	30	deterministic_output,
	31	useNewAST,
30	32	nomainp,
31	33	parsep,

src/Concurrency/Keywords.cc

-              r3c64c668
+              r58fe85a
 #include "Concurrency/Keywords.h"
+#include <cassert>                 // for assert
+#include <string>                  // for string, operator==
+#include "Common/PassVisitor.h"    // for PassVisitor
+#include "Common/SemanticError.h"  // for SemanticError
+#include "Common/utility.h"        // for deleteAll, map_range
+#include "CodeGen/OperatorTable.h" // for isConstructor
+#include "InitTweak/InitTweak.h"   // for getPointerBase
+#include "SynTree/LinkageSpec.h"   // for Cforall
+#include "SynTree/Constant.h"      // for Constant
+#include "SynTree/Declaration.h"   // for StructDecl, FunctionDecl, ObjectDecl
+#include "SynTree/Expression.h"    // for VariableExpr, ConstantExpr, Untype...
+#include "SynTree/Initializer.h"   // for SingleInit, ListInit, Initializer ...
+#include "SynTree/Label.h"         // for Label
+#include "SynTree/Statement.h"     // for CompoundStmt, DeclStmt, ExprStmt
+#include "SynTree/Type.h"          // for StructInstType, Type, PointerType
+#include "SynTree/Visitor.h"       // for Visitor, acceptAll
+#include <cassert>                        // for assert
+#include <string>                         // for string, operator==
+#include <iostream>
+#include "Common/Examine.h"               // for isMainFor
+#include "Common/PassVisitor.h"           // for PassVisitor
+#include "Common/SemanticError.h"         // for SemanticError
+#include "Common/utility.h"               // for deleteAll, map_range
+#include "CodeGen/OperatorTable.h"        // for isConstructor
+#include "ControlStruct/LabelGenerator.h" // for LabelGenerator
+#include "InitTweak/InitTweak.h"          // for getPointerBase
+#include "SynTree/LinkageSpec.h"          // for Cforall
+#include "SynTree/Constant.h"             // for Constant
+#include "SynTree/Declaration.h"          // for StructDecl, FunctionDecl, ObjectDecl
+#include "SynTree/Expression.h"           // for VariableExpr, ConstantExpr, Untype...
+#include "SynTree/Initializer.h"          // for SingleInit, ListInit, Initializer ...
+#include "SynTree/Label.h"                // for Label
+#include "SynTree/Statement.h"            // for CompoundStmt, DeclStmt, ExprStmt
+#include "SynTree/Type.h"                 // for StructInstType, Type, PointerType
+#include "SynTree/Visitor.h"              // for Visitor, acceptAll
+#include "Virtual/Tables.h"
 class Attribute;
 namespace Concurrency {
+        inline static std::string getVTableName( std::string const & exception_name ) {
+                // An empty exception name maps to an empty vtable name.
+                if ( exception_name.empty() ) return std::string();
+                return Virtual::vtableTypeName(exception_name);
+        }
+        // Only detects threads constructed with the keyword thread.
+        inline static bool isThread( DeclarationWithType * decl ) {
+                // Strip the declarator from the declared type; only a struct instance can qualify.
+                Type * stripped = decl->get_type()->stripDeclarator();
+                StructInstType * asStruct = dynamic_cast<StructInstType *>( stripped );
+                // Short-circuit keeps the base-struct query from running when the cast fails.
+                return nullptr != asStruct && asStruct->baseStruct->is_thread();
+        }
         //=============================================================================================
         // Pass declarations
 …
           public:
+                ConcurrentSueKeyword( std::string&& type_name, std::string&& field_name, std::string&& getter_name, std::string&& context_error, bool needs_main, AggregateDecl::Aggregate cast_target ) :
+                  type_name( type_name ), field_name( field_name ), getter_name( getter_name ), context_error( context_error ), needs_main( needs_main ), cast_target( cast_target ) {}
+                ConcurrentSueKeyword( std::string&& type_name, std::string&& field_name,
+                        std::string&& getter_name, std::string&& context_error, std::string&& exception_name,
+                        bool needs_main, AggregateDecl::Aggregate cast_target ) :
+                  type_name( type_name ), field_name( field_name ), getter_name( getter_name ),
+                  context_error( context_error ), exception_name( exception_name ),
+                  vtable_name( getVTableName( exception_name ) ),
+                  needs_main( needs_main ), cast_target( cast_target ) {}
                 virtual ~ConcurrentSueKeyword() {}
 …
                 void handle( StructDecl * );
+                void addVtableForward( StructDecl * );
                 FunctionDecl * forwardDeclare( StructDecl * );
                 ObjectDecl * addField( StructDecl * );
 …
                 const std::string getter_name;
                 const std::string context_error;
+                const std::string exception_name;
+                const std::string vtable_name;
                 bool needs_main;
                 AggregateDecl::Aggregate cast_target;
 …
                 StructDecl   * type_decl = nullptr;
                 FunctionDecl * dtor_decl = nullptr;
+                StructDecl * except_decl = nullptr;
+                StructDecl * vtable_decl = nullptr;
         };
 …
                         "get_thread",
                         "thread keyword requires threads to be in scope, add #include <thread.hfa>\n",
+                        "ThreadCancelled",
                         true,
                         AggregateDecl::Thread
 …
                         "get_coroutine",
                         "coroutine keyword requires coroutines to be in scope, add #include <coroutine.hfa>\n",
+                        "CoroutineCancelled",
                         true,
                         AggregateDecl::Coroutine
 …
+                }
         };
         //-----------------------------------------------------------------------------
 …
                         "get_monitor",
                         "monitor keyword requires monitors to be in scope, add #include <monitor.hfa>\n",
+                        "",
                         false,
                         AggregateDecl::Monitor
 …
                         mutateAll( translationUnit, impl );
+                }
+        };
+        //-----------------------------------------------------------------------------
+        //Handles generator type declarations :
+        // generator MyGenerator {                   struct MyGenerator {
+        //      int data;                                  int data;
+        //      a_struct_t more_data;                      a_struct_t more_data;
+        //                                =>             int __gen_next;
+        // };                                        };
+        //
+        class GeneratorKeyword final : public ConcurrentSueKeyword { // ConcurrentSueKeyword pass configured for generator declarations
+          public:
+                GeneratorKeyword() : ConcurrentSueKeyword(
+                        "$generator", // type_name
+                        "__generator_state", // field_name
+                        "get_generator", // getter_name
+                        "Unable to find builtin type $generator\n", // context_error
+                        "", // exception_name: empty, so the derived vtable_name is empty too
+                        true, // needs_main
+                        AggregateDecl::Generator // cast_target
+                )
+                {}
+                virtual ~GeneratorKeyword() {}
+                virtual bool is_target( StructDecl * decl ) override final { return decl->is_generator(); } // selects struct declarations flagged as generators
+                static void implement( std::list< Declaration * > & translationUnit ) { // runs this pass over the whole translation unit
+                        PassVisitor< GeneratorKeyword > impl;
+                        mutateAll( translationUnit, impl );
+                }
+        };
+        //-----------------------------------------------------------------------------
+        class SuspendKeyword final : public WithStmtsToAdd, public WithGuards { // mutator pass with hooks for FunctionDecl and SuspendStmt nodes
+        public:
+                SuspendKeyword() = default;
+                virtual ~SuspendKeyword() = default;
+                void  premutate( FunctionDecl * ); // defined out of line
+                DeclarationWithType * postmutate( FunctionDecl * ); // defined out of line
+                Statement * postmutate( SuspendStmt * ); // defined out of line
+                static void implement( std::list< Declaration * > & translationUnit ) { // runs this pass over the whole translation unit
+                        PassVisitor< SuspendKeyword > impl;
+                        mutateAll( translationUnit, impl );
+                }
+        private:
+                bool is_real_suspend( FunctionDecl * );
+                Statement * make_generator_suspend( SuspendStmt * );
+                Statement * make_coroutine_suspend( SuspendStmt * );
+                struct LabelPair { // a label together with its position in `labels`
+                        Label obj;
+                        int   idx; // 1-based: make_label fills it with labels.size() taken after the push
+                };
+                LabelPair make_label() { // creates a fresh "generator" label, remembers it, and returns it with its index
+                        labels.push_back( gen.newLabel("generator") );
+                        return { labels.back(), int(labels.size()) };
+                }
+                DeclarationWithType * in_generator = nullptr;
+                FunctionDecl * decl_suspend = nullptr;
+                std::vector<Label> labels; // every label handed out by make_label, in creation order
+                ControlStruct::LabelGenerator & gen = *ControlStruct::LabelGenerator::getGenerator(); // label source used by make_label
         };
 …
                 std::list<DeclarationWithType*> findMutexArgs( FunctionDecl*, bool & first );
                 void validate( DeclarationWithType * );
+                void addDtorStatments( FunctionDecl* func, CompoundStmt *, const std::list<DeclarationWithType * > &);
+                void addStatments( FunctionDecl* func, CompoundStmt *, const std::list<DeclarationWithType * > &);
+                void addDtorStatements( FunctionDecl* func, CompoundStmt *, const std::list<DeclarationWithType * > &);
+                void addStatements( FunctionDecl* func, CompoundStmt *, const std::list<DeclarationWithType * > &);
+                void addThreadDtorStatements( FunctionDecl* func, CompoundStmt * body, const std::list<DeclarationWithType * > & args );
                 static void implement( std::list< Declaration * > & translationUnit ) {
 …
                 StructDecl* guard_decl = nullptr;
                 StructDecl* dtor_guard_decl = nullptr;
+                StructDecl* thread_guard_decl = nullptr;
                 static std::unique_ptr< Type > generic_func;
 …
                 CoroutineKeyword        ::implement( translationUnit );
                 MonitorKeyword  ::implement( translationUnit );
+                GeneratorKeyword  ::implement( translationUnit );
+                SuspendKeyword    ::implement( translationUnit );
+        }
 …
                         handle( decl );
+                }
+                else if ( !except_decl && exception_name == decl->name && decl->body ) {
+                        except_decl = decl;
+                }
+                else if ( !vtable_decl && vtable_name == decl->name && decl->body ) {
+                        vtable_decl = decl;
+                }
+                // Might be able to get rid of is_target.
+                assert( is_target(decl) == (cast_target == decl->kind) );
                 return decl;
+        }
         DeclarationWithType * ConcurrentSueKeyword::postmutate( FunctionDecl * decl ) {
+                if( !type_decl ) return decl;
+                if( !CodeGen::isDestructor( decl->name ) ) return decl;
+                auto params = decl->type->parameters;
+                if( params.size() != 1 ) return decl;
+                auto type = dynamic_cast<ReferenceType*>( params.front()->get_type() );
+                if( !type ) return decl;
+                auto stype = dynamic_cast<StructInstType*>( type->base );
+                if( !stype ) return decl;
+                if( stype->baseStruct != type_decl ) return decl;
+                if( !dtor_decl ) dtor_decl = decl;
+                if ( type_decl && isDestructorFor( decl, type_decl ) )
+                        dtor_decl = decl;
+                else if ( vtable_name.empty() )
+                        ;
+                else if( !decl->has_body() )
+                        ;
+                else if ( auto param = isMainFor( decl, cast_target ) ) {
+                        // This should never trigger.
+                        assert( vtable_decl );
+                        // Should be safe because of isMainFor.
+                        StructInstType * struct_type = static_cast<StructInstType *>(
+                                static_cast<ReferenceType *>( param->get_type() )->base );
+                        assert( struct_type );
+                        std::list< Expression * > poly_args = { new TypeExpr( struct_type->clone() ) };
+                        ObjectDecl * vtable_object = Virtual::makeVtableInstance(
+                                vtable_decl->makeInst( poly_args ), struct_type, nullptr );
+                        declsToAddAfter.push_back( vtable_object );
+                        declsToAddAfter.push_back( Virtual::makeGetExceptionFunction(
+                                vtable_object, except_decl->makeInst( std::move( poly_args ) )
+                        ) );
+                }
                 return decl;
+        }
 …
                 if( !dtor_decl ) SemanticError( decl, context_error );
+                addVtableForward( decl );
                 FunctionDecl * func = forwardDeclare( decl );
                 ObjectDecl * field = addField( decl );
                 addRoutines( field, func );
+        }
+        void ConcurrentSueKeyword::addVtableForward( StructDecl * decl ) { // Queue forward declarations for decl's vtable support, or report an error when a vtable is wanted but its type was not found.
+                if ( vtable_decl ) {
+                        std::list< Expression * > poly_args = { // the struct's own type is the single polymorphic argument
+                                new TypeExpr( new StructInstType( noQualifiers, decl ) ),
+                        };
+                        declsToAddBefore.push_back( Virtual::makeGetExceptionForward(
+                                vtable_decl->makeInst( poly_args ),
+                                except_decl->makeInst( poly_args ) // NOTE(review): except_decl is dereferenced unchecked; presumably it is always found together with vtable_decl - confirm
+                        ) );
+                        declsToAddBefore.push_back( Virtual::makeVtableForward(
+                                vtable_decl->makeInst( move( poly_args ) ) ) ); // last use of poly_args, so it is moved
+                // It's only an error if we want a vtable and don't have one.
+                } else if ( ! vtable_name.empty() ) {
+                        SemanticError( decl, context_error );
+                }
+        }
 …
                                                 new CastExpr(
                                                         new VariableExpr( func->get_functionType()->get_parameters().front() ),
+                                                        func->get_functionType()->get_parameters().front()->get_type()->stripReferences()->clone()
+                                                        func->get_functionType()->get_parameters().front()->get_type()->stripReferences()->clone(),
+                                                        false
+                                                )
+                                        )
 …
                 declsToAddAfter.push_back( get_decl );
+                // get_decl->fixUniqueId();
+        }
+        }
+        //=============================================================================================
+        // Suspend keyword implementation
+        //=============================================================================================
+        bool SuspendKeyword::is_real_suspend( FunctionDecl * func ) { // true only if func passes every check below: linkage, name, no parameters, no return values
+                if(isMangled(func->linkage)) return false; // the real suspend isn't mangled
+                if(func->name != "__cfactx_suspend") return false; // the real suspend has a specific name
+                if(func->type->parameters.size() != 0) return false; // Too many parameters: the real suspend takes none
+                if(func->type->returnVals.size() != 0) return false; // Too many return values: the real suspend returns nothing
+                return true;
+        }
+        void SuspendKeyword::premutate( FunctionDecl * func ) { // On entering a function: record the real suspend declaration, or set up generator state when func is a generator main.
+                GuardValue(in_generator); // presumably restores the previous value once this function has been visited - confirm against WithGuards
+                in_generator = nullptr; // assume "not in a generator" until the isMainFor check below succeeds
+                // Is this the real suspend?
+                if(is_real_suspend(func)) {
+                        decl_suspend = decl_suspend ? decl_suspend : func; // keep the first matching declaration, ignore later ones
+                        return;
+                }
+                // Is this the main of a generator?
+                auto param = isMainFor( func, AggregateDecl::Aggregate::Generator );
+                if(!param) return;
+                if(func->type->returnVals.size() != 0) SemanticError(func->location, "Generator main must return void");
+                in_generator = param; // remember what isMainFor returned; the suspend lowering builds member accesses on it
+                GuardValue(labels);
+                labels.clear(); // each generator main starts with an empty label list
+        }
+        DeclarationWithType * SuspendKeyword::postmutate( FunctionDecl * func ) { // For a generator main that collected labels, wrap the body with a label table and a goto that jumps to the entry selected by __generator_state.
+                if( !func->statements ) return func; // Not the actual definition, don't do anything
+                if( !in_generator     ) return func; // Not in a generator, don't do anything
+                if( labels.empty()    ) return func; // Generator has no states, nothing to do, could throw a warning
+                // This is a generator main, we need to add the following code to the top
+                // static void * __generator_labels[] = {&&s0, &&s1, ...};
+                // goto * __generator_labels[gen.__generator_state];
+                const auto & loc = func->location;
+                const auto first_label = gen.newLabel("generator");
+                // for each label add to declaration; entry 0 is first_label, which is attached to the no-op placed right before the original body
+                std::list<Initializer*> inits = { new SingleInit( new LabelAddressExpr( first_label ) ) };
+                for(const auto & label : labels) {
+                        inits.push_back(
+                                new SingleInit(
+                                        new LabelAddressExpr( label )
+                                )
+                        );
+                }
+                auto init = new ListInit(std::move(inits), noDesignators, true);
+                labels.clear(); // the collected labels are now captured in init
+                // create decl: static array of void pointers named __generator_labels, initialised with the label addresses
+                auto decl = new ObjectDecl(
+                        "__generator_labels",
+                        Type::StorageClasses( Type::Static ),
+                        LinkageSpec::AutoGen,
+                        nullptr,
+                        new ArrayType(
+                                Type::Qualifiers(),
+                                new PointerType(
+                                        Type::Qualifiers(),
+                                        new VoidType( Type::Qualifiers() )
+                                ),
+                                nullptr,
+                                false, false
+                        ),
+                        init
+                );
+                // create the goto target variable: void * __generator_label = __generator_labels[ <in_generator>.__generator_state ];
+                assert(in_generator);
+                auto go_decl = new ObjectDecl(
+                        "__generator_label",
+                        noStorageClasses,
+                        LinkageSpec::AutoGen,
+                        nullptr,
+                        new PointerType(
+                                Type::Qualifiers(),
+                                new VoidType( Type::Qualifiers() )
+                        ),
+                        new SingleInit(
+                                new UntypedExpr(
+                                        new NameExpr("?[?]"),
+                                        {
+                                                new NameExpr("__generator_labels"),
+                                                new UntypedMemberExpr(
+                                                        new NameExpr("__generator_state"),
+                                                        new VariableExpr( in_generator )
+                                                )
+                                        }
+                                )
+                        )
+                );
+                go_decl->location = loc;
+                auto go = new BranchStmt( // goto whose target is the value of __generator_label
+                        new VariableExpr( go_decl ),
+                        BranchStmt::Goto
+                );
+                go->location = loc;
+                go->computedTarget->location = loc;
+                auto noop = new NullStmt({ first_label }); // empty statement carrying first_label
+                noop->location = loc;
+                // wrap everything in a nice compound: table, target variable, goto, first label, then the original body
+                auto body = new CompoundStmt({
+                        new DeclStmt( decl ),
+                        new DeclStmt( go_decl ),
+                        go,
+                        noop,
+                        func->statements
+                });
+                body->location   = loc;
+                func->statements = body;
+                return func;
+        }
+        Statement * SuspendKeyword::postmutate( SuspendStmt * stmt ) { // Replace a suspend statement with its generator or coroutine lowering.
+                SuspendStmt::Type type = stmt->type;
+                if(type == SuspendStmt::None) {
+                        // This suspend has an implicit target, find it: generator when inside a generator main, coroutine otherwise
+                        type = in_generator ? SuspendStmt::Generator : SuspendStmt::Coroutine;
+                }
+                // Check that the target makes sense
+                if(!in_generator && type == SuspendStmt::Generator) SemanticError( stmt->location, "'suspend generator' must be used inside main of generator type.");
+                // Act appropriately
+                switch(type) {
+                        case SuspendStmt::Generator: return make_generator_suspend(stmt);
+                        case SuspendStmt::Coroutine: return make_coroutine_suspend(stmt);
+                        default: abort(); // any other type value is treated as unreachable
+                }
+        }
+        Statement * SuspendKeyword::make_generator_suspend( SuspendStmt * stmt ) { // Lower a generator suspend into: state store, optional 'then' block, return, and a labelled no-op; consumes stmt.
+                assert(in_generator);
+                // Target code is :
+                //   gen.__generator_state = X;
+                //   { THEN }
+                //   return;
+                //   __gen_X:;
+                // Save the location; the statement itself is deleted further down, once its 'then' block has been detached
+                auto loc = stmt->location;
+                // Build the label and get its index
+                auto label = make_label();
+                // Create the context saving statement: <in_generator>.__generator_state = label.idx
+                auto save = new ExprStmt( new UntypedExpr(
+                        new NameExpr( "?=?" ),
+                        {
+                                new UntypedMemberExpr(
+                                        new NameExpr("__generator_state"),
+                                        new VariableExpr( in_generator )
+                                ),
+                                new ConstantExpr(
+                                        Constant::from_int( label.idx )
+                                )
+                        }
+                ));
+                assert(save->expr);
+                save->location = loc;
+                stmtsToAddBefore.push_back( save );
+                // if we have a then add it here
+                auto then = stmt->then;
+                stmt->then = nullptr; // detach before the delete so the 'then' block can be reused (presumably the statement owns it - confirm)
+                delete stmt;
+                if(then) stmtsToAddBefore.push_back( then );
+                // Create the return statement
+                auto ret = new ReturnStmt( nullptr );
+                ret->location = loc;
+                stmtsToAddBefore.push_back( ret );
+                // Create the null statement with the created label
+                auto noop = new NullStmt({ label.obj });
+                noop->location = loc;
+                // Return the null statement to take the place of the previous statement
+                return noop;
+        }
+        Statement * SuspendKeyword::make_coroutine_suspend( SuspendStmt * stmt ) { // Lower a coroutine suspend into a call of the recorded suspend function (decl_suspend); consumes stmt.
+                if(stmt->then) SemanticError( stmt->location, "Compound statement following coroutines is not implemented.");
+                // Save the location and delete the old statement, we only need the location from this point on
+                auto loc = stmt->location;
+                delete stmt;
+                // Create the call expression; decl_suspend is only set once premutate has seen the real suspend declaration
+                if(!decl_suspend) SemanticError( loc, "suspend keyword applied to coroutines requires coroutines to be in scope, add #include <coroutine.hfa>\n");
+                auto expr = new UntypedExpr( VariableExpr::functionPointer( decl_suspend ) );
+                expr->location = loc;
+                // Change this statement into a regular expr
+                assert(expr); // NOTE(review): expr was already dereferenced two lines up, so this assert cannot catch anything new
+                auto nstmt = new ExprStmt( expr );
+                nstmt->location = loc;
+                return nstmt;
+        }
         //=============================================================================================
 …
                 bool first = false;
                 std::list<DeclarationWithType*> mutexArgs = findMutexArgs( decl, first );
                 bool isDtor = CodeGen::isDestructor( decl->name );
+                bool const isDtor = CodeGen::isDestructor( decl->name );
                 // Is this function relevant to monitors
 …
                 // Instrument the body
+                if( isDtor ) {
+                        addDtorStatments( decl, body, mutexArgs );
+                if ( isDtor && isThread( mutexArgs.front() ) ) {
+                        if( !thread_guard_decl ) {
+                                SemanticError( decl, "thread destructor requires threads to be in scope, add #include <thread.hfa>\n" );
+                        }
+                        addThreadDtorStatements( decl, body, mutexArgs );
+                }
+                else if ( isDtor ) {
+                        addDtorStatements( decl, body, mutexArgs );
+                }
                 else {
                         addStatments( decl, body, mutexArgs );
+                        addStatements( decl, body, mutexArgs );
+                }
+        }
 …
                         assert( !dtor_guard_decl );
                         dtor_guard_decl = decl;
+                }
+                else if( decl->name == "thread_dtor_guard_t" && decl->body ) {
+                        assert( !thread_guard_decl );
+                        thread_guard_decl = decl;
+                }
+        }
 …
+        }
         void MutexKeyword::addDtorStatments( FunctionDecl* func, CompoundStmt * body, const std::list<DeclarationWithType * > & args ) {
+        void MutexKeyword::addDtorStatements( FunctionDecl* func, CompoundStmt * body, const std::list<DeclarationWithType * > & args ) {
                 Type * arg_type = args.front()->get_type()->clone();
                 arg_type->set_mutex( false );
 …
                         new SingleInit( new UntypedExpr(
                                 new NameExpr( "get_monitor" ),
                                 {  new CastExpr( new VariableExpr( args.front() ), arg_type ) }
+                                {  new CastExpr( new VariableExpr( args.front() ), arg_type, false ) }
                         ))
                 );
 …
+                                        {
                                                 new SingleInit( new AddressExpr( new VariableExpr( monitors ) ) ),
+                                                new SingleInit( new CastExpr( new VariableExpr( func ), generic_func->clone() ) )
+                                                new SingleInit( new CastExpr( new VariableExpr( func ), generic_func->clone(), false ) ),
+                                                new SingleInit( new ConstantExpr( Constant::from_bool( false ) ) )
                                         },
                                         noDesignators,
 …
                 //$monitor * __monitors[] = { get_monitor(a), get_monitor(b) };
+                body->push_front( new DeclStmt( monitors) );
+        }
+        void MutexKeyword::addStatments( FunctionDecl* func, CompoundStmt * body, const std::list<DeclarationWithType * > & args ) {
+                body->push_front( new DeclStmt( monitors ) );
+        }
+        void MutexKeyword::addThreadDtorStatements( // Prepend a thread_dtor_guard_t local named __guard to body, initialised from the single mutex argument.
+                        FunctionDecl*, CompoundStmt * body,
+                        const std::list<DeclarationWithType * > & args ) {
+                assert( args.size() == 1 ); // exactly one mutex argument is expected here
+                DeclarationWithType * arg = args.front();
+                Type * arg_type = arg->get_type()->clone();
+                assert( arg_type->get_mutex() );
+                arg_type->set_mutex( false ); // the cast below targets the argument's type with the mutex qualifier cleared
+                // thread_dtor_guard_t __guard = { this, intptr( 0 ) };
+                body->push_front(
+                        new DeclStmt( new ObjectDecl(
+                                "__guard",
+                                noStorageClasses,
+                                LinkageSpec::Cforall,
+                                nullptr,
+                                new StructInstType(
+                                        noQualifiers,
+                                        thread_guard_decl
+                                ),
+                                new ListInit(
+                                        {
+                                                new SingleInit( new CastExpr( new VariableExpr( arg ), arg_type ) ), // NOTE(review): other CastExpr calls in this changeset pass an explicit third argument (false); confirm the two-argument form is intended
+                                                new SingleInit( new UntypedExpr(
+                                                        new NameExpr( "intptr" ), {
+                                                                new ConstantExpr( Constant::from_int( 0 ) ),
+                                                        }
+                                                ) ),
+                                        },
+                                        noDesignators,
+                                        true
+                                )
+                        ))
+                );
+        }
+        void MutexKeyword::addStatements( FunctionDecl* func, CompoundStmt * body, const std::list<DeclarationWithType * > & args ) {
                 ObjectDecl * monitors = new ObjectDecl(
                         "__monitors",
 …
                                         return new SingleInit( new UntypedExpr(
                                                 new NameExpr( "get_monitor" ),
                                                 {  new CastExpr( new VariableExpr( var ), type ) }
+                                                {  new CastExpr( new VariableExpr( var ), type, false ) }
                                         ) );
                                 })
 …
                                                 new SingleInit( new VariableExpr( monitors ) ),
                                                 new SingleInit( new ConstantExpr( Constant::from_ulong( args.size() ) ) ),
                                                 new SingleInit( new CastExpr( new VariableExpr( func ), generic_func->clone() ) )
+                                                new SingleInit( new CastExpr( new VariableExpr( func ), generic_func->clone(), false ) )
                                         },
                                         noDesignators,
 …
 // tab-width: 4 //
 // End: //

src/Concurrency/Waitfor.cc

-              r3c64c668
+              r58fe85a
                                                                 decl_monitor
+                                                        )
+                                                )
+                                                ),
+                                                false
                                         );
 …
                         new CompoundStmt({
                                 makeAccStatement( acceptables, index, "is_dtor", detectIsDtor( clause.target.function )                                    , indexer ),
                                 makeAccStatement( acceptables, index, "func"   , new CastExpr( clause.target.function, fptr_t )                            , indexer ),
+                                makeAccStatement( acceptables, index, "func"   , new CastExpr( clause.target.function, fptr_t, false )                     , indexer ),
                                 makeAccStatement( acceptables, index, "data"   , new VariableExpr( monitors )                                              , indexer ),
                                 makeAccStatement( acceptables, index, "size"   , new ConstantExpr( Constant::from_ulong( clause.target.arguments.size() ) ), indexer ),
 …
                                                                 decl_mask
+                                                        )
+                                                )
+                                                ),
+                                                false
                                         ),
                                         timeout

src/Concurrency/module.mk

r3c64c668	r58fe85a
15	15	###############################################################################
16	16
17		SRC += Concurrency/Keywords.cc Concurrency/~~Waitfor.cc~~
	17	SRC += Concurrency/Keywords.cc Concurrency/Keywords.h Concurrency/Waitfor.cc Concurrency/Waitfor.h
18	18	SRCDEMANGLE += Concurrency/Keywords.cc
19	19

src/ControlStruct/ExceptTranslate.cc

-              r3c64c668
+              r58fe85a
 // Author           : Andrew Beach
 // Created On       : Wed Jun 14 16:49:00 2017
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Fri Dec 13 23:40:15 2019
 // Update Count     : 12
+// Last Modified By : Andrew Beach
+// Last Modified On : Wed Jun 24 11:18:00 2020
+// Update Count     : 17
 //
 …
+        }
+        class ExceptionMutatorCore : public WithGuards {
+                enum Context { NoHandler, TerHandler, ResHandler };
+                // Also need to handle goto, break & continue.
+                // They need to be cut off in a ResHandler, until we enter another
+                // loop, switch or the goto stays within the function.
+                Context cur_context;
+                // The current (innermost) termination handler exception declaration.
+                ObjectDecl * handler_except_decl;
+        class ThrowMutatorCore : public WithGuards { // Mutator core that rewrites ThrowStmt nodes while tracking which kind of catch handler is being visited.
+                ObjectDecl * terminate_handler_except; // exception declaration of the termination handler being visited; set in premutate( CatchStmt * )
+                enum Context { NoHandler, TerHandler, ResHandler } cur_context; // NoHandler initially; TerHandler / ResHandler while inside a catch / catchResume
+                // The helper functions for code/syntree generation.
+                Statement * create_either_throw(
+                        ThrowStmt * throwStmt, const char * throwFunc );
+                Statement * create_terminate_rethrow( ThrowStmt * throwStmt );
+        public:
+                ThrowMutatorCore() :
+                        terminate_handler_except( nullptr ),
+                        cur_context( NoHandler )
+                {}
+                void premutate( CatchStmt *catchStmt );
+                Statement * postmutate( ThrowStmt *throwStmt );
+        };
+        // ThrowStmt Mutation Helpers
+        Statement * ThrowMutatorCore::create_either_throw( // Replace a throw that has an expression with a call `throwFunc( expr )`; consumes throwStmt.
+                        ThrowStmt * throwStmt, const char * throwFunc ) {
+                // `throwFunc`( `throwStmt->get_expr()` );
+                UntypedExpr * call = new UntypedExpr( new NameExpr( throwFunc ) );
+                call->get_args().push_back( throwStmt->get_expr() );
+                throwStmt->set_expr( nullptr ); // detach the expression, now used as the call argument, before the delete (presumably the statement owns it - confirm)
+                delete throwStmt;
+                return new ExprStmt( call ); // NOTE(review): throwStmt->labels are not copied here, unlike in create_terminate_rethrow - confirm intended
+        }
+        Statement * ThrowMutatorCore::create_terminate_rethrow( // Replace an expression-less throw with a compound that nulls the handler's exception variable and calls __cfaehm_rethrow_terminate; consumes throwStmt.
+                        ThrowStmt *throwStmt ) {
+                // { `terminate_handler_except` = 0p; __rethrow_terminate(); }
+                assert( nullptr == throwStmt->get_expr() );
+                assert( terminate_handler_except );
+                CompoundStmt * result = new CompoundStmt();
+                result->labels =  throwStmt->labels; // keep the original statement's labels on the replacement
+                result->push_back( new ExprStmt( UntypedExpr::createAssign(
+                        nameOf( terminate_handler_except ),
+                        new ConstantExpr( Constant::null( // null constant of the handler variable's own type
+                                terminate_handler_except->get_type()->clone()
+                                ) )
+                        ) ) );
+                result->push_back( new ExprStmt(
+                        new UntypedExpr( new NameExpr( "__cfaehm_rethrow_terminate" ) )
+                        ) );
+                delete throwStmt;
+                return result;
+        }
+        // Visiting/Mutating Functions
+        void ThrowMutatorCore::premutate( CatchStmt *catchStmt ) { // Validate the handler's declaration and record which handler context its body is visited in.
+                // Validate the statement's form.
+                ObjectDecl * decl = dynamic_cast<ObjectDecl *>( catchStmt->get_decl() );
+                // Also checking the type would be nice. (Currently only "is an object declaration of pointer type" is checked.)
+                if ( !decl || !dynamic_cast<PointerType *>( decl->type ) ) {
+                        std::string kind = (CatchStmt::Terminate == catchStmt->kind) ? "catch" : "catchResume";
+                        SemanticError( catchStmt->location, kind + " must have pointer to an exception type" );
+                }
+                // Track the handler context.
+                GuardValue( cur_context );
+                if ( CatchStmt::Terminate == catchStmt->get_kind() ) {
+                        cur_context = TerHandler;
+                        GuardValue( terminate_handler_except ); // only guarded and replaced for termination handlers; resumption handlers leave it untouched
+                        terminate_handler_except = decl;
+                } else {
+                        cur_context = ResHandler;
+                }
+        }
+        Statement * ThrowMutatorCore::postmutate( ThrowStmt *throwStmt ) { // Dispatch on throw kind and on whether an expression is present; aborts on a rethrow outside a matching handler.
+                // Ignoring throwStmt->get_target() for now.
+                if ( ThrowStmt::Terminate == throwStmt->get_kind() ) {
+                        if ( throwStmt->get_expr() ) {
+                                return create_either_throw( throwStmt, "$throw" );
+                        } else if ( TerHandler == cur_context ) { // expression-less throw inside a termination handler
+                                return create_terminate_rethrow( throwStmt );
+                        } else {
+                                abort("Invalid throw in %s at %i\n",
+                                        throwStmt->location.filename.c_str(),
+                                        throwStmt->location.first_line);
+                        }
+                } else {
+                        if ( throwStmt->get_expr() ) {
+                                return create_either_throw( throwStmt, "$throwResume" );
+                        } else if ( ResHandler == cur_context ) { // expression-less throwResume inside a resumption handler
+                                // This has to be handled later. The statement is returned unchanged.
+                                return throwStmt;
+                        } else {
+                                abort("Invalid throwResume in %s at %i\n",
+                                        throwStmt->location.filename.c_str(),
+                                        throwStmt->location.first_line);
+                        }
+                }
+        }
+        class TryMutatorCore {
                 // The built in types used in translation.
                 StructDecl * except_decl;
 …
                 // The many helper functions for code/syntree generation.
-                Statement * create_given_throw(
-                        const char * throwFunc, ThrowStmt * throwStmt );
-                Statement * create_terminate_throw( ThrowStmt * throwStmt );
-                Statement * create_terminate_rethrow( ThrowStmt * throwStmt );
-                Statement * create_resume_throw( ThrowStmt * throwStmt );
-                Statement * create_resume_rethrow( ThrowStmt * throwStmt );
                 CompoundStmt * take_try_block( TryStmt * tryStmt );
                 FunctionDecl * create_try_wrapper( CompoundStmt * body );
 …
                 FunctionDecl * create_finally_wrapper( TryStmt * tryStmt );
                 ObjectDecl * create_finally_hook( FunctionDecl * finally_wrapper );
+                Statement * create_resume_rethrow( ThrowStmt * throwStmt );
                 // Types used in translation, make sure to use clone.
 …
         public:
+                ExceptionMutatorCore() :
+                        cur_context( NoHandler ),
+                        handler_except_decl( nullptr ),
+                TryMutatorCore() :
                         except_decl( nullptr ), node_decl( nullptr ), hook_decl( nullptr ),
                         try_func_t( noQualifiers, false ),
 …
                 {}
-                void premutate( CatchStmt *catchStmt );
                 void premutate( StructDecl *structDecl );
+                Statement * postmutate( TryStmt *tryStmt );
                 Statement * postmutate( ThrowStmt *throwStmt );
-                Statement * postmutate( TryStmt *tryStmt );
         };
         void ExceptionMutatorCore::init_func_types() {
+        void TryMutatorCore::init_func_types() {
                 assert( except_decl );
 …
+        }
-        // ThrowStmt Mutation Helpers
-        Statement * ExceptionMutatorCore::create_given_throw(
-                        const char * throwFunc, ThrowStmt * throwStmt ) {
-                // `throwFunc`( `throwStmt->get_name` );
-                UntypedExpr * call = new UntypedExpr( new NameExpr( throwFunc ) );
-                call->get_args().push_back( throwStmt->get_expr() );
-                throwStmt->set_expr( nullptr );
-                delete throwStmt;
-                return new ExprStmt( call );
+        }
-        Statement * ExceptionMutatorCore::create_terminate_throw(
-                        ThrowStmt *throwStmt ) {
-                // __throw_terminate( `throwStmt->get_name()` ); }
-                return create_given_throw( "__cfaabi_ehm__throw_terminate", throwStmt );
+        }
-        Statement * ExceptionMutatorCore::create_terminate_rethrow(
-                        ThrowStmt *throwStmt ) {
-                // { `handler_except_decl` = NULL; __rethrow_terminate(); }
-                assert( nullptr == throwStmt->get_expr() );
-                assert( handler_except_decl );
-                CompoundStmt * result = new CompoundStmt();
-                result->labels =  throwStmt->labels;
-                result->push_back( new ExprStmt( UntypedExpr::createAssign(
-                        nameOf( handler_except_decl ),
-                        new ConstantExpr( Constant::null(
-                                new PointerType(
-                                        noQualifiers,
-                                        handler_except_decl->get_type()->clone()
+                                        )
-                                ) )
-                        ) ) );
-                result->push_back( new ExprStmt(
-                        new UntypedExpr( new NameExpr( "__cfaabi_ehm__rethrow_terminate" ) )
-                        ) );
-                delete throwStmt;
-                return result;
+        }
-        Statement * ExceptionMutatorCore::create_resume_throw(
-                        ThrowStmt *throwStmt ) {
-                // __throw_resume( `throwStmt->get_name` );
-                return create_given_throw( "__cfaabi_ehm__throw_resume", throwStmt );
+        }
-        Statement * ExceptionMutatorCore::create_resume_rethrow(
-                        ThrowStmt *throwStmt ) {
-                // return false;
-                Statement * result = new ReturnStmt(
-                        new ConstantExpr( Constant::from_bool( false ) )
-                        );
-                result->labels = throwStmt->labels;
-                delete throwStmt;
-                return result;
+        }
         // TryStmt Mutation Helpers
         CompoundStmt * ExceptionMutatorCore::take_try_block( TryStmt *tryStmt ) {
+        CompoundStmt * TryMutatorCore::take_try_block( TryStmt *tryStmt ) {
                 CompoundStmt * block = tryStmt->get_block();
                 tryStmt->set_block( nullptr );
 …
+        }
         FunctionDecl * ExceptionMutatorCore::create_try_wrapper(
+        FunctionDecl * TryMutatorCore::create_try_wrapper(
                         CompoundStmt *body ) {
 …
+        }
         FunctionDecl * ExceptionMutatorCore::create_terminate_catch(
+        FunctionDecl * TryMutatorCore::create_terminate_catch(
                         CatchList &handlers ) {
                 std::list<CaseStmt *> handler_wrappers;
 …
                         local_except->get_attributes().push_back( new Attribute(
                                 "cleanup",
                                 { new NameExpr( "__cfaabi_ehm__cleanup_terminate" ) }
+                                { new NameExpr( "__cfaehm_cleanup_terminate" ) }
                                 ) );
 …
         // Create a single check from a modified handler.
         // except_obj is referenced, modded_handler will be freed.
         CompoundStmt * ExceptionMutatorCore::create_single_matcher(
+        CompoundStmt * TryMutatorCore::create_single_matcher(
                         DeclarationWithType * except_obj, CatchStmt * modded_handler ) {
                 // {
 …
+        }
         FunctionDecl * ExceptionMutatorCore::create_terminate_match(
+        FunctionDecl * TryMutatorCore::create_terminate_match(
                         CatchList &handlers ) {
                 // int match(exception * except) {
 …
+        }
         CompoundStmt * ExceptionMutatorCore::create_terminate_caller(
+        CompoundStmt * TryMutatorCore::create_terminate_caller(
                         FunctionDecl * try_wrapper,
                         FunctionDecl * terminate_catch,
                         FunctionDecl * terminate_match ) {
                 // { __cfaabi_ehm__try_terminate(`try`, `catch`, `match`); }
+                // { __cfaehm_try_terminate(`try`, `catch`, `match`); }
                 UntypedExpr * caller = new UntypedExpr( new NameExpr(
                         "__cfaabi_ehm__try_terminate" ) );
+                        "__cfaehm_try_terminate" ) );
                 std::list<Expression *>& args = caller->get_args();
                 args.push_back( nameOf( try_wrapper ) );
 …
+        }
         FunctionDecl * ExceptionMutatorCore::create_resume_handler(
+        FunctionDecl * TryMutatorCore::create_resume_handler(
                         CatchList &handlers ) {
                 // bool handle(exception * except) {
 …
+        }
         CompoundStmt * ExceptionMutatorCore::create_resume_wrapper(
+        CompoundStmt * TryMutatorCore::create_resume_wrapper(
                         Statement * wraps,
                         FunctionDecl * resume_handler ) {
 …
                 // struct __try_resume_node __resume_node
                 //      __attribute__((cleanup( __cfaabi_ehm__try_resume_cleanup )));
+                //      __attribute__((cleanup( __cfaehm_try_resume_cleanup )));
                 // ** unwinding of the stack here could cause problems **
                 // ** however I don't think that can happen currently **
                 // __cfaabi_ehm__try_resume_setup( &__resume_node, resume_handler );
+                // __cfaehm_try_resume_setup( &__resume_node, resume_handler );
                 std::list< Attribute * > attributes;
 …
                         std::list< Expression * > attr_params;
                         attr_params.push_back( new NameExpr(
                                 "__cfaabi_ehm__try_resume_cleanup" ) );
+                                "__cfaehm_try_resume_cleanup" ) );
                         attributes.push_back( new Attribute( "cleanup", attr_params ) );
+                }
 …
                 UntypedExpr *setup = new UntypedExpr( new NameExpr(
                         "__cfaabi_ehm__try_resume_setup" ) );
+                        "__cfaehm_try_resume_setup" ) );
                 setup->get_args().push_back( new AddressExpr( nameOf( obj ) ) );
                 setup->get_args().push_back( nameOf( resume_handler ) );
 …
+        }
         FunctionDecl * ExceptionMutatorCore::create_finally_wrapper(
+        FunctionDecl * TryMutatorCore::create_finally_wrapper(
                         TryStmt * tryStmt ) {
                 // void finally() { <finally code> }
+                // void finally() { `finally->block` }
                 FinallyStmt * finally = tryStmt->get_finally();
                 CompoundStmt * body = finally->get_block();
 …
+        }
         ObjectDecl * ExceptionMutatorCore::create_finally_hook(
+        ObjectDecl * TryMutatorCore::create_finally_hook(
                         FunctionDecl * finally_wrapper ) {
                 // struct __cfaabi_ehm__cleanup_hook __finally_hook
                 //      __attribute__((cleanup( finally_wrapper )));
+                // struct __cfaehm_cleanup_hook __finally_hook
+                //      __attribute__((cleanup( `finally_wrapper` )));
                 // Make Cleanup Attribute.
 …
+        }
+        Statement * TryMutatorCore::create_resume_rethrow( ThrowStmt *throwStmt ) {
+                // return false;
+                Statement * result = new ReturnStmt(
+                        new ConstantExpr( Constant::from_bool( false ) )
+                        );
+                result->labels = throwStmt->labels;
+                delete throwStmt;
+                return result;
+        }
         // Visiting/Mutating Functions
+        void ExceptionMutatorCore::premutate( CatchStmt *catchStmt ) {
+                // Validate the Statement's form.
+                ObjectDecl * decl =
+                        dynamic_cast<ObjectDecl *>( catchStmt->get_decl() );
+                if ( decl && true /* check decl->get_type() */ ) {
+                        // Pass.
+                } else if ( CatchStmt::Terminate == catchStmt->get_kind() ) {
+                        SemanticError(catchStmt->location, "catch must have exception type");
+                } else {
+                        SemanticError(catchStmt->location, "catchResume must have exception type");
+                }
+                // Track the handler context.
+                GuardValue( cur_context );
+                if ( CatchStmt::Terminate == catchStmt->get_kind() ) {
+                        cur_context = TerHandler;
+                        GuardValue( handler_except_decl );
+                        handler_except_decl = decl;
+                } else {
+                        cur_context = ResHandler;
+                }
+        }
+        void ExceptionMutatorCore::premutate( StructDecl *structDecl ) {
+        void TryMutatorCore::premutate( StructDecl *structDecl ) {
                 if ( !structDecl->has_body() ) {
                         // Skip children?
                         return;
                 } else if ( structDecl->get_name() == "__cfaabi_ehm__base_exception_t" ) {
+                } else if ( structDecl->get_name() == "__cfaehm_base_exception_t" ) {
                         assert( nullptr == except_decl );
                         except_decl = structDecl;
                         init_func_types();
                 } else if ( structDecl->get_name() == "__cfaabi_ehm__try_resume_node" ) {
+                } else if ( structDecl->get_name() == "__cfaehm_try_resume_node" ) {
                         assert( nullptr == node_decl );
                         node_decl = structDecl;
                 } else if ( structDecl->get_name() == "__cfaabi_ehm__cleanup_hook" ) {
+                } else if ( structDecl->get_name() == "__cfaehm_cleanup_hook" ) {
                         assert( nullptr == hook_decl );
                         hook_decl = structDecl;
+                }
+                // Later we might get the exception type as well.
+        }
+        Statement * ExceptionMutatorCore::postmutate( ThrowStmt *throwStmt ) {
+                assert( except_decl );
+                // Ignoring throwStmt->get_target() for now.
+                if ( ThrowStmt::Terminate == throwStmt->get_kind() ) {
+                        if ( throwStmt->get_expr() ) {
+                                return create_terminate_throw( throwStmt );
+                        } else if ( TerHandler == cur_context ) {
+                                return create_terminate_rethrow( throwStmt );
+                        } else {
+                                abort("Invalid throw in %s at %i\n",
+                                        throwStmt->location.filename.c_str(),
+                                        throwStmt->location.first_line);
+                        }
+                } else {
+                        if ( throwStmt->get_expr() ) {
+                                return create_resume_throw( throwStmt );
+                        } else if ( ResHandler == cur_context ) {
+                                return create_resume_rethrow( throwStmt );
+                        } else {
+                                abort("Invalid throwResume in %s at %i\n",
+                                        throwStmt->location.filename.c_str(),
+                                        throwStmt->location.first_line);
+                        }
+                }
+        }
+        Statement * ExceptionMutatorCore::postmutate( TryStmt *tryStmt ) {
+        }
+        Statement * TryMutatorCore::postmutate( TryStmt *tryStmt ) {
                 assert( except_decl );
                 assert( node_decl );
 …
+        }
+        void translateEHM( std::list< Declaration *> & translationUnit ) {
+                PassVisitor<ExceptionMutatorCore> translator;
+        Statement * TryMutatorCore::postmutate( ThrowStmt *throwStmt ) {
+                // Only valid `throwResume;` statements should remain. (2/3 checks)
+                assert( ThrowStmt::Resume == throwStmt->kind && ! throwStmt->expr );
+                return create_resume_rethrow( throwStmt );
+        }
+        void translateThrows( std::list< Declaration *> & translationUnit ) {
+                PassVisitor<ThrowMutatorCore> translator;
                 mutateAll( translationUnit, translator );
+        }
+        void translateTries( std::list< Declaration *> & translationUnit ) {
+                PassVisitor<TryMutatorCore> translator;
+                mutateAll( translationUnit, translator );
+        }
+}

src/ControlStruct/ExceptTranslate.h

-              r3c64c668
+              r58fe85a
 // Author           : Andrew Beach
 // Created On       : Tue Jun 06 10:13:00 2017
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Sat Jul 22 09:19:23 2017
 // Update Count     : 4
+// Last Modified By : Andrew Beach
+// Last Modified On : Tue May 19 11:47:00 2020
+// Update Count     : 5
 //
 …
 namespace ControlStruct {
+        void translateEHM( std::list< Declaration *> & translationUnit );
+        // Converts exception handling structures into their underlying C code.  Translation does use the exception
+        // handling header, make sure it is visible wherever translation occurs.
+        void translateThrows( std::list< Declaration *> & translationUnit );
+        /* Replaces all throw & throwResume statements with function calls.
+         * These still need to be resolved, so call this before the resolver.
+         */
+        void translateTries( std::list< Declaration *> & translationUnit );
+        /* Replaces all try blocks (and their many clauses) with function definitions and calls.
+         * This uses the exception built-ins to produce typed output and should take place after
+         * the resolver. It also produces virtual casts and should happen before they are expanded.
+         */
+}

src/ControlStruct/module.mk

-              r3c64c668
+              r58fe85a
 SRC_CONTROLSTRUCT = \
         ControlStruct/ForExprMutator.cc \
+        ControlStruct/ForExprMutator.h \
         ControlStruct/LabelFixer.cc \
+        ControlStruct/LabelFixer.h \
         ControlStruct/LabelGenerator.cc \
+        ControlStruct/LabelGenerator.h \
         ControlStruct/MLEMutator.cc \
+        ControlStruct/Mutate.cc
+        ControlStruct/MLEMutator.h \
+        ControlStruct/Mutate.cc \
+        ControlStruct/Mutate.h
 SRC += $(SRC_CONTROLSTRUCT) ControlStruct/ExceptTranslate.cc
+SRC += $(SRC_CONTROLSTRUCT) ControlStruct/ExceptTranslate.cc ControlStruct/ExceptTranslate.h
 SRCDEMANGLE += $(SRC_CONTROLSTRUCT)

src/GenPoly/GenPoly.cc

-              r3c64c668
+              r58fe85a
+                }
+                bool hasPolyParams( const std::vector<ast::ptr<ast::Expr>> & params, const ast::TypeSubstitution * env) {
+                        for (auto &param : params) {
+                                auto paramType = param.strict_as<ast::TypeExpr>();
+                                if (isPolyType(paramType->type, env)) return true;
+                        }
+                        return false;
+                }
                 /// Checks a parameter list for polymorphic parameters from tyVars; will substitute according to env if present
                 bool hasPolyParams( std::list< Expression* >& params, const TyVarMap &tyVars, const TypeSubstitution *env ) {
 …
+                }
+                bool hasPolyParams( const std::vector<ast::ptr<ast::Expr>> & params, const TyVarMap & tyVars, const ast::TypeSubstitution * env) {
+                        for (auto &param : params) {
+                                auto paramType = param.strict_as<ast::TypeExpr>();
+                                if (isPolyType(paramType->type, tyVars, env)) return true;
+                        }
+                        return false;
+                }
                 /// Checks a parameter list for dynamic-layout parameters from tyVars; will substitute according to env if present
                 bool hasDynParams( std::list< Expression* >& params, const TyVarMap &tyVars, const TypeSubstitution *env ) {
 …
                         Type *newType = env->lookup( typeInst->get_name() );
                         if ( newType ) return newType;
+                }
+                return type;
+        }
+        const ast::Type * replaceTypeInst(const ast::Type * type, const ast::TypeSubstitution * env) {
+                if (!env) return type;
+                if (auto typeInst = dynamic_cast<const ast::TypeInstType*> (type)) {
+                        auto newType = env->lookup(typeInst);
+                        if (newType) return newType;
+                }
                 return type;
 …
+        }
+        const ast::Type * isPolyType(const ast::Type * type, const ast::TypeSubstitution * env) {
+                type = replaceTypeInst( type, env );
+                if ( dynamic_cast< const ast::TypeInstType * >( type ) ) {
+                        return type;
+                } else if ( auto arrayType = dynamic_cast< const ast::ArrayType * >( type ) ) {
+                        return isPolyType( arrayType->base, env );
+                } else if ( auto structType = dynamic_cast< const ast::StructInstType* >( type ) ) {
+                        if ( hasPolyParams( structType->params, env ) ) return type;
+                } else if ( auto unionType = dynamic_cast< const ast::UnionInstType* >( type ) ) {
+                        if ( hasPolyParams( unionType->params, env ) ) return type;
+                }
+                return 0;
+        }
         Type *isPolyType( Type *type, const TyVarMap &tyVars, const TypeSubstitution *env ) {
                 type = replaceTypeInst( type, env );
 …
+                }
                 return 0;
+        }
+        const ast::Type * isPolyType(const ast::Type * type, const TyVarMap & tyVars, const ast::TypeSubstitution * env) {
+                type = replaceTypeInst( type, env );
+                if ( auto typeInst = dynamic_cast< const ast::TypeInstType * >( type ) ) {
+                        return tyVars.find(typeInst->typeString()) != tyVars.end() ? type : nullptr;
+                } else if ( auto arrayType = dynamic_cast< const ast::ArrayType * >( type ) ) {
+                        return isPolyType( arrayType->base, env );
+                } else if ( auto structType = dynamic_cast< const ast::StructInstType* >( type ) ) {
+                        if ( hasPolyParams( structType->params, env ) ) return type;
+                } else if ( auto unionType = dynamic_cast< const ast::UnionInstType* >( type ) ) {
+                        if ( hasPolyParams( unionType->params, env ) ) return type;
+                }
+                return nullptr;
+        }
 …
+        }
+        namespace {
+                // temporary hack to avoid re-implementing anything related to TyVarMap
+                // does this work? these two structs have identical definitions.
+                inline TypeDecl::Data convData(const ast::TypeDecl::Data & data) {
+                        return *reinterpret_cast<const TypeDecl::Data *>(&data);
+                }
+        }
         bool needsBoxing( Type * param, Type * arg, const TyVarMap &exprTyVars, const TypeSubstitution * env ) {
                 // if parameter is not polymorphic, don't need to box
 …
+        }
+        bool needsBoxing( const ast::Type * param, const ast::Type * arg, const TyVarMap &exprTyVars, const ast::TypeSubstitution * env) {
+                // if parameter is not polymorphic, don't need to box
+                if ( ! isPolyType( param, exprTyVars ) ) return false;
+                ast::ptr<ast::Type> newType = arg;
+                if ( env ) env->apply( newType );
+                // if the argument's type is polymorphic, we don't need to box again!
+                return ! isPolyType( newType );
+        }
         bool needsBoxing( Type * param, Type * arg, ApplicationExpr * appExpr, const TypeSubstitution * env ) {
                 FunctionType * function = getFunctionType( appExpr->function->result );
 …
+        }
+        bool needsBoxing( const ast::Type * param, const ast::Type * arg, const ast::ApplicationExpr * appExpr, const ast::TypeSubstitution * env) {
+                const ast::FunctionType * function = getFunctionType(appExpr->func->result);
+                assertf( function, "ApplicationExpr has non-function type: %s", toString( appExpr->func->result ).c_str() );
+                TyVarMap exprTyVars(TypeDecl::Data{});
+                makeTyVarMap(function, exprTyVars);
+                return needsBoxing(param, arg, exprTyVars, env);
+        }
         void addToTyVarMap( TypeDecl * tyVar, TyVarMap &tyVarMap ) {
                 tyVarMap.insert( tyVar->name, TypeDecl::Data{ tyVar } );
+        }
+        void addToTyVarMap( const ast::TypeInstType * tyVar, TyVarMap & tyVarMap) {
+                tyVarMap.insert(tyVar->typeString(), convData(ast::TypeDecl::Data{tyVar->base}));
+        }
 …
                 if ( PointerType *pointer = dynamic_cast< PointerType* >( type ) ) {
                         makeTyVarMap( pointer->get_base(), tyVarMap );
+                }
+        }
+        void makeTyVarMap(const ast::Type * type, TyVarMap & tyVarMap) {
+                if (auto ptype = dynamic_cast<const ast::FunctionType *>(type)) {
+                        for (auto & tyVar : ptype->forall) {
+                                assert (tyVar);
+                                addToTyVarMap(tyVar, tyVarMap);
+                        }
+                }
+                if (auto pointer = dynamic_cast<const ast::PointerType *>(type)) {
+                        makeTyVarMap(pointer->base, tyVarMap);
+                }
+        }

src/GenPoly/GenPoly.h

-              r3c64c668
+              r58fe85a
 namespace GenPoly {
         typedef ErasableScopedMap< std::string, TypeDecl::Data > TyVarMap;
         /// Replaces a TypeInstType by its referent in the environment, if applicable
         Type* replaceTypeInst( Type* type, const TypeSubstitution* env );
 …
         /// returns polymorphic type if is polymorphic type, NULL otherwise; will look up substitution in env if provided
         Type *isPolyType( Type *type, const TypeSubstitution *env = 0 );
+        const ast::Type * isPolyType(const ast::Type * type, const ast::TypeSubstitution * env = nullptr);
         /// returns polymorphic type if is polymorphic type in tyVars, NULL otherwise; will look up substitution in env if provided
         Type *isPolyType( Type *type, const TyVarMap &tyVars, const TypeSubstitution *env = 0 );
+        const ast::Type * isPolyType(const ast::Type * type, const TyVarMap & tyVars, const ast::TypeSubstitution * env = nullptr);
         /// returns dynamic-layout type if is dynamic-layout type in tyVars, NULL otherwise; will look up substitution in env if provided
 …
         /// true if arg requires boxing given exprTyVars
         bool needsBoxing( Type * param, Type * arg, const TyVarMap &exprTyVars, const TypeSubstitution * env );
+        bool needsBoxing( const ast::Type * param, const ast::Type * arg, const TyVarMap &exprTyVars, const ast::TypeSubstitution * env);
         /// true if arg requires boxing in the call to appExpr
         bool needsBoxing( Type * param, Type * arg, ApplicationExpr * appExpr, const TypeSubstitution * env );
+        bool needsBoxing( const ast::Type * param, const ast::Type * arg, const ast::ApplicationExpr * appExpr, const ast::TypeSubstitution * env);
         /// Adds the type variable `tyVar` to `tyVarMap`
 …
         /// Adds the declarations in the forall list of type (and its pointed-to type if it's a pointer type) to `tyVarMap`
         void makeTyVarMap( Type *type, TyVarMap &tyVarMap );
+        void makeTyVarMap(const ast::Type * type, TyVarMap & tyVarMap);
         /// Prints type variable map

src/GenPoly/InstantiateGeneric.cc

-              r3c64c668
+              r58fe85a
 // Author           : Aaron B. Moss
 // Created On       : Thu Aug 04 18:33:00 2016
 // Last Modified By : Aaron B. Moss
 // Last Modified On : Thu Aug 04 18:33:00 2016
 // Update Count     : 1
+// Last Modified By : Andrew Beach
+// Last Modified On : Wed Jul 16 10:17:00 2020
+// Update Count     : 2
 //
 #include "InstantiateGeneric.h"
 …
                 InstantiationMap< AggregateDecl, AggregateDecl > instantiations;
                 /// Set of types which are dtype-only generic (and therefore have static layout)
                 ScopedSet< AggregateDecl* > dtypeStatics;
+                std::set<AggregateDecl *> dtypeStatics;
                 /// Namer for concrete types
                 UniqueName typeNamer;
 …
+        }
+        template< typename AggrInst >
+        static AggrInst * asForward( AggrInst * decl ) {
+                if ( !decl->body ) {
+                        return nullptr;
+                }
+                decl = decl->clone();
+                decl->body = false;
+                deleteAll( decl->members );
+                decl->members.clear();
+                return decl;
+        }
         void GenericInstantiator::stripDtypeParams( AggregateDecl *base, std::list< TypeDecl* >& baseParams, const std::list< TypeExpr* >& typeSubs ) {
                 substituteMembers( base->get_members(), baseParams, typeSubs );
 …
                                 concDecl->set_body( inst->get_baseStruct()->has_body() );
                                 substituteMembers( inst->get_baseStruct()->get_members(), *inst->get_baseParameters(), typeSubs, concDecl->get_members() );
+                                insert( inst, typeSubs, concDecl ); // must insert before recursion
+                                // Forward declare before recursion. (TODO: Only when needed, #199.)
+                                insert( inst, typeSubs, concDecl );
+                                if ( StructDecl *forwardDecl = asForward( concDecl ) ) {
+                                        declsToAddBefore.push_back( forwardDecl );
+                                }
                                 concDecl->acceptMutator( *visitor ); // recursively instantiate members
                                 declsToAddBefore.push_back( concDecl ); // must occur before declaration is added so that member instantiations appear first
 …
                                 concDecl->set_body( inst->get_baseUnion()->has_body() );
                                 substituteMembers( inst->get_baseUnion()->get_members(), *inst->get_baseParameters(), typeSubs, concDecl->get_members() );
+                                insert( inst, typeSubs, concDecl ); // must insert before recursion
+                                // Forward declare before recursion. (TODO: Only when needed, #199.)
+                                insert( inst, typeSubs, concDecl );
+                                if ( UnionDecl *forwardDecl = asForward( concDecl ) ) {
+                                        declsToAddBefore.push_back( forwardDecl );
+                                }
                                 concDecl->acceptMutator( *visitor ); // recursively instantiate members
                                 declsToAddBefore.push_back( concDecl ); // must occur before declaration is added so that member instantiations appear first
 …
         void GenericInstantiator::beginScope() {
                 instantiations.beginScope();
-                dtypeStatics.beginScope();
+        }
         void GenericInstantiator::endScope() {
                 instantiations.endScope();
-                dtypeStatics.endScope();
+        }

src/GenPoly/Specialize.cc

-              r3c64c668
+              r58fe85a
 // Author           : Richard C. Bilson
 // Created On       : Mon May 18 07:44:20 2015
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Fri Dec 13 23:40:49 2019
 // Update Count     : 32
+// Last Modified By : Andrew Beach
+// Last Modified On : Thu Jul  2 17:42:00 2020
+// Update Count     : 33
 //
 …
 namespace GenPoly {
+        struct Specialize final : public WithConstTypeSubstitution, public WithStmtsToAdd, public WithVisitorRef<Specialize> {
+        struct Specialize final : public WithConstTypeSubstitution,
+                        public WithDeclsToAdd, public WithVisitorRef<Specialize> {
                 Expression * postmutate( ApplicationExpr *applicationExpr );
                 Expression * postmutate( CastExpr *castExpr );
 …
                 thunkFunc->get_attributes().push_back( new Attribute( "unused" ) );
+                // Thunks at the global level must be static to avoid collisions between files.
+                // (Conversely, thunks inside a function must be unique and not static.)
+                thunkFunc->storageClasses.is_static = !isInFunction();
                 // thread thunk parameters into call to actual function, naming thunk parameters as we go
                 UniqueName paramNamer( paramPrefix );
 …
                 } // if
+                // handle any specializations that may still be present
+                std::string oldParamPrefix = paramPrefix;
+                paramPrefix += "p";
+                // save stmtsToAddBefore in oldStmts
+                std::list< Statement* > oldStmts;
+                oldStmts.splice( oldStmts.end(), stmtsToAddBefore );
+                appExpr->acceptMutator( *visitor );
+                paramPrefix = oldParamPrefix;
+                // write any statements added for recursive specializations into the thunk body
+                thunkFunc->statements->kids.splice( thunkFunc->statements->kids.end(), stmtsToAddBefore );
+                // restore oldStmts into stmtsToAddBefore
+                stmtsToAddBefore.splice( stmtsToAddBefore.end(), oldStmts );
+                // Handle any specializations that may still be present.
+                {
+                        std::string oldParamPrefix = paramPrefix;
+                        paramPrefix += "p";
+                        std::list< Declaration * > oldDecls;
+                        oldDecls.splice( oldDecls.end(), declsToAddBefore );
+                        appExpr->acceptMutator( *visitor );
+                        // Write recursive specializations into the thunk body.
+                        for ( Declaration * decl : declsToAddBefore ) {
+                                thunkFunc->statements->kids.push_back( new DeclStmt( decl ) );
+                        }
+                        declsToAddBefore = std::move( oldDecls );
+                        paramPrefix = oldParamPrefix;
+                }
                 // add return (or valueless expression) to the thunk
 …
                 thunkFunc->statements->kids.push_back( appStmt );
                 // add thunk definition to queue of statements to add
                 stmtsToAddBefore.push_back( new DeclStmt( thunkFunc ) );
+                // Add the thunk definition (converted to DeclStmt if appropriate).
+                declsToAddBefore.push_back( thunkFunc );
                 // return address of thunk function as replacement expression
                 return new AddressExpr( new VariableExpr( thunkFunc ) );

src/GenPoly/module.mk

-              r3c64c668
+              r58fe85a
 SRC += GenPoly/Box.cc \
+       GenPoly/Box.h \
+       GenPoly/ErasableScopedMap.h \
+       GenPoly/FindFunction.cc \
+       GenPoly/FindFunction.h \
        GenPoly/GenPoly.cc \
+       GenPoly/GenPoly.h \
+       GenPoly/InstantiateGeneric.cc \
+       GenPoly/InstantiateGeneric.h \
+       GenPoly/Lvalue.cc \
+       GenPoly/Lvalue.h \
+       GenPoly/ScopedSet.h \
        GenPoly/ScrubTyVars.cc \
        GenPoly/Lvalue.cc \
+       GenPoly/ScrubTyVars.h \
        GenPoly/Specialize.cc \
+       GenPoly/FindFunction.cc \
+       GenPoly/InstantiateGeneric.cc
+       GenPoly/Specialize.h
 SRCDEMANGLE += GenPoly/GenPoly.cc GenPoly/Lvalue.cc
+SRCDEMANGLE += GenPoly/GenPoly.cc GenPoly/GenPoly.h GenPoly/Lvalue.cc GenPoly/Lvalue.h

src/InitTweak/FixGlobalInit.cc

-              r3c64c668
+              r58fe85a
 #include "SynTree/Visitor.h"       // for acceptAll, Visitor
+#include "AST/Expr.hpp"
+#include "AST/Node.hpp"
+#include "AST/Pass.hpp"
 namespace InitTweak {
         class GlobalFixer : public WithShortCircuiting {
 …
                 FunctionDecl * initFunction;
                 FunctionDecl * destroyFunction;
+        };
+        class GlobalFixer_new : public ast::WithShortCircuiting {
+        public:
+                void previsit (const ast::ObjectDecl *);
+                void previsit (const ast::FunctionDecl *) { visit_children = false; }
+                void previsit (const ast::StructDecl *) { visit_children = false; }
+                void previsit (const ast::UnionDecl *) { visit_children = false; }
+                void previsit (const ast::EnumDecl *) { visit_children = false; }
+                void previsit (const ast::TraitDecl *) { visit_children = false; }
+                void previsit (const ast::TypeDecl *) { visit_children = false; }
+                std::list< ast::ptr<ast::Stmt> > initStmts;
+                std::list< ast::ptr<ast::Stmt> > destroyStmts;
         };
 …
+        }
+        void fixGlobalInit(ast::TranslationUnit & translationUnit, bool inLibrary) {
+                ast::Pass<GlobalFixer_new> fixer;
+                accept_all(translationUnit, fixer);
+                if ( !fixer.core.initStmts.empty() ) {
+                        std::vector<ast::ptr<ast::Expr>> ctorParams;
+                        if (inLibrary) ctorParams.emplace_back(ast::ConstantExpr::from_int({}, 200));
+                        auto initFunction = new ast::FunctionDecl({}, "__global_init__", {}, {}, {}, new ast::CompoundStmt({}, std::move(fixer.core.initStmts)),
+                                ast::Storage::Static, ast::Linkage::C, {new ast::Attribute("constructor", std::move(ctorParams))});
+                        translationUnit.decls.emplace_back( initFunction );
+                } // if
+                if ( !fixer.core.destroyStmts.empty() ) {
+                        std::vector<ast::ptr<ast::Expr>> dtorParams;
+                        if (inLibrary) dtorParams.emplace_back(ast::ConstantExpr::from_int({}, 200));
+                        auto destroyFunction = new ast::FunctionDecl({}, "__global_destroy__", {}, {}, {}, new ast::CompoundStmt({}, std::move(fixer.core.destroyStmts)),
+                                ast::Storage::Static, ast::Linkage::C, {new ast::Attribute("destructor", std::move(dtorParams))});
+                        translationUnit.decls.emplace_back(destroyFunction);
+                } // if
+        }
         void GlobalFixer::previsit( ObjectDecl *objDecl ) {
                 std::list< Statement * > & initStatements = initFunction->get_statements()->get_kids();
 …
                         } // if
                         if ( Statement * ctor = ctorInit->ctor ) {
+                                addDataSectonAttribute( objDecl );
                                 initStatements.push_back( ctor );
                                 objDecl->init = nullptr;
 …
+        }
+        void GlobalFixer_new::previsit(const ast::ObjectDecl * objDecl) {
+                auto mutDecl = mutate(objDecl);
+                assertf(mutDecl == objDecl, "Global object decl must be unique");
+                if ( auto ctorInit = objDecl->init.as<ast::ConstructorInit>() ) {
+                        // a decision should have been made by the resolver, so ctor and init are not both non-NULL
+                        assert( ! ctorInit->ctor || ! ctorInit->init );
+                        const ast::Stmt * dtor = ctorInit->dtor;
+                        if ( dtor && ! isIntrinsicSingleArgCallStmt( dtor ) ) {
+                                // don't need to call intrinsic dtor, because it does nothing, but
+                                // non-intrinsic dtors must be called
+                                destroyStmts.push_front( dtor );
+                                // ctorInit->dtor = nullptr;
+                        } // if
+                        if ( const ast::Stmt * ctor = ctorInit->ctor ) {
+                                addDataSectionAttribute(mutDecl);
+                                initStmts.push_back( ctor );
+                                mutDecl->init = nullptr;
+                                // ctorInit->ctor = nullptr;
+                        } else if ( const ast::Init * init = ctorInit->init ) {
+                                mutDecl->init = init;
+                                // ctorInit->init = nullptr;
+                        } else {
+                                // no constructor and no initializer, which is okay
+                                mutDecl->init = nullptr;
+                        } // if
+                        // delete ctorInit;
+                } // if
+        }
         // only modify global variables
         void GlobalFixer::previsit( FunctionDecl * ) { visit_children = false; }

src/InitTweak/FixGlobalInit.h

-              r3c64c668
+              r58fe85a
 #include <string>  // for string
+#include <AST/Fwd.hpp>
 class Declaration;
 …
         /// function is for library code.
         void fixGlobalInit( std::list< Declaration * > & translationUnit, bool inLibrary );
+        void fixGlobalInit( ast::TranslationUnit & translationUnit, bool inLibrary );
 } // namespace

src/InitTweak/FixInit.cc

-              r3c64c668
+              r58fe85a
                 };
                 struct SplitExpressions : public WithShortCircuiting, public WithTypeSubstitution, public WithStmtsToAdd {
+                struct SplitExpressions : public WithShortCircuiting, /*public WithTypeSubstitution, */public WithStmtsToAdd {
                         /// add CompoundStmts around top-level expressions so that temporaries are destroyed in the correct places.
                         static void split( std::list< Declaration * > &translationUnit );
 …
                                 if ( Statement * ctor = ctorInit->get_ctor() ) {
                                         if ( objDecl->get_storageClasses().is_static ) {
+                                                // The object needs to go in the data section, regardless of dtor complexity below.
+                                                // The attribute works, and is meant to apply, both for leaving the static local alone,
+                                                // and for hoisting it out as a static global.
+                                                addDataSectonAttribute( objDecl );
                                                 // originally wanted to take advantage of gcc nested functions, but
                                                 // we get memory errors with this approach. To remedy this, the static

src/InitTweak/FixInit.h

-              r3c64c668
+              r58fe85a
 class Declaration;
+namespace ast {
+        struct TranslationUnit;
+}
 namespace InitTweak {
         /// replace constructor initializers with expression statements and unwrap basic C-style initializers
         void fix( std::list< Declaration * > & translationUnit, bool inLibrary );
+        void fix( ast::TranslationUnit & translationUnit, bool inLibrary);
 } // namespace

src/InitTweak/GenInit.cc

-              r3c64c668
+              r58fe85a
 #include "AST/Node.hpp"
 #include "AST/Stmt.hpp"
+#include "CompilationState.h"
 #include "CodeGen/OperatorTable.h"
 #include "Common/PassVisitor.h"        // for PassVisitor, WithGuards, WithShort...
 …
         };
+        struct HoistArrayDimension_NoResolve final : public WithDeclsToAdd, public WithShortCircuiting, public WithGuards {
+                /// hoist dimension from array types in object declaration so that it uses a single
+                /// const variable of type size_t, so that side effecting array dimensions are only
+                /// computed once.
+                static void hoistArrayDimension( std::list< Declaration * > & translationUnit );
+                void premutate( ObjectDecl * objectDecl );
+                DeclarationWithType * postmutate( ObjectDecl * objectDecl );
+                void premutate( FunctionDecl *functionDecl );
+                // should not traverse into any of these declarations to find objects
+                // that need to be constructed or destructed
+                void premutate( AggregateDecl * ) { visit_children = false; }
+                void premutate( NamedTypeDecl * ) { visit_children = false; }
+                void premutate( FunctionType * ) { visit_children = false; }
+                void hoist( Type * type );
+                Type::StorageClasses storageClasses;
+                bool inFunction = false;
+        };
         void genInit( std::list< Declaration * > & translationUnit ) {
+                if (!useNewAST) {
+                        HoistArrayDimension::hoistArrayDimension( translationUnit );
+                }
+                else {
+                        HoistArrayDimension_NoResolve::hoistArrayDimension( translationUnit );
+                }
                 fixReturnStatements( translationUnit );
+                HoistArrayDimension::hoistArrayDimension( translationUnit );
+                CtorDtor::generateCtorDtor( translationUnit );
+                if (!useNewAST) {
+                        CtorDtor::generateCtorDtor( translationUnit );
+                }
+        }
 …
                         arrayType->isVarLen = ! isConstExpr( arrayType->dimension );
                         // don't need to hoist dimension if it's definitely pure - only need to if there's potential for side effects.
+                        // xxx - hoisting has no side effects anyways, so don't skip since we delay resolve
+                        // still try to detect constant expressions
                         if ( ! Tuples::maybeImpure( arrayType->dimension ) ) return;
 …
         void HoistArrayDimension::premutate( FunctionDecl * ) {
+                GuardValue( inFunction );
+                inFunction = true;
+        }
+        // Array dimension expressions are precomputed here because constructor generation may
+        // duplicate them, which would be wrong for a side-effecting computation.
+        void HoistArrayDimension_NoResolve::hoistArrayDimension( std::list< Declaration * > & translationUnit ) {
+                PassVisitor<HoistArrayDimension_NoResolve> pass;
+                mutateAll( translationUnit, pass );
+        }
+        void HoistArrayDimension_NoResolve::premutate( ObjectDecl * objectDecl ) {
+                // record the storage classes of this object for use by hoist();
+                // GuardValue presumably restores the previous value once the node is left
+                GuardValue( storageClasses );
+                const auto declClasses = objectDecl->get_storageClasses();
+                storageClasses = declClasses;
+        }
+        DeclarationWithType * HoistArrayDimension_NoResolve::postmutate( ObjectDecl * objectDecl ) {
+                // the declaration itself is returned unchanged; only its type may be rewritten
+                Type * declType = objectDecl->get_type();
+                hoist( declType );
+                return objectDecl;
+        }
+        void HoistArrayDimension_NoResolve::hoist( Type * type ) {
+                // if in function, generate const size_t var
+                static UniqueName dimensionName( "_array_dim" );
+                // C doesn't allow variable sized arrays at global scope or for static variables, so don't hoist dimension.
+                if ( ! inFunction ) return;
+                if ( storageClasses.is_static ) return;
+                if ( ArrayType * arrayType = dynamic_cast< ArrayType * >( type ) ) {
+                        if ( ! arrayType->get_dimension() ) return; // xxx - recursive call to hoist?
+                        // don't need to hoist dimension if it's definitely pure - only need to if there's potential for side effects.
+                        // xxx - hoisting has no side effects anyways, so don't skip since we delay resolve
+                        // still try to detect constant expressions
+                        if ( ! Tuples::maybeImpure( arrayType->dimension ) ) return;
+                        ObjectDecl * arrayDimension = new ObjectDecl( dimensionName.newName(), storageClasses, LinkageSpec::C, 0, Validate::SizeType->clone(), new SingleInit( arrayType->get_dimension() ) );
+                        arrayDimension->get_type()->set_const( true );
+                        arrayType->set_dimension( new VariableExpr( arrayDimension ) );
+                        declsToAddBefore.push_back( arrayDimension );
+                        hoist( arrayType->get_base() );
+                        return;
+                }
+        }
+        void HoistArrayDimension_NoResolve::premutate( FunctionDecl * ) {
                 GuardValue( inFunction );
                 inFunction = true;
 …
+        }
+        // why is this not just on FunctionDecl?
         void ManagedTypes::handleDWT( DeclarationWithType * dwt ) {
                 // if this function is a user-defined constructor or destructor, mark down the type as "managed"
 …
         void ManagedTypes::endScope() { managedTypes.endScope(); }
+        bool ManagedTypes_new::isManaged( const ast::Type * type ) const {
+                // references are never constructed
+                if ( dynamic_cast< const ast::ReferenceType * >( type ) ) return false;
+                if ( auto tupleType = dynamic_cast< const ast::TupleType * > ( type ) ) {
+                        // tuple is also managed if any of its components are managed
+                        for (auto & component : tupleType->types) {
+                                if (isManaged(component)) return true;
+                        }
+                }
+                // need to clear and reset qualifiers when determining if a type is managed
+                // ValueGuard< Type::Qualifiers > qualifiers( type->get_qualifiers() );
+                auto tmp = shallowCopy(type);
+                tmp->qualifiers = {};
+                // delete tmp at return
+                ast::ptr<ast::Type> guard = tmp;
+                // a type is managed if it appears in the map of known managed types, or if it contains any polymorphism (is a type variable or generic type containing a type variable)
+                return managedTypes.find( Mangle::mangle( tmp, {Mangle::NoOverrideable | Mangle::NoGenericParams | Mangle::Type} ) ) != managedTypes.end() || GenPoly::isPolyType( tmp );
+        }
+        bool ManagedTypes_new::isManaged( const ast::ObjectDecl * objDecl ) const {
+                const ast::Type * type = objDecl->type;
+                while ( auto at = dynamic_cast< const ast::ArrayType * >( type ) ) {
+                        // must always construct VLAs with an initializer, since this is an error in C
+                        if ( at->isVarLen && objDecl->init ) return true;
+                        type = at->base;
+                }
+                return isManaged( type );
+        }
+        void ManagedTypes_new::handleDWT( const ast::DeclWithType * dwt ) {
+                // if this function is a user-defined constructor or destructor, mark down the type as "managed"
+                if ( ! dwt->linkage.is_overrideable && CodeGen::isCtorDtor( dwt->name ) ) {
+                        auto & params = GenPoly::getFunctionType( dwt->get_type())->params;
+                        assert( ! params.empty() );
+                        // Type * type = InitTweak::getPointerBase( params.front() );
+                        // assert( type );
+                        managedTypes.insert( Mangle::mangle( params.front(), {Mangle::NoOverrideable | Mangle::NoGenericParams | Mangle::Type} ) );
+                }
+        }
+        void ManagedTypes_new::handleStruct( const ast::StructDecl * aggregateDecl ) {
+                // don't construct members, but need to take note if there is a managed member,
+                // because that means that this type is also managed
+                for ( auto & member : aggregateDecl->members ) {
+                        if ( auto field = member.as<ast::ObjectDecl>() ) {
+                                if ( isManaged( field ) ) {
+                                        // generic parameters should not play a role in determining whether a generic type is constructed - construct all generic types, so that
+                                        // polymorphic constructors make generic types managed types
+                                        ast::StructInstType inst( aggregateDecl );
+                                        managedTypes.insert( Mangle::mangle( &inst, {Mangle::NoOverrideable | Mangle::NoGenericParams | Mangle::Type} ) );
+                                        break;
+                                }
+                        }
+                }
+        }
+        // scope handling is forwarded unchanged to the underlying scoped set
+        void ManagedTypes_new::beginScope() {
+                managedTypes.beginScope();
+        }
+        void ManagedTypes_new::endScope() {
+                managedTypes.endScope();
+        }
         ImplicitCtorDtorStmt * genCtorDtor( const std::string & fname, ObjectDecl * objDecl, Expression * arg ) {
                 // call into genImplicitCall from Autogen.h to generate calls to ctor/dtor
 …
                 assert( stmts.size() <= 1 );
                 return stmts.size() == 1 ? strict_dynamic_cast< ImplicitCtorDtorStmt * >( stmts.front() ) : nullptr;
+        }
+        /// New-AST variant: generates the implicit call to `fname`, with a variable expression
+        /// for objDecl as the destination and `arg` wrapped in an InitExpander_new as the source.
+        ast::ptr<ast::Stmt> genCtorDtor (const CodeLocation & loc, const std::string & fname, const ast::ObjectDecl * objDecl, const ast::Expr * arg) {
+                assertf(objDecl, "genCtorDtor passed null objDecl");
+                InitExpander_new srcParam(arg);
+                auto self = new ast::VariableExpr(loc, objDecl);
+                return SymTab::genImplicitCall(srcParam, self, loc, fname, objDecl);
+        }
 …
         // constructable object
         InitExpander_new srcParam{ objDecl->init }, nullParam{ (const ast::Init *)nullptr };
+        ast::ptr< ast::Expr > dstParam = new ast::VariableExpr(loc, objDecl);
         ast::ptr< ast::Stmt > ctor = SymTab::genImplicitCall(
                 srcParam, new ast::VariableExpr{ loc, objDecl }, loc, "?{}", objDecl );
+                srcParam, dstParam, loc, "?{}", objDecl );
         ast::ptr< ast::Stmt > dtor = SymTab::genImplicitCall(
                 nullParam, new ast::VariableExpr{ loc, objDecl }, loc, "^?{}", objDecl,
+                nullParam, dstParam, loc, "^?{}", objDecl,
                 SymTab::LoopBackward );

src/InitTweak/GenInit.h

-              r3c64c668
+              r58fe85a
         /// generates a single ctor/dtor statement using objDecl as the 'this' parameter and arg as the optional argument
         ImplicitCtorDtorStmt * genCtorDtor( const std::string & fname, ObjectDecl * objDecl, Expression * arg = nullptr );
+        ast::ptr<ast::Stmt> genCtorDtor (const CodeLocation & loc, const std::string & fname, const ast::ObjectDecl * objDecl, const ast::Expr * arg = nullptr);
         /// creates an appropriate ConstructorInit node which contains a constructor, destructor, and C-initializer
 …
                 GenPoly::ScopedSet< std::string > managedTypes;
         };
+        class ManagedTypes_new { // new-AST counterpart of ManagedTypes: scoped record of which types are "managed"
+        public:
+                bool isManaged( const ast::ObjectDecl * objDecl ) const ; // determine if the object's type (array layers peeled off) is managed
+                bool isManaged( const ast::Type * type ) const; // determine if type is managed, ignoring its qualifiers
+                void handleDWT( const ast::DeclWithType * dwt ); // add first parameter type to managed if dwt is a ctor/dtor
+                void handleStruct( const ast::StructDecl * aggregateDecl ); // add struct type to managed if a member is managed
+                void beginScope(); // forwards to managedTypes.beginScope()
+                void endScope(); // forwards to managedTypes.endScope()
+        private:
+                GenPoly::ScopedSet< std::string > managedTypes; // mangled names of the types currently known to be managed
+        };
 } // namespace

src/InitTweak/InitTweak.cc

-              r3c64c668
+              r58fe85a
                 };
+                struct HasDesignations_new : public ast::WithShortCircuiting {
+                        bool result = false;
+                        void previsit( const ast::Node * ) {
+                                // short circuit if we already know there are designations
+                                if ( result ) visit_children = false;
+                        }
+                        void previsit( const ast::Designation * des ) {
+                                // short circuit if we already know there are designations
+                                if ( result ) visit_children = false;
+                                else if ( ! des->designators.empty() ) {
+                                        result = true;
+                                        visit_children = false;
+                                }
+                        }
+                };
+                struct InitDepthChecker_new : public ast::WithGuards {
+                        bool result = true;
+                        const ast::Type * type;
+                        int curDepth = 0, maxDepth = 0;
+                        InitDepthChecker_new( const ast::Type * type ) : type( type ) {
+                                const ast::Type * t = type;
+                                while ( auto at = dynamic_cast< const ast::ArrayType * >( t ) ) {
+                                        maxDepth++;
+                                        t = at->base;
+                                }
+                                maxDepth++;
+                        }
+                        void previsit( ListInit * ) {
+                                curDepth++;
+                                GuardAction( [this]() { curDepth--; } );
+                                if ( curDepth > maxDepth ) result = false;
+                        }
+                };
                 struct InitFlattener_old : public WithShortCircuiting {
                         void previsit( SingleInit * singleInit ) {
 …
+        }
+        /// New-AST overload: runs HasDesignations_new over `init` via maybe_accept and
+        /// returns the flag it computed.
+        bool isDesignated( const ast::Init * init ) {
+                ast::Pass<HasDesignations_new> designationPass;
+                maybe_accept( init, designationPass );
+                const bool found = designationPass.core.result;
+                return found;
+        }
+        /// New-AST overload: runs InitDepthChecker_new, built from the object's type, over the
+        /// object's initializer and returns the checker's verdict.
+        bool checkInitDepth( const ast::ObjectDecl * objDecl ) {
+                ast::Pass<InitDepthChecker_new> depthPass( objDecl->type );
+                maybe_accept( objDecl->init.get(), depthPass );
+                const bool ok = depthPass.core.result;
+                return ok;
+        }
 std::vector< ast::ptr< ast::Expr > > makeInitList( const ast::Init * init ) {
         ast::Pass< InitFlattener_new > flattener;
         maybe_accept( init, flattener );
         return std::move( flattener.pass.argList );
+        return std::move( flattener.core.argList );
+}
 …
                         if ( auto listInit = dynamic_cast< const ast::ListInit * >( init ) ) {
                                 for ( const ast::Init * init : *listInit ) {
                                         buildCallExpr( callExpr, index, dimension, init, out );
+                                        buildCallExpr( shallowCopy(callExpr), index, dimension, init, out );
+                                }
                         } else {
                                 buildCallExpr( callExpr, index, dimension, init, out );
+                                buildCallExpr( shallowCopy(callExpr), index, dimension, init, out );
+                        }
                 } else {
 …
+        }
+        /// New-AST overload: returns the first parameter of `func` as an ObjectDecl.
+        /// Asserts that `func` is non-null and has at least one parameter.
+        const ast::ObjectDecl * getParamThis(const ast::FunctionDecl * func) {
+                assertf( func, "getParamThis: nullptr ftype" );
+                auto & params = func->params;
+                assertf( ! params.empty(), "getParamThis: ftype with 0 parameters: %s", toString( func ).c_str());
+                auto & first = params.front();
+                return first.strict_as<ast::ObjectDecl>();
+        }
         bool tryConstruct( DeclarationWithType * dwt ) {
                 ObjectDecl * objDecl = dynamic_cast< ObjectDecl * >( dwt );
 …
+        }
+        bool tryConstruct( const ast::DeclWithType * dwt ) {
+                auto objDecl = dynamic_cast< const ast::ObjectDecl * >( dwt );
+                if ( ! objDecl ) return false;
+                return (objDecl->init == nullptr ||
+                                ( objDecl->init != nullptr && objDecl->init->maybeConstructed ))
+                        && ! objDecl->storage.is_extern
+                        && isConstructable( objDecl->type );
+        }
+        /// New-AST overload: every type is constructable except varargs, reference and
+        /// function types, and ttypes.
+        bool isConstructable( const ast::Type * type ) {
+                const bool excluded =
+                        dynamic_cast< const ast::VarArgsType * >( type ) || dynamic_cast< const ast::ReferenceType * >( type )
+                        || dynamic_cast< const ast::FunctionType * >( type ) || Tuples::isTtype( type );
+                return ! excluded;
+        }
         struct CallFinder_old {
                 CallFinder_old( const std::list< std::string > & names ) : names( names ) {}
 …
         struct CallFinder_new final {
                 std::vector< ast::ptr< ast::Expr > > matches;
+                std::vector< const ast::Expr * > matches;
                 const std::vector< std::string > names;
 …
+        }
         std::vector< ast::ptr< ast::Expr > > collectCtorDtorCalls( const ast::Stmt * stmt ) {
+        std::vector< const ast::Expr * > collectCtorDtorCalls( const ast::Stmt * stmt ) {
                 ast::Pass< CallFinder_new > finder{ std::vector< std::string >{ "?{}", "^?{}" } };
                 maybe_accept( stmt, finder );
                 return std::move( finder.pass.matches );
+                return std::move( finder.core.matches );
+        }
 …
                 template <typename Predicate>
                 bool allofCtorDtor( const ast::Stmt * stmt, const Predicate & pred ) {
                         std::vector< ast::ptr< ast::Expr > > callExprs = collectCtorDtorCalls( stmt );
+                        std::vector< const ast::Expr * > callExprs = collectCtorDtorCalls( stmt );
                         return std::all_of( callExprs.begin(), callExprs.end(), pred );
+                }
 …
+        }
+        // looks like some other such codegen uses UntypedExpr and does not create fake function. should revisit afterwards
+        // following passes may accidentally resolve this expression if returned as untyped...
+        ast::Expr * createBitwiseAssignment (const ast::Expr * dst, const ast::Expr * src) {
+                static ast::ptr<ast::FunctionDecl> assign = nullptr;
+                if (!assign) {
+                        auto td = new ast::TypeDecl({}, "T", {}, nullptr, ast::TypeDecl::Dtype, true);
+                        assign = new ast::FunctionDecl({}, "?=?", {},
+                        { new ast::ObjectDecl({}, "_dst", new ast::ReferenceType(new ast::TypeInstType("T", td))),
+                          new ast::ObjectDecl({}, "_src", new ast::TypeInstType("T", td))},
+                        { new ast::ObjectDecl({}, "_ret", new ast::TypeInstType("T", td))}, nullptr, {}, ast::Linkage::Intrinsic);
+                }
+                if (dst->result.as<ast::ReferenceType>()) {
+                        for (int depth = dst->result->referenceDepth(); depth > 0; depth--) {
+                                dst = new ast::AddressExpr(dst);
+                        }
+                }
+                else {
+                        dst = new ast::CastExpr(dst, new ast::ReferenceType(dst->result, {}));
+                }
+                if (src->result.as<ast::ReferenceType>()) {
+                        for (int depth = src->result->referenceDepth(); depth > 0; depth--) {
+                                src = new ast::AddressExpr(src);
+                        }
+                }
+                return new ast::ApplicationExpr(dst->location, ast::VariableExpr::functionPointer(dst->location, assign), {dst, src});
+        }
         struct ConstExprChecker : public WithShortCircuiting {
                 // most expressions are not const expr
 …
         };
+        struct ConstExprChecker_new : public ast::WithShortCircuiting {
+                // most expressions are not const expr
+                void previsit( const ast::Expr * ) { result = false; visit_children = false; }
+                void previsit( const ast::AddressExpr *addressExpr ) {
+                        visit_children = false;
+                        const ast::Expr * arg = addressExpr->arg;
+                        // address of a variable or member expression is constexpr
+                        if ( ! dynamic_cast< const ast::NameExpr * >( arg )
+                        && ! dynamic_cast< const ast::VariableExpr * >( arg )
+                        && ! dynamic_cast< const ast::MemberExpr * >( arg )
+                        && ! dynamic_cast< const ast::UntypedMemberExpr * >( arg ) ) result = false;
+                }
+                // these expressions may be const expr, depending on their children
+                void previsit( const ast::SizeofExpr * ) {}
+                void previsit( const ast::AlignofExpr * ) {}
+                void previsit( const ast::UntypedOffsetofExpr * ) {}
+                void previsit( const ast::OffsetofExpr * ) {}
+                void previsit( const ast::OffsetPackExpr * ) {}
+                void previsit( const ast::CommaExpr * ) {}
+                void previsit( const ast::LogicalExpr * ) {}
+                void previsit( const ast::ConditionalExpr * ) {}
+                void previsit( const ast::CastExpr * ) {}
+                void previsit( const ast::ConstantExpr * ) {}
+                void previsit( const ast::VariableExpr * varExpr ) {
+                        visit_children = false;
+                        if ( auto inst = varExpr->result.as<ast::EnumInstType>() ) {
+                                long long int value;
+                                if ( inst->base->valueOf( varExpr->var, value ) ) {
+                                        // enumerators are const expr
+                                        return;
+                                }
+                        }
+                        result = false;
+                }
+                bool result = true;
+        };
         bool isConstExpr( Expression * expr ) {
                 if ( expr ) {
 …
+        }
+        bool isConstExpr( const ast::Expr * expr ) {
+                if ( expr ) {
+                        ast::Pass<ConstExprChecker_new> checker;
+                        expr->accept( checker );
+                        return checker.core.result;
+                }
+                return true;
+        }
+        bool isConstExpr( const ast::Init * init ) {
+                if ( init ) {
+                        ast::Pass<ConstExprChecker_new> checker;
+                        init->accept( checker );
+                        return checker.core.result;
+                } // if
+                // for all intents and purposes, no initializer means const expr
+                return true;
+        }
         bool isConstructor( const std::string & str ) { return str == "?{}"; }
         bool isDestructor( const std::string & str ) { return str == "^?{}"; }
 …
                 if ( ftype->params.size() != 2 ) return false;
                 const ast::Type * t1 = getPointerBase( ftype->params.front()->get_type() );
+                const ast::Type * t1 = getPointerBase( ftype->params.front() );
                 if ( ! t1 ) return false;
                 const ast::Type * t2 = ftype->params.back()->get_type();
+                const ast::Type * t2 = ftype->params.back();
                 return ResolvExpr::typesCompatibleIgnoreQualifiers( t1, t2, ast::SymbolTable{} );
 …
                 return isCopyFunction( decl, "?{}" );
+        }
+        void addDataSectonAttribute( ObjectDecl * objDecl ) {
+                Type *strLitT = new PointerType( Type::Qualifiers( ),
+                        new BasicType( Type::Qualifiers( ), BasicType::Char ) );
+                std::list< Expression * > attr_params;
+                attr_params.push_back(
+                        new ConstantExpr( Constant( strLitT, "\".data#\"", std::nullopt ) ) );
+                objDecl->attributes.push_back(new Attribute("section", attr_params));
+        }
+        /// New-AST version: appends the attribute `section` with the single argument
+        /// "\".data#\"" (a constant of type pointer-to-char, located at the object) to objDecl.
+        void addDataSectionAttribute( ast::ObjectDecl * objDecl ) {
+                auto charPtrT = new ast::PointerType(new ast::BasicType(ast::BasicType::Char));
+                auto sectionName = new ast::ConstantExpr(objDecl->location, charPtrT, "\".data#\"", std::nullopt);
+                objDecl->attributes.push_back(new ast::Attribute("section", {sectionName}));
+        }
+}

src/InitTweak/InitTweak.h

-              r3c64c668
+              r58fe85a
         /// returns the first parameter of a constructor/destructor/assignment function
         ObjectDecl * getParamThis( FunctionType * ftype );
+        const ast::ObjectDecl * getParamThis(const ast::FunctionDecl * func);
         /// generate a bitwise assignment operation.
         ApplicationExpr * createBitwiseAssignment( Expression * dst, Expression * src );
+        ast::Expr * createBitwiseAssignment( const ast::Expr * dst, const ast::Expr * src);
         /// transform Initializer into an argument list that can be passed to a call expression
 …
         /// True if the resolver should try to construct dwt
         bool tryConstruct( DeclarationWithType * dwt );
+        bool tryConstruct( const ast::DeclWithType * dwt );
         /// True if the type can have a user-defined constructor
         bool isConstructable( Type * t );
+        bool isConstructable( const ast::Type * t );
         /// True if the Initializer contains designations
         bool isDesignated( Initializer * init );
+        bool isDesignated( const ast::Init * init );
         /// True if the ObjectDecl's Initializer nesting level is not deeper than the depth of its
         /// type, where the depth of its type is the number of nested ArrayTypes + 1
         bool checkInitDepth( ObjectDecl * objDecl );
+        bool checkInitDepth( const ast::ObjectDecl * objDecl );
         /// returns the declaration of the function called by the expr (must be ApplicationExpr or UntypedExpr)
 …
         /// get all Ctor/Dtor call expressions from a Statement
         void collectCtorDtorCalls( Statement * stmt, std::list< Expression * > & matches );
         std::vector< ast::ptr< ast::Expr > > collectCtorDtorCalls( const ast::Stmt * stmt );
+        std::vector< const ast::Expr * > collectCtorDtorCalls( const ast::Stmt * stmt );
         /// get the Ctor/Dtor call expression from a Statement that looks like a generated ctor/dtor call
 …
         bool isConstExpr( Expression * expr );
         bool isConstExpr( Initializer * init );
+        bool isConstExpr( const ast::Expr * expr );
+        bool isConstExpr( const ast::Init * init );
+        /// Modifies objDecl to have:
+        ///    __attribute__((section (".data#")))
+        /// which makes gcc put the declared variable in the data section,
+        /// which is helpful for global constants on newer gcc versions,
+        /// so that CFA's generated initialization won't segfault when writing it via a const cast.
+        /// The trailing # is an injected assembly comment, to suppress the "a" in
+        ///    .section .data,"a"
+        ///    .section .data#,"a"
+        /// to avoid assembler warning "ignoring changed section attributes for .data"
+        void addDataSectonAttribute( ObjectDecl * objDecl );
+        void addDataSectionAttribute( ast::ObjectDecl * objDecl );
         class InitExpander_old {

src/InitTweak/module.mk

-              r3c64c668
+              r58fe85a
 ###############################################################################
+SRC += InitTweak/GenInit.cc \
+SRC += \
+        InitTweak/FixGlobalInit.cc \
+        InitTweak/FixGlobalInit.h \
         InitTweak/FixInit.cc \
+        InitTweak/FixGlobalInit.cc \
+        InitTweak/InitTweak.cc
+        InitTweak/FixInit.h \
+        InitTweak/GenInit.cc \
+        InitTweak/GenInit.h \
+        InitTweak/InitTweak.cc \
+        InitTweak/InitTweak.h \
+        InitTweak/FixInitNew.cpp
+SRCDEMANGLE += InitTweak/GenInit.cc \
+        InitTweak/InitTweak.cc
+SRCDEMANGLE += \
+        InitTweak/GenInit.cc \
+        InitTweak/GenInit.h \
+        InitTweak/InitTweak.cc \
+        InitTweak/InitTweak.h

src/Makefile.am

-              r3c64c668
+              r58fe85a
 SRC = main.cc \
+      CompilationState.cc \
+      CompilationState.h \
       MakeLibCfa.cc \
+      CompilationState.cc
+        MakeLibCfa.h
 SRCDEMANGLE = CompilationState.cc
 …
 ___driver_cfa_cpp_SOURCES = $(SRC)
 ___driver_cfa_cpp_LDADD = -ldl $(LIBPROFILER) $(LIBTCMALLOC)
+EXTRA_DIST = include/cassert include/optional BasicTypes-gen.cc
 AM_CXXFLAGS = @HOST_FLAGS@ -Wno-deprecated -Wall -Wextra -DDEBUG_ALL -I./Parser -I$(srcdir)/Parser -I$(srcdir)/include -DYY_NO_INPUT -O3 -g -std=c++14 $(TCMALLOCFLAG)

src/Parser/DeclarationNode.cc

-              r3c64c668
+              r58fe85a
 // Created On       : Sat May 16 12:34:05 2015
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Mon Dec 16 15:32:22 2019
 // Update Count     : 1133
+// Last Modified On : Thu Oct  8 08:03:38 2020
+// Update Count     : 1135
 //
 …
                         if ( DeclarationWithType * dwt = dynamic_cast< DeclarationWithType * >( decl ) ) {
                                 dwt->location = cur->location;
                                 * out++ = dwt;
+                                *out++ = dwt;
                         } else if ( StructDecl * agg = dynamic_cast< StructDecl * >( decl ) ) {
                                 // e.g., int foo(struct S) {}
 …
                                 auto obj = new ObjectDecl( "", Type::StorageClasses(), linkage, nullptr, inst, nullptr );
                                 obj->location = cur->location;
                                 * out++ = obj;
+                                *out++ = obj;
                                 delete agg;
                         } else if ( UnionDecl * agg = dynamic_cast< UnionDecl * >( decl ) ) {
 …
                                 auto obj = new ObjectDecl( "", Type::StorageClasses(), linkage, nullptr, inst, nullptr );
                                 obj->location = cur->location;
                                 * out++ = obj;
+                                *out++ = obj;
                         } else if ( EnumDecl * agg = dynamic_cast< EnumDecl * >( decl ) ) {
                                 // e.g., int foo(enum E) {}
 …
                                 auto obj = new ObjectDecl( "", Type::StorageClasses(), linkage, nullptr, inst, nullptr );
                                 obj->location = cur->location;
                                 * out++ = obj;
+                                *out++ = obj;
                         } // if
                 } catch( SemanticErrorException & e ) {
 …
         // SUE's cannot have function specifiers, either
         //
         //    inlne _Noreturn struct S { ... };         // disallowed
         //    inlne _Noreturn enum   E { ... };         // disallowed
+        //    inline _Noreturn struct S { ... };                // disallowed
+        //    inline _Noreturn enum   E { ... };                // disallowed
         if ( funcSpecs.any() ) {
                 SemanticError( this, "invalid function specifier for " );

src/Parser/ExpressionNode.cc

-              r3c64c668
+              r58fe85a
 // Created On       : Sat May 16 13:17:07 2015
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Wed Dec 18 21:14:58 2019
 // Update Count     : 981
+// Last Modified On : Thu Aug 20 14:01:46 2020
+// Update Count     : 1076
 //
 …
 void lnthSuffix( string & str, int & type, int & ltype ) {
+        // 'u' can appear before or after length suffix
         string::size_type posn = str.find_last_of( "lL" );
         if ( posn == string::npos ) return;                                     // no suffix
+        if ( posn == str.length() - 1 ) { type = 3; return; } // no length => long
+        size_t end = str.length() - 1;
+        if ( posn == end ) { type = 3; return; }                        // no length after 'l' => long
         string::size_type next = posn + 1;                                      // advance to length
         if ( str[next] == '3' ) {                                                       // 32
 …
                 } // if
         } // if
+        // remove "lL" for these cases because it may not imply long
+        str.erase( posn );                                                                      // remove length
+        char fix = '\0';
+        if ( str[end] == 'u' || str[end] == 'U' ) fix = str[end]; // ends with 'uU' ?
+        str.erase( posn );                                                                      // remove length suffix and possibly uU
+        if ( type == 5 ) {                                                                      // L128 does not need uU
+                end = str.length() - 1;
+                if ( str[end] == 'u' || str[end] == 'U' ) str.erase( end ); // ends with 'uU' ? remove
+        } else if ( fix != '\0' ) str += fix;                           // put 'uU' back if removed
 } // lnthSuffix
 …
 } // valueToType
+/// Convert the binary digits of an integer constant into its numeric value.
+///
+/// str : text of the constant; the first two characters are skipped as the radix prefix (e.g., "0b").
+/// v   : set to the value of the run of '0'/'1' characters following the prefix. Scanning stops at the
+///       first character that is not a binary digit (such as a trailing suffix) or at the end of the
+///       string; v is 0 when no digit follows the prefix.
+static void scanbin( string & str, unsigned long long int & v ) {
+        v = 0;
+        // Termination is decided only by the characters scanned. The former extra test "i == last - 1"
+        // stopped two characters before the end of the string, which dropped the low-order digits of any
+        // unsuffixed constant with three or more digits, e.g., "0b101" produced 1 instead of 5.
+        for ( size_t i = 2; i < str.length() && (str[i] == '0' || str[i] == '1'); i += 1 ) {
+                v = (v << 1) | (str[i] == '1' ? 1 : 0);         // shift in next digit
+        } // for
+} // scanbin
 Expression * build_constantInteger( string & str ) {
         static const BasicType::Kind kind[2][6] = {
                 // short (h) must be before char (hh) because shorter type has the longer suffix
                 { BasicType::ShortSignedInt, BasicType::SignedChar, BasicType::SignedInt, BasicType::LongSignedInt, BasicType::LongLongSignedInt, BasicType::SignedInt128, },
                 { BasicType::ShortUnsignedInt, BasicType::UnsignedChar, BasicType::UnsignedInt, BasicType::LongUnsignedInt, BasicType::LongLongUnsignedInt, BasicType::UnsignedInt128, },
+                { BasicType::ShortSignedInt, BasicType::SignedChar, BasicType::SignedInt, BasicType::LongSignedInt, BasicType::LongLongSignedInt, /* BasicType::SignedInt128 */ BasicType::LongLongSignedInt, },
+                { BasicType::ShortUnsignedInt, BasicType::UnsignedChar, BasicType::UnsignedInt, BasicType::LongUnsignedInt, BasicType::LongLongUnsignedInt, /* BasicType::UnsignedInt128 */ BasicType::LongLongUnsignedInt, },
         };
 …
         }; // lnthsInt
+        unsigned long long int v;                                                       // converted integral value
+        size_t last = str.length() - 1;                                         // last subscript of constant
+        Expression * ret;
+        //string fred( str );
+        string str2( "0x0" );
+        unsigned long long int v, v2 = 0;                                       // converted integral value
+        Expression * ret, * ret2;
         int type = -1;                                                                          // 0 => short, 1 => char, 2 => int, 3 => long int, 4 => long long int, 5 => int128
 …
         } // if
+        string::size_type posn;
+        // 'u' can appear before or after length suffix
+        if ( str.find_last_of( "uU" ) != string::npos ) Unsigned = true;
+        if ( isdigit( str[str.length() - 1] ) ) {                       // no suffix ?
+                lnthSuffix( str, type, ltype );                                 // could have length suffix
+        } else {
+                // At least one digit in integer constant, so safe to backup while looking for suffix.
+                posn = str.find_last_of( "pP" );                                // pointer value
+                if ( posn != string::npos ) { ltype = 5; str.erase( posn, 1 ); goto FINI; }
+                posn = str.find_last_of( "zZ" );                                // size_t
+                if ( posn != string::npos ) { Unsigned = true; type = 2; ltype = 4; str.erase( posn, 1 ); goto FINI; }
+                posn = str.rfind( "hh" );                                               // char
+                if ( posn != string::npos ) { type = 1; str.erase( posn, 2 ); goto FINI; }
+                posn = str.rfind( "HH" );                                               // char
+                if ( posn != string::npos ) { type = 1; str.erase( posn, 2 ); goto FINI; }
+                posn = str.find_last_of( "hH" );                                // short
+                if ( posn != string::npos ) { type = 0; str.erase( posn, 1 ); goto FINI; }
+                posn = str.find_last_of( "nN" );                                // int (natural number)
+                if ( posn != string::npos ) { type = 2; str.erase( posn, 1 ); goto FINI; }
+                if ( str.rfind( "ll" ) != string::npos || str.rfind( "LL" ) != string::npos ) { type = 4; goto FINI; }
+                lnthSuffix( str, type, ltype );                                 // must be after check for "ll"
+          FINI: ;
+        } // if
         // Cannot be just "0"/"1"; sscanf stops at the suffix, if any; value goes over the wall => always generate
+#if ! defined(__SIZEOF_INT128__)
+        if ( type == 5 ) SemanticError( yylloc, "int128 constant is not supported on this target " + str );
+#endif // ! __SIZEOF_INT128__
         if ( str[0] == '0' ) {                                                          // radix character ?
                 dec = false;
                 if ( checkX( str[1] ) ) {                                               // hex constant ?
+                        sscanf( (char *)str.c_str(), "%llx", &v );
+                        if ( type < 5 ) {                                                       // not L128 ?
+                                sscanf( (char *)str.c_str(), "%llx", &v );
+#if defined(__SIZEOF_INT128__)
+                        } else {                                                                        // hex int128 constant
+                                unsigned int len = str.length();
+                                if ( len > (2 + 16 + 16) ) SemanticError( yylloc, "128-bit hexadecimal constant to large " + str );
+                          if ( len <= (2 + 16) ) goto FHEX1;            // hex digits < 2^64
+                                str2 = "0x" + str.substr( len - 16 );
+                                sscanf( (char *)str2.c_str(), "%llx", &v2 );
+                                str = str.substr( 0, len - 16 );
+                          FHEX1: ;
+                                sscanf( (char *)str.c_str(), "%llx", &v );
+#endif // __SIZEOF_INT128__
+                        } // if
                         //printf( "%llx %llu\n", v, v );
                 } else if ( checkB( str[1] ) ) {                                // binary constant ?
+                        v = 0;                                                                          // compute value
+                        for ( unsigned int i = 2;; ) {                          // ignore prefix
+                                if ( str[i] == '1' ) v |= 1;
+                                i += 1;
+                          if ( i == last - 1 || (str[i] != '0' && str[i] != '1') ) break;
+                                v <<= 1;
+                        } // for
+#if defined(__SIZEOF_INT128__)
+                        unsigned int len = str.length();
+                        if ( type == 5 && len > 2 + 64 ) {
+                                if ( len > 2 + 64 + 64 ) SemanticError( yylloc, "128-bit binary constant to large " + str );
+                                str2 = "0b" + str.substr( len - 64 );
+                                str = str.substr( 0, len - 64 );
+                                scanbin( str2, v2 );
+                        } // if
+#endif // __SIZEOF_INT128__
+                        scanbin( str, v );
                         //printf( "%#llx %llu\n", v, v );
                 } else {                                                                                // octal constant
+                        sscanf( (char *)str.c_str(), "%llo", &v );
+                        if ( type < 5 ) {                                                       // not L128 ?
+                                sscanf( (char *)str.c_str(), "%llo", &v );
+#if defined(__SIZEOF_INT128__)
+                        } else {                                                                        // octal int128 constant
+                                unsigned int len = str.length();
+                                if ( len > 1 + 43 || (len == 1 + 43 && str[0] > '3') ) SemanticError( yylloc, "128-bit octal constant to large " + str );
+                                char buf[32];
+                                if ( len <= 1 + 21 ) {                                  // value < 21 octal digitis
+                                        sscanf( (char *)str.c_str(), "%llo", &v );
+                                } else {
+                                        sscanf( &str[len - 21], "%llo", &v );
+                                        __int128 val = v;                                       // accumulate bits
+                                        str[len - 21] ='\0';                            // shorten string
+                                        sscanf( &str[len == 43 ? 1 : 0], "%llo", &v );
+                                        val |= (__int128)v << 63;                       // store bits
+                                        if ( len == 1 + 43 ) {                          // most significant 2 bits ?
+                                                str[2] = '\0';                                  // shorten string
+                                                sscanf( &str[1], "%llo", &v );  // process most significant 2 bits
+                                                val |= (__int128)v << 126;              // store bits
+                                        } // if
+                                        v = val >> 64; v2 = (uint64_t)val;      // replace octal constant with 2 hex constants
+                                        sprintf( buf, "%#llx", v2 );
+                                        str2 = buf;
+                                } // if
+                                sprintf( buf, "%#llx", v );
+                                str = buf;
+#endif // __SIZEOF_INT128__
+                        } // if
                         //printf( "%#llo %llu\n", v, v );
                 } // if
         } else {                                                                                        // decimal constant ?
+                sscanf( (char *)str.c_str(), "%llu", &v );
+                if ( type < 5 ) {                                                               // not L128 ?
+                        sscanf( (char *)str.c_str(), "%llu", &v );
+#if defined(__SIZEOF_INT128__)
+                } else {                                                                                // decimal int128 constant
+                        #define P10_UINT64 10'000'000'000'000'000'000ULL // 19 zeroes
+                        unsigned int len = str.length();
+                        if ( str.length() == 39 && str > (Unsigned ? "340282366920938463463374607431768211455" : "170141183460469231731687303715884105727") )
+                                SemanticError( yylloc, "128-bit decimal constant to large " + str );
+                        char buf[32];
+                        if ( len <= 19 ) {                                                      // value < 19 decimal digitis
+                                sscanf( (char *)str.c_str(), "%llu", &v );
+                        } else {
+                                sscanf( &str[len - 19], "%llu", &v );
+                                __int128 val = v;                                               // accumulate bits
+                                str[len - 19] ='\0';                                    // shorten string
+                                sscanf( &str[len == 39 ? 1 : 0], "%llu", &v );
+                                val += (__int128)v * (__int128)P10_UINT64; // store bits
+                                if ( len == 39 ) {                                              // most significant 2 bits ?
+                                        str[1] = '\0';                                          // shorten string
+                                        sscanf( &str[0], "%llu", &v );          // process most significant 2 bits
+                                        val += (__int128)v * (__int128)P10_UINT64 * (__int128)P10_UINT64; // store bits
+                                } // if
+                                v = val >> 64; v2 = (uint64_t)val;              // replace decimal constant with 2 hex constants
+                                sprintf( buf, "%#llx", v2 );
+                                str2 = buf;
+                        } // if
+                        sprintf( buf, "%#llx", v );
+                        str = buf;
+#endif // __SIZEOF_INT128__
+                } // if
                 //printf( "%llu\n", v );
         } // if
+        string::size_type posn;
+        if ( isdigit( str[last] ) ) {                                           // no suffix ?
+                lnthSuffix( str, type, ltype );                                 // could have length suffix
+                if ( type == -1 ) {                                                             // no suffix
+                        valueToType( v, dec, type, Unsigned );
+                } // if
+        } else {
+                // At least one digit in integer constant, so safe to backup while looking for suffix.
+                posn = str.find_last_of( "pP" );
+                if ( posn != string::npos ) { valueToType( v, dec, type, Unsigned ); ltype = 5; str.erase( posn, 1 ); goto FINI; }
+                posn = str.find_last_of( "zZ" );
+                if ( posn != string::npos ) { Unsigned = true; type = 2; ltype = 4; str.erase( posn, 1 ); goto FINI; }
+                // 'u' can appear before or after length suffix
+                if ( str.find_last_of( "uU" ) != string::npos ) Unsigned = true;
+                posn = str.rfind( "hh" );
+                if ( posn != string::npos ) { type = 1; str.erase( posn, 2 ); goto FINI; }
+                posn = str.rfind( "HH" );
+                if ( posn != string::npos ) { type = 1; str.erase( posn, 2 ); goto FINI; }
+                posn = str.find_last_of( "hH" );
+                if ( posn != string::npos ) { type = 0; str.erase( posn, 1 ); goto FINI; }
+                posn = str.find_last_of( "nN" );
+                if ( posn != string::npos ) { type = 2; str.erase( posn, 1 ); goto FINI; }
+                if ( str.rfind( "ll" ) != string::npos || str.rfind( "LL" ) != string::npos ) { type = 4; goto FINI; }
+                lnthSuffix( str, type, ltype );                                 // must be after check for "ll"
+                if ( type == -1 ) {                                                             // only 'u' suffix ?
+                        valueToType( v, dec, type, Unsigned );
+                } // if
+          FINI: ;
+        } // if
+        if ( type == -1 ) {                                                                     // no suffix => determine type from value size
+                valueToType( v, dec, type, Unsigned );
+        } // if
+        /* printf( "%s %llo %s %llo\n", str.c_str(), v, str2.c_str(), v2 ); */
         //if ( !( 0 <= type && type <= 6 ) ) { printf( "%s %lu %d %s\n", fred.c_str(), fred.length(), type, str.c_str() ); }
 …
         } else if ( ltype != -1 ) {                                                     // explicit length ?
                 if ( ltype == 6 ) {                                                             // int128, (int128)constant
+                        ret = new CastExpr( ret, new BasicType( Type::Qualifiers(), kind[Unsigned][type] ), false );
+//                      ret = new CastExpr( ret, new BasicType( Type::Qualifiers(), kind[Unsigned][type] ), false );
+                        ret2 = new ConstantExpr( Constant( new BasicType( noQualifiers, BasicType::LongLongSignedInt ), str2, v2 ) );
+                        ret = build_compoundLiteral( DeclarationNode::newBasicType( DeclarationNode::Int128 )->addType( DeclarationNode::newSignedNess( DeclarationNode::Unsigned ) ),
+                                                                                 new InitializerNode( (InitializerNode *)(new InitializerNode( new ExpressionNode( v2 == 0 ? ret2 : ret ) ))->set_last( new InitializerNode( new ExpressionNode( v2 == 0 ? ret : ret2 ) ) ), true ) );
                 } else {                                                                                // explicit length, (length_type)constant
                         ret = new CastExpr( ret, new TypeInstType( Type::Qualifiers(), lnthsInt[Unsigned][ltype], false ), false );
 …
                 if ( str[1] == '8' ) goto Default;                              // utf-8 characters => array of char
                 // lookup type of associated typedef
                 strtype = new TypeInstType( Type::Qualifiers( Type::Const ), "char16_t", false );
+                strtype = new TypeInstType( Type::Qualifiers( ), "char16_t", false );
                 break;
           case 'U':
                 strtype = new TypeInstType( Type::Qualifiers( Type::Const ), "char32_t", false );
+                strtype = new TypeInstType( Type::Qualifiers( ), "char32_t", false );
                 break;
           case 'L':
                 strtype = new TypeInstType( Type::Qualifiers( Type::Const ), "wchar_t", false );
+                strtype = new TypeInstType( Type::Qualifiers( ), "wchar_t", false );
                 break;
           Default:                                                                                      // char default string type
           default:
                 strtype = new BasicType( Type::Qualifiers( Type::Const ), BasicType::Char );
+                strtype = new BasicType( Type::Qualifiers( ), BasicType::Char );
         } // switch
         ArrayType * at = new ArrayType( noQualifiers, strtype,

src/Parser/ParseNode.h

-              r3c64c668
+              r58fe85a
 // Created On       : Sat May 16 13:28:16 2015
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Fri Feb  7 17:56:02 2020
 // Update Count     : 891
+// Last Modified On : Sat Oct 24 03:53:54 2020
+// Update Count     : 895
 //
 …
 class Attribute;
 class Declaration;
 class DeclarationNode;
+struct DeclarationNode;
 class DeclarationWithType;
 class ExpressionNode;
 class Initializer;
 class StatementNode;
+struct StatementNode;
 //##############################################################################
 …
 class InitializerNode : public ParseNode {
   public:
         InitializerNode( ExpressionNode *, bool aggrp = false,  ExpressionNode * des = nullptr );
+        InitializerNode( ExpressionNode *, bool aggrp = false, ExpressionNode * des = nullptr );
         InitializerNode( InitializerNode *, bool aggrp = false, ExpressionNode * des = nullptr );
         InitializerNode( bool isDelete );
 …
 struct TypeData;
+class DeclarationNode : public ParseNode {
+  public:
+struct DeclarationNode : public ParseNode {
         // These enumerations must harmonize with their names in DeclarationNode.cc.
         enum BasicType { Void, Bool, Char, Int, Int128,
 …
         bool get_inLine() const { return inLine; }
         DeclarationNode * set_inLine( bool inL ) { inLine = inL; return this; }
+  public:
         DeclarationNode * get_last() { return (DeclarationNode *)ParseNode::get_last(); }
 …
 //##############################################################################
+class StatementNode final : public ParseNode {
+  public:
+struct StatementNode final : public ParseNode {
         StatementNode() { stmt = nullptr; }
         StatementNode( Statement * stmt ) : stmt( stmt ) {}
 …
                 os << stmt.get() << std::endl;
+        }
+  private:
         std::unique_ptr<Statement> stmt;
 }; // StatementNode
 …
 Statement * build_finally( StatementNode * stmt );
 Statement * build_compound( StatementNode * first );
+StatementNode * maybe_build_compound( StatementNode * first );
 Statement * build_asm( bool voltile, Expression * instruction, ExpressionNode * output = nullptr, ExpressionNode * input = nullptr, ExpressionNode * clobber = nullptr, LabelNode * gotolabels = nullptr );
 Statement * build_directive( std::string * directive );
+SuspendStmt * build_suspend( StatementNode *, SuspendStmt::Type = SuspendStmt::None);
 WaitForStmt * build_waitfor( ExpressionNode * target, StatementNode * stmt, ExpressionNode * when );
 WaitForStmt * build_waitfor( ExpressionNode * target, StatementNode * stmt, ExpressionNode * when, WaitForStmt * existing );

src/Parser/StatementNode.cc

-              r3c64c668
+              r58fe85a
 // Created On       : Sat May 16 14:59:41 2015
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Sat Aug  4 09:39:25 2018
 // Update Count     : 363
+// Last Modified On : Sat Oct 24 04:20:55 2020
+// Update Count     : 383
 //
 …
 } // build_finally
+// Build a SuspendStmt of the requested kind.
+//   then : statement node(s) converted through buildMoveList; when the conversion yields a statement, it becomes
+//          the suspend's "then" body (exactly one statement is expected).
+//   type : stored unchanged in the new node's "type" field.
+// Returns the newly allocated SuspendStmt.
+SuspendStmt * build_suspend( StatementNode * then, SuspendStmt::Type type ) {
+        SuspendStmt * suspend = new SuspendStmt();
+        suspend->type = type;
+
+        std::list< Statement * > body;
+        buildMoveList< Statement, StatementNode >( then, body );
+        if ( body.empty() ) return suspend;                     // no body => "then" left as constructed
+
+        assert( body.size() == 1 );                             // a body is a single statement
+        suspend->then = dynamic_cast< CompoundStmt * >( body.front() ); // null if it is not a CompoundStmt
+        return suspend;
+} // build_suspend
 WaitForStmt * build_waitfor( ExpressionNode * targetExpr, StatementNode * stmt, ExpressionNode * when ) {
         auto node = new WaitForStmt();
 …
 } // build_compound
+// The body of a control structure is normalized to a compound statement: a lone statement is wrapped so that
+// subsequently generated code can be placed inside the body. Without this, code generation would repeatedly have to
+// test for a single statement and wrap it before inserting additional code. Hence, all control structures have a
+// canonical form for code generation.
+StatementNode * maybe_build_compound( StatementNode * first ) {
+        // Optimization: a body that is already a compound statement, e.g., if (...) {...}, is returned as is rather
+        // than being wrapped a second time.
+        if ( dynamic_cast<CompoundStmt *>( first->stmt.get() ) ) return first; // stmt is a unique_ptr
+
+        CompoundStmt * wrapper = new CompoundStmt();
+        buildMoveList( first, wrapper->get_kids() );            // fill the wrapper's kids from first
+        return new StatementNode( wrapper );
+} // maybe_build_compound
 Statement * build_asm( bool voltile, Expression * instruction, ExpressionNode * output, ExpressionNode * input, ExpressionNode * clobber, LabelNode * gotolabels ) {
         std::list< Expression * > out, in;

src/Parser/TypeData.cc

-              r3c64c668
+              r58fe85a
           case AggregateDecl::Struct:
           case AggregateDecl::Coroutine:
+          case AggregateDecl::Generator:
           case AggregateDecl::Monitor:
           case AggregateDecl::Thread:
 …
                 ret = new TypeDecl( name, scs, typebuild( td->base ), TypeDecl::Dtype, true );
         } // if
-        buildList( td->symbolic.params, ret->get_parameters() );
         buildList( td->symbolic.assertions, ret->get_assertions() );
         ret->base->attributes.splice( ret->base->attributes.end(), attributes );

src/Parser/lex.ll

-              r3c64c668
+              r58fe85a
  * Created On       : Sat Sep 22 08:58:10 2001
  * Last Modified By : Peter A. Buhr
  * Last Modified On : Sat Feb 15 11:05:50 2020
  * Update Count     : 737
+ * Last Modified On : Tue Oct  6 18:15:41 2020
+ * Update Count     : 743
  */
 …
 #define IDENTIFIER_RETURN()     RETURN_VAL( typedefTable.isKind( yytext ) )
 #ifdef HAVE_KEYWORDS_FLOATXX                                                            // GCC >= 7 => keyword, otherwise typedef
+#ifdef HAVE_KEYWORDS_FLOATXX                                                    // GCC >= 7 => keyword, otherwise typedef
 #define FLOATXX(v) KEYWORD_RETURN(v);
 #else
 #define FLOATXX(v) IDENTIFIER_RETURN();
+#define FLOATXX(v) IDENTIFIER_RETURN();
 #endif // HAVE_KEYWORDS_FLOATXX
 …
 __restrict__    { KEYWORD_RETURN(RESTRICT); }                   // GCC
 return                  { KEYWORD_RETURN(RETURN); }
         /* resume                       { KEYWORD_RETURN(RESUME); }                             // CFA */
+ /* resume                      { KEYWORD_RETURN(RESUME); }                             // CFA */
 short                   { KEYWORD_RETURN(SHORT); }
 signed                  { KEYWORD_RETURN(SIGNED); }
 …
 _Static_assert  { KEYWORD_RETURN(STATICASSERT); }               // C11
 struct                  { KEYWORD_RETURN(STRUCT); }
+        /* suspend                      { KEYWORD_RETURN(SUSPEND); }                    // CFA */
+suspend                 { KEYWORD_RETURN(SUSPEND); }                    // CFA
 switch                  { KEYWORD_RETURN(SWITCH); }
 thread                  { KEYWORD_RETURN(THREAD); }                             // C11

src/Parser/module.mk

-              r3c64c668
+              r58fe85a
 BUILT_SOURCES = Parser/parser.hh
 AM_YFLAGS = -d -t -v
+AM_YFLAGS = -d -t -v -Wno-yacc
 SRC += \
 …
        Parser/ExpressionNode.cc \
        Parser/InitializerNode.cc \
+       Parser/lex.ll \
        Parser/ParseNode.cc \
+       Parser/ParseNode.h \
+       Parser/parser.yy \
+       Parser/ParserTypes.h \
+       Parser/parserutility.cc \
+       Parser/parserutility.h \
        Parser/StatementNode.cc \
        Parser/TypeData.cc \
+       Parser/TypeData.h \
        Parser/TypedefTable.cc \
+       Parser/lex.ll \
+       Parser/parser.yy \
+       Parser/parserutility.cc
+       Parser/TypedefTable.h
 MOSTLYCLEANFILES += Parser/lex.cc Parser/parser.cc Parser/parser.hh Parser/parser.output

src/Parser/parser.yy

-              r3c64c668
+              r58fe85a
 // Created On       : Sat Sep  1 20:22:55 2001
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Fri Feb 21 14:47:29 2020
 // Update Count     : 4468
+// Last Modified On : Sat Oct 24 08:21:14 2020
+// Update Count     : 4624
 //
 …
                         return forCtrl( type, new string( identifier->name ), start, compop, comp, inc );
                 } else {
                         SemanticError( yylloc, "Expression disallowed. Only loop-index name allowed" ); return nullptr;
+                        SemanticError( yylloc, "Expression disallowed. Only loop-index name allowed." ); return nullptr;
                 } // if
         } else {
                 SemanticError( yylloc, "Expression disallowed. Only loop-index name allowed" ); return nullptr;
+                SemanticError( yylloc, "Expression disallowed. Only loop-index name allowed." ); return nullptr;
         } // if
 } // forCtrl
 …
 %token OTYPE FTYPE DTYPE TTYPE TRAIT                                    // CFA
 %token SIZEOF OFFSETOF
+// %token SUSPEND RESUME                                                                        // CFA
+// %token RESUME                                                                                        // CFA
+%token SUSPEND                                                                                  // CFA
 %token ATTRIBUTE EXTENSION                                                              // GCC
 %token IF ELSE SWITCH CASE DEFAULT DO WHILE FOR BREAK CONTINUE GOTO RETURN
 …
 %type<en> conditional_expression                constant_expression                     assignment_expression           assignment_expression_opt
 %type<en> comma_expression                              comma_expression_opt
 %type<en> argument_expression_list              argument_expression                     default_initialize_opt
+%type<en> argument_expression_list_opt  argument_expression                     default_initialize_opt
 %type<ifctl> if_control_expression
 %type<fctl> for_control_expression              for_control_expression_list
 …
 %type<decl> assertion assertion_list assertion_list_opt
 %type<en>   bit_subrange_size_opt bit_subrange_size
+%type<en> bit_subrange_size_opt bit_subrange_size
 %type<decl> basic_declaration_specifier basic_type_name basic_type_specifier direct_type indirect_type
 …
                 // equivalent to the old x[i,j].
                 { $$ = new ExpressionNode( build_binary_val( OperKinds::Index, $1, $3 ) ); }
         | postfix_expression '{' argument_expression_list '}' // CFA, constructor call
+        | postfix_expression '{' argument_expression_list_opt '}' // CFA, constructor call
+                {
                         Token fn;
 …
                         $$ = new ExpressionNode( new ConstructorExpr( build_func( new ExpressionNode( build_varref( fn ) ), (ExpressionNode *)( $1 )->set_last( $3 ) ) ) );
+                }
         | postfix_expression '(' argument_expression_list ')'
+        | postfix_expression '(' argument_expression_list_opt ')'
                 { $$ = new ExpressionNode( build_func( $1, $3 ) ); }
         | postfix_expression '`' identifier                                     // CFA, postfix call
 …
         | '(' type_no_function ')' '@' '{' initializer_list_opt comma_opt '}' // CFA, explicit C compound-literal
                 { $$ = new ExpressionNode( build_compoundLiteral( $2, (new InitializerNode( $6, true ))->set_maybeConstructed( false ) ) ); }
         | '^' primary_expression '{' argument_expression_list '}' // CFA, destructor call
+        | '^' primary_expression '{' argument_expression_list_opt '}' // CFA, destructor call
+                {
                         Token fn;
 …
+        ;
 argument_expression_list:
+argument_expression_list_opt:
         // empty
                 { $$ = nullptr; }
         | argument_expression
         | argument_expression_list ',' argument_expression
+        | argument_expression_list_opt ',' argument_expression
                 { $$ = (ExpressionNode *)($1->set_last( $3 )); }
+        ;
 …
         | '(' aggregate_control '&' ')' cast_expression         // CFA
                 { $$ = new ExpressionNode( build_keyword_cast( $2, $5 ) ); }
-                // VIRTUAL cannot be opt because of look ahead issues
         | '(' VIRTUAL ')' cast_expression                                       // CFA
                 { $$ = new ExpressionNode( new VirtualCastExpr( maybeMoveBuild< Expression >( $4 ), maybeMoveBuildType( nullptr ) ) ); }
 …
         conditional_expression
         | unary_expression assignment_operator assignment_expression
+                { $$ = new ExpressionNode( build_binary_val( $2, $1, $3 ) ); }
+                {
+//                      if ( $2 == OperKinds::AtAssn ) {
+//                              SemanticError( yylloc, "C @= assignment is currently unimplemented." ); $$ = nullptr;
+//                      } else {
+                                $$ = new ExpressionNode( build_binary_val( $2, $1, $3 ) );
+//                      } // if
+                }
         | unary_expression '=' '{' initializer_list_opt comma_opt '}'
                 { SemanticError( yylloc, "Initializer assignment is currently unimplemented." ); $$ = nullptr; }
 …
 tuple_expression_list:
+        assignment_expression_opt
+        | tuple_expression_list ',' assignment_expression_opt
+        assignment_expression
+        | '@'                                                                                           // CFA
+                { SemanticError( yylloc, "Eliding tuple element with '@' is currently unimplemented." ); $$ = nullptr; }
+        | tuple_expression_list ',' assignment_expression
                 { $$ = (ExpressionNode *)($1->set_last( $3 )); }
+        | tuple_expression_list ',' '@'
+                { SemanticError( yylloc, "Eliding tuple element with '@' is currently unimplemented." ); $$ = nullptr; }
+        ;
 …
         IF '(' if_control_expression ')' statement                      %prec THEN
                 // explicitly deal with the shift/reduce conflict on if/else
                 { $$ = new StatementNode( build_if( $3, $5, nullptr ) ); }
+                { $$ = new StatementNode( build_if( $3, maybe_build_compound( $5 ), nullptr ) ); }
         | IF '(' if_control_expression ')' statement ELSE statement
                 { $$ = new StatementNode( build_if( $3, $5, $7 ) ); }
+                { $$ = new StatementNode( build_if( $3, maybe_build_compound( $5 ), maybe_build_compound( $7 ) ) ); }
+        ;
 …
 case_clause:                                                                                    // CFA
         case_label_list statement                                       { $$ = $1->append_last_case( new StatementNode( build_compound( $2 ) ) ); }
+        case_label_list statement                                       { $$ = $1->append_last_case( maybe_build_compound( $2 ) ); }
+        ;
 …
 iteration_statement:
         WHILE '(' push if_control_expression ')' statement pop
                 { $$ = new StatementNode( build_while( $4, $6 ) ); }
+                { $$ = new StatementNode( build_while( $4, maybe_build_compound( $6 ) ) ); }
         | WHILE '(' ')' statement                                                       // CFA => while ( 1 )
                 { $$ = new StatementNode( build_while( new IfCtrl( nullptr, new ExpressionNode( build_constantInteger( *new string( "1" ) ) ) ), $4 ) ); }
+                { $$ = new StatementNode( build_while( new IfCtrl( nullptr, new ExpressionNode( build_constantInteger( *new string( "1" ) ) ) ), maybe_build_compound( $4 ) ) ); }
         | DO statement WHILE '(' comma_expression ')' ';'
                 { $$ = new StatementNode( build_do_while( $5, $2 ) ); }
+                { $$ = new StatementNode( build_do_while( $5, maybe_build_compound( $2 ) ) ); }
         | DO statement WHILE '(' ')' ';'                                        // CFA => do while( 1 )
                 { $$ = new StatementNode( build_do_while( new ExpressionNode( build_constantInteger( *new string( "1" ) ) ), $2 ) ); }
+                { $$ = new StatementNode( build_do_while( new ExpressionNode( build_constantInteger( *new string( "1" ) ) ), maybe_build_compound( $2 ) ) ); }
         | FOR '(' push for_control_expression_list ')' statement pop
                 { $$ = new StatementNode( build_for( $4, $6 ) ); }
+                { $$ = new StatementNode( build_for( $4, maybe_build_compound( $6 ) ) ); }
         | FOR '(' ')' statement                                                         // CFA => for ( ;; )
                 { $$ = new StatementNode( build_for( new ForCtrl( (ExpressionNode * )nullptr, (ExpressionNode * )nullptr, (ExpressionNode * )nullptr ), $4 ) ); }
+                { $$ = new StatementNode( build_for( new ForCtrl( (ExpressionNode * )nullptr, (ExpressionNode * )nullptr, (ExpressionNode * )nullptr ), maybe_build_compound( $4 ) ) ); }
+        ;
 …
                 { $$ = forCtrl( $1, new string( DeclarationNode::anonymous.newName() ), new ExpressionNode( build_constantInteger( *new string( "0" ) ) ),
                                                 OperKinds::LThan, $1->clone(), new ExpressionNode( build_constantInteger( *new string( "1" ) ) ) ); }
         | '=' comma_expression                                                                  // CFA
+        | '=' comma_expression                                                          // CFA
                 { $$ = forCtrl( $2, new string( DeclarationNode::anonymous.newName() ), new ExpressionNode( build_constantInteger( *new string( "0" ) ) ),
                                                 OperKinds::LEThan, $2->clone(), new ExpressionNode( build_constantInteger( *new string( "1" ) ) ) ); }
 …
         | comma_expression inclexcl comma_expression '~' comma_expression // CFA
                 { $$ = forCtrl( $1, new string( DeclarationNode::anonymous.newName() ), $1->clone(), $2, $3, $5 ); }
+        | comma_expression ';'                                                          // CFA
+                { $$ = forCtrl( new ExpressionNode( build_constantInteger( *new string( "0u" ) ) ), $1, nullptr, OperKinds::LThan, nullptr, nullptr ); }
         | comma_expression ';' comma_expression                         // CFA
                 { $$ = forCtrl( $3, $1, new ExpressionNode( build_constantInteger( *new string( "0" ) ) ),
                                                 OperKinds::LThan, $3->clone(), new ExpressionNode( build_constantInteger( *new string( "1" ) ) ) ); }
         | comma_expression ';' '=' comma_expression                             // CFA
+        | comma_expression ';' '=' comma_expression                     // CFA
                 { $$ = forCtrl( $4, $1, new ExpressionNode( build_constantInteger( *new string( "0" ) ) ),
                                                 OperKinds::LEThan, $4->clone(), new ExpressionNode( build_constantInteger( *new string( "1" ) ) ) ); }
 …
         | RETURN '{' initializer_list_opt comma_opt '}' ';'
                 { SemanticError( yylloc, "Initializer return is currently unimplemented." ); $$ = nullptr; }
+        // | SUSPEND ';'
+        //      { SemanticError( yylloc, "Suspend expression is currently unimplemented." ); $$ = nullptr; }
+        // | SUSPEND compound_statement ';'
+        //      { SemanticError( yylloc, "Suspend expression is currently unimplemented." ); $$ = nullptr; }
+        | SUSPEND ';'
+                { $$ = new StatementNode( build_suspend( nullptr ) ); }
+        | SUSPEND compound_statement
+                { $$ = new StatementNode( build_suspend( $2 ) ); }
+        | SUSPEND COROUTINE ';'
+                { $$ = new StatementNode( build_suspend( nullptr, SuspendStmt::Coroutine ) ); }
+        | SUSPEND COROUTINE compound_statement
+                { $$ = new StatementNode( build_suspend( $3, SuspendStmt::Coroutine ) ); }
+        | SUSPEND GENERATOR ';'
+                { $$ = new StatementNode( build_suspend( nullptr, SuspendStmt::Generator ) ); }
+        | SUSPEND GENERATOR compound_statement
+                { $$ = new StatementNode( build_suspend( $3, SuspendStmt::Generator ) ); }
         | THROW assignment_expression_opt ';'                           // handles rethrow
                 { $$ = new StatementNode( build_throw( $2 ) ); }
 …
 // If MUTEX becomes a general qualifier, there are shift/reduce conflicts, so change syntax to "with mutex".
 mutex_statement:
         MUTEX '(' argument_expression_list ')' statement
+        MUTEX '(' argument_expression_list_opt ')' statement
                 { SemanticError( yylloc, "Mutex statement is currently unimplemented." ); $$ = nullptr; }
+        ;
 …
         WAITFOR '(' cast_expression ')'
                 { $$ = $3; }
 //      | WAITFOR '(' cast_expression ',' argument_expression_list ')'
+//      | WAITFOR '(' cast_expression ',' argument_expression_list_opt ')'
 //              { $$ = (ExpressionNode *)$3->set_last( $5 ); }
         | WAITFOR '(' cast_expression_list ':' argument_expression_list ')'
+        | WAITFOR '(' cast_expression_list ':' argument_expression_list_opt ')'
                 { $$ = (ExpressionNode *)($3->set_last( $5 )); }
+        ;
 …
         cast_expression
         | cast_expression_list ',' cast_expression
+                { $$ = (ExpressionNode *)($1->set_last( $3 )); }
+                // { $$ = (ExpressionNode *)($1->set_last( $3 )); }
+                { SemanticError( yylloc, "List of mutex member is currently unimplemented." ); $$ = nullptr; }
+        ;
 …
 waitfor_clause:
         when_clause_opt waitfor statement                                       %prec THEN
                 { $$ = build_waitfor( $2, $3, $1 ); }
+                { $$ = build_waitfor( $2, maybe_build_compound( $3 ), $1 ); }
         | when_clause_opt waitfor statement WOR waitfor_clause
                 { $$ = build_waitfor( $2, $3, $1, $5 ); }
+                { $$ = build_waitfor( $2, maybe_build_compound( $3 ), $1, $5 ); }
         | when_clause_opt timeout statement                                     %prec THEN
                 { $$ = build_waitfor_timeout( $2, $3, $1 ); }
+                { $$ = build_waitfor_timeout( $2, maybe_build_compound( $3 ), $1 ); }
         | when_clause_opt ELSE statement
                 { $$ = build_waitfor_timeout( nullptr, $3, $1 ); }
+                { $$ = build_waitfor_timeout( nullptr, maybe_build_compound( $3 ), $1 ); }
                 // "else" must be conditional after timeout or timeout is never triggered (i.e., it is meaningless)
         | when_clause_opt timeout statement WOR ELSE statement
                 { SemanticError( yylloc, "else clause must be conditional after timeout or timeout never triggered." ); $$ = nullptr; }
         | when_clause_opt timeout statement WOR when_clause ELSE statement
                 { $$ = build_waitfor_timeout( $2, $3, $1, $7, $5 ); }
+                { $$ = build_waitfor_timeout( $2, maybe_build_compound( $3 ), $1, maybe_build_compound( $7 ), $5 ); }
+        ;
 …
 typedef_expression:
                 // GCC, naming expression type: typedef name = exp; gives a name to the type of an expression
+                // deprecated GCC, naming expression type: typedef name = exp; gives a name to the type of an expression
         TYPEDEF identifier '=' assignment_expression
+                {
+                        // $$ = DeclarationNode::newName( 0 );                  // unimplemented
+                        SemanticError( yylloc, "Typedef expression is currently unimplemented." ); $$ = nullptr;
+                        SemanticError( yylloc, "Typedef expression is deprecated, use typeof(...) instead." ); $$ = nullptr;
+                }
         | typedef_expression pop ',' push identifier '=' assignment_expression
+                {
+                        // $$ = DeclarationNode::newName( 0 );                  // unimplemented
+                        SemanticError( yylloc, "Typedef expression is currently unimplemented." ); $$ = nullptr;
+                }
+        ;
+//c_declaration:
+//      declaring_list pop ';'
+//      | typedef_declaration pop ';'
+//      | typedef_expression pop ';'                                            // GCC, naming expression type
+//      | sue_declaration_specifier pop ';'
+//      ;
+//
+//declaring_list:
+//              // A semantic check is required to ensure asm_name only appears on declarations with implicit or explicit static
+//              // storage-class
+//       declarator asm_name_opt initializer_opt
+//              {
+//                      typedefTable.addToEnclosingScope( IDENTIFIER );
+//                      $$ = ( $2->addType( $1 ))->addAsmName( $3 )->addInitializer( $4 );
+//              }
+//      | declaring_list ',' attribute_list_opt declarator asm_name_opt initializer_opt
+//              {
+//                      typedefTable.addToEnclosingScope( IDENTIFIER );
+//                      $$ = $1->appendList( $1->cloneBaseType( $4->addAsmName( $5 )->addInitializer( $6 ) ) );
+//              }
+//      ;
+                        SemanticError( yylloc, "Typedef expression is deprecated, use typeof(...) instead." ); $$ = nullptr;
+                }
+        ;
 c_declaration:
 …
                 { $$ = distAttr( $1, $2 ); }
         | typedef_declaration
         | typedef_expression                                                            // GCC, naming expression type
+        | typedef_expression                                                            // deprecated GCC, naming expression type
         | sue_declaration_specifier
+        ;
 …
                 { yyy = true; $$ = AggregateDecl::Union; }
         | EXCEPTION                                                                                     // CFA
+                { yyy = true; $$ = AggregateDecl::Exception; }
+                // { yyy = true; $$ = AggregateDecl::Exception; }
+                { SemanticError( yylloc, "exception aggregate is currently unimplemented." ); $$ = AggregateDecl::NoAggregate; }
+        ;
 aggregate_control:                                                                              // CFA
+        GENERATOR
+                { yyy = true; $$ = AggregateDecl::Coroutine; }
+        MONITOR
+                { yyy = true; $$ = AggregateDecl::Monitor; }
+        | MUTEX STRUCT
+                { yyy = true; $$ = AggregateDecl::Monitor; }
+        | GENERATOR
+                { yyy = true; $$ = AggregateDecl::Generator; }
+        | MUTEX GENERATOR
+                { SemanticError( yylloc, "monitor generator is currently unimplemented." ); $$ = AggregateDecl::NoAggregate; }
         | COROUTINE
                 { yyy = true; $$ = AggregateDecl::Coroutine; }
         | MONITOR
                 { yyy = true; $$ = AggregateDecl::Monitor; }
+        | MUTEX COROUTINE
+                { SemanticError( yylloc, "monitor coroutine is currently unimplemented." ); $$ = AggregateDecl::NoAggregate; }
         | THREAD
                 { yyy = true; $$ = AggregateDecl::Thread; }
+        | MUTEX THREAD
+                { SemanticError( yylloc, "monitor thread is currently unimplemented." ); $$ = AggregateDecl::NoAggregate; }
+        ;
 …
 // Overloading: function, data, and operator identifiers may be overloaded.
 //
 // Type declarations: "type" is used to generate new types for declaring objects. Similarly, "dtype" is used for object
+// Type declarations: "otype" is used to generate new types for declaring objects. Similarly, "dtype" is used for object
 //     and incomplete types, and "ftype" is used for function types. Type declarations with initializers provide
 //     definitions of new types. Type declarations with storage class "extern" provide opaque types.
 …
         type_class identifier_or_type_name
                 { typedefTable.addToScope( *$2, TYPEDEFname, "9" ); }
           type_initializer_opt assertion_list_opt
+        type_initializer_opt assertion_list_opt
                 { $$ = DeclarationNode::newTypeParam( $1, $2 )->addTypeInitializer( $4 )->addAssertions( $5 ); }
         | type_specifier identifier_parameter_declarator
 …
         assertion
         | assertion_list assertion
                 { $$ = $1 ? $1->appendList( $2 ) : $2; }
+                { $$ = $1->appendList( $2 ); }
+        ;
 …
         | attr_name
                 { $$ = DeclarationNode::newAttribute( $1 ); }
         | attr_name '(' argument_expression_list ')'
+        | attr_name '(' argument_expression_list_opt ')'
                 { $$ = DeclarationNode::newAttribute( $1, $3 ); }
+        ;

src/ResolvExpr/AdjustExprType.cc

-              r3c64c668
+              r58fe85a
 namespace {
+        struct AdjustExprType_new final : public ast::WithShortCircuiting {
+        class AdjustExprType_new final : public ast::WithShortCircuiting {
+                const ast::SymbolTable & symtab;
+        public:
                 const ast::TypeEnvironment & tenv;
-                const ast::SymbolTable & symtab;
                 AdjustExprType_new( const ast::TypeEnvironment & e, const ast::SymbolTable & syms )
                 : tenv( e ), symtab( syms ) {}
+                : symtab( syms ), tenv( e ) {}
                 void premutate( const ast::VoidType * ) { visit_children = false; }
                 void premutate( const ast::BasicType * ) { visit_children = false; }
                 void premutate( const ast::PointerType * ) { visit_children = false; }
                 void premutate( const ast::ArrayType * ) { visit_children = false; }
                 void premutate( const ast::FunctionType * ) { visit_children = false; }
                 void premutate( const ast::StructInstType * ) { visit_children = false; }
                 void premutate( const ast::UnionInstType * ) { visit_children = false; }
                 void premutate( const ast::EnumInstType * ) { visit_children = false; }
                 void premutate( const ast::TraitInstType * ) { visit_children = false; }
                 void premutate( const ast::TypeInstType * ) { visit_children = false; }
                 void premutate( const ast::TupleType * ) { visit_children = false; }
                 void premutate( const ast::VarArgsType * ) { visit_children = false; }
                 void premutate( const ast::ZeroType * ) { visit_children = false; }
                 void premutate( const ast::OneType * ) { visit_children = false; }
+                void previsit( const ast::VoidType * ) { visit_children = false; }
+                void previsit( const ast::BasicType * ) { visit_children = false; }
+                void previsit( const ast::PointerType * ) { visit_children = false; }
+                void previsit( const ast::ArrayType * ) { visit_children = false; }
+                void previsit( const ast::FunctionType * ) { visit_children = false; }
+                void previsit( const ast::StructInstType * ) { visit_children = false; }
+                void previsit( const ast::UnionInstType * ) { visit_children = false; }
+                void previsit( const ast::EnumInstType * ) { visit_children = false; }
+                void previsit( const ast::TraitInstType * ) { visit_children = false; }
+                void previsit( const ast::TypeInstType * ) { visit_children = false; }
+                void previsit( const ast::TupleType * ) { visit_children = false; }
+                void previsit( const ast::VarArgsType * ) { visit_children = false; }
+                void previsit( const ast::ZeroType * ) { visit_children = false; }
+                void previsit( const ast::OneType * ) { visit_children = false; }
                 const ast::Type * postmutate( const ast::ArrayType * at ) {
+                const ast::Type * postvisit( const ast::ArrayType * at ) {
                         return new ast::PointerType{ at->base, at->qualifiers };
+                }
                 const ast::Type * postmutate( const ast::FunctionType * ft ) {
+                const ast::Type * postvisit( const ast::FunctionType * ft ) {
                         return new ast::PointerType{ ft };
+                }
                 const ast::Type * postmutate( const ast::TypeInstType * inst ) {
+                const ast::Type * postvisit( const ast::TypeInstType * inst ) {
                         // replace known function-type-variables with pointer-to-function
                         if ( const ast::EqvClass * eqvClass = tenv.lookup( inst->name ) ) {
+                        if ( const ast::EqvClass * eqvClass = tenv.lookup( *inst ) ) {
                                 if ( eqvClass->data.kind == ast::TypeDecl::Ftype ) {
                                         return new ast::PointerType{ inst };

src/ResolvExpr/AlternativeFinder.cc

-              r3c64c668
+              r58fe85a
         void printAlts( const AltList &list, std::ostream &os, unsigned int indentAmt ) {
+                Indenter indent = { indentAmt };
+                for ( AltList::const_iterator i = list.begin(); i != list.end(); ++i ) {
+                        i->print( os, indent );
+                        os << std::endl;
+                std::vector<std::string> sorted;
+                sorted.reserve(list.size());
+                for(const auto & c : list) {
+                        std::stringstream ss;
+                        c.print( ss, indentAmt );
+                        sorted.push_back(ss.str());
+                }
+                std::sort(sorted.begin(), sorted.end());
+                for ( const auto & s : sorted ) {
+                        os << s << std::endl;
+                }
+        }
 …
                         SemanticError( expr, "No reasonable alternatives for expression " );
+                }
                 if ( mode.satisfyAssns || mode.prune ) {
+                if ( mode.prune ) {
                         // trim candidates just to those where the assertions resolve
                         // - necessary pre-requisite to pruning
 …
                         unify( castExpr->result, alt.expr->result, alt.env, needAssertions,
                                 haveAssertions, openVars, indexer );
+                        Cost thisCost = castCost( alt.expr->result, castExpr->result, alt.expr->get_lvalue(),
+                                indexer, alt.env );
+                        Cost thisCost =
+                                castExpr->isGenerated
+                                ? conversionCost( alt.expr->result, castExpr->result, alt.expr->get_lvalue(),   indexer, alt.env )
+                                : castCost( alt.expr->result, castExpr->result, alt.expr->get_lvalue(), indexer, alt.env );
                         PRINT(
                                 std::cerr << "working on cast with result: " << castExpr->result << std::endl;
 …
                                 // unification run for side-effects
+                                unify( toType, alt.expr->result, newEnv, need, have, openVars, indexer );
+                                bool canUnify = unify( toType, alt.expr->result, newEnv, need, have, openVars, indexer );
+                                (void) canUnify;
                                 // xxx - do some inspecting on this line... why isn't result bound to initAlt.type?
                                 Cost thisCost = castCost( alt.expr->result, toType, alt.expr->get_lvalue(),
+                                Cost thisCost = computeConversionCost( alt.expr->result, toType, alt.expr->get_lvalue(),
                                         indexer, newEnv );
+                                PRINT(
+                                        Cost legacyCost = castCost( alt.expr->result, toType, alt.expr->get_lvalue(),
+                                                indexer, newEnv );
+                                        std::cerr << "Considering initialization:";
+                                        std::cerr << std::endl << "  FROM: "; alt.expr->result->print(std::cerr);
+                                        std::cerr << std::endl << "  TO: ";   toType          ->print(std::cerr);
+                                        std::cerr << std::endl << "  Unification " << (canUnify ? "succeeded" : "failed");
+                                        std::cerr << std::endl << "  Legacy cost " << legacyCost;
+                                        std::cerr << std::endl << "  New cost " << thisCost;
+                                        std::cerr << std::endl;
+                                )
                                 if ( thisCost != Cost::infinity ) {
                                         // count one safe conversion for each value that is thrown away

src/ResolvExpr/Candidate.cpp

-              r3c64c668
+              r58fe85a
 void print( std::ostream & os, const CandidateList & cands, Indenter indent ) {
+        for ( const CandidateRef & cand : cands ) {
+                print( os, *cand, indent );
+                os << std::endl;
+        std::vector<std::string> sorted;
+        sorted.reserve(cands.size());
+        for(const auto & c : cands) {
+                std::stringstream ss;
+                print( ss, *c, indent );
+                sorted.push_back(ss.str());
+        }
+        std::sort(sorted.begin(), sorted.end());
+        for ( const auto & s : sorted ) {
+                os << s << std::endl;
+        }
+}

src/ResolvExpr/Candidate.hpp

-              r3c64c668
+              r58fe85a
         Candidate( const ast::Expr * x, const ast::TypeEnvironment & e )
+        : expr( x ), cost( Cost::zero ), cvtCost( Cost::zero ), env( e ), open(), need() {}
+        : expr( x ), cost( Cost::zero ), cvtCost( Cost::zero ), env( e ), open(), need() {
+                assert(x->result);
+        }
         Candidate( const Candidate & o, const ast::Expr * x, const Cost & addedCost = Cost::zero )
         : expr( x ), cost( o.cost + addedCost ), cvtCost( Cost::zero ), env( o.env ), open( o.open ),
+          need( o.need ) {}
+          need( o.need ) {
+                assert(x->result);
+        }
         Candidate(
                 const ast::Expr * x, const ast::TypeEnvironment & e, const ast::OpenVarSet & o,
+                const ast::Expr * x, const ast::TypeEnvironment & e, const ast::OpenVarSet & o,
                 const ast::AssertionSet & n, const Cost & c, const Cost & cvt = Cost::zero )
+        : expr( x ), cost( c ), cvtCost( cvt ), env( e ), open( o ), need( n.begin(), n.end() ) {}
+        : expr( x ), cost( c ), cvtCost( cvt ), env( e ), open( o ), need( n.begin(), n.end() ) {
+                assert(x->result);
+        }
         Candidate(
 …
                 ast::AssertionSet && n, const Cost & c, const Cost & cvt = Cost::zero )
         : expr( x ), cost( c ), cvtCost( cvt ), env( std::move( e ) ), open( std::move( o ) ),
+          need( n.begin(), n.end() ) {}
+          need( n.begin(), n.end() ) {
+                assert(x->result);
+        }
 };

src/ResolvExpr/CandidateFinder.cpp

-              r3c64c668
+              r58fe85a
 // Author           : Aaron B. Moss
 // Created On       : Wed Jun 5 14:30:00 2019
 // Last Modified By : Aaron B. Moss
 // Last Modified On : Wed Jun 5 14:30:00 2019
 // Update Count     : 1
+// Last Modified By : Andrew Beach
+// Last Modified On : Tue Oct  1 14:55:00 2019
+// Update Count     : 2
 //
 …
 #include "SymTab/Validate.h"      // for validateType
 #include "Tuples/Tuples.h"        // for handleTupleAssignment
+#include "InitTweak/InitTweak.h"  // for getPointerBase
+#include "Common/Stats/Counter.h"
 #define PRINT( text ) if ( resolvep ) { text }
 …
                 return new ast::CastExpr{ expr, expr->result->stripReferences() };
+        }
         return expr;
+}
 …
 UniqueId globalResnSlot = 0;
 Cost computeConversionCost(
         const ast::Type * argType, const ast::Type * paramType, const ast::SymbolTable & symtab,
         const ast::TypeEnvironment & env
+Cost computeConversionCost(
+        const ast::Type * argType, const ast::Type * paramType, bool argIsLvalue,
+        const ast::SymbolTable & symtab, const ast::TypeEnvironment & env
 ) {
         PRINT(
 …
                 std::cerr << std::endl;
+        )
         Cost convCost = conversionCost( argType, paramType, symtab, env );
+        Cost convCost = conversionCost( argType, paramType, argIsLvalue, symtab, env );
         PRINT(
                 std::cerr << std::endl << "cost is " << convCost << std::endl;
 …
         /// Computes conversion cost for a given expression to a given type
         const ast::Expr * computeExpressionConversionCost(
                 const ast::Expr * arg, const ast::Type * paramType, const ast::SymbolTable & symtab, const ast::TypeEnvironment & env, Cost & outCost
+        const ast::Expr * computeExpressionConversionCost(
+                const ast::Expr * arg, const ast::Type * paramType, const ast::SymbolTable & symtab, const ast::TypeEnvironment & env, Cost & outCost
         ) {
+                Cost convCost = computeConversionCost( arg->result, paramType, symtab, env );
+                Cost convCost = computeConversionCost(
+                                arg->result, paramType, arg->get_lvalue(), symtab, env );
                 outCost += convCost;
                 // If there is a non-zero conversion cost, ignoring poly cost, then the expression requires
                 // conversion. Ignore poly cost for now, since this requires resolution of the cast to
+                // If there is a non-zero conversion cost, ignoring poly cost, then the expression requires
+                // conversion. Ignore poly cost for now, since this requires resolution of the cast to
                 // infer parameters and this does not currently work for the reason stated below
                 Cost tmpCost = convCost;
 …
                         return new ast::CastExpr{ arg, newType };
                         // xxx - *should* be able to resolve this cast, but at the moment pointers are not
                         // castable to zero_t, but are implicitly convertible. This is clearly inconsistent,
+                        // xxx - *should* be able to resolve this cast, but at the moment pointers are not
+                        // castable to zero_t, but are implicitly convertible. This is clearly inconsistent,
                         // once this is fixed it should be possible to resolve the cast.
                         // xxx - this isn't working, it appears because type1 (parameter) is seen as widenable,
                         // but it shouldn't be because this makes the conversion from DT* to DT* since
+                        // xxx - this isn't working, it appears because type1 (parameter) is seen as widenable,
+                        // but it shouldn't be because this makes the conversion from DT* to DT* since
                         // commontype(zero_t, DT*) is DT*, rather than nothing
                         // CandidateFinder finder{ symtab, env };
                         // finder.find( arg, ResolvMode::withAdjustment() );
                         // assertf( finder.candidates.size() > 0,
+                        // assertf( finder.candidates.size() > 0,
                         //      "Somehow castable expression failed to find alternatives." );
                         // assertf( finder.candidates.size() == 1,
+                        // assertf( finder.candidates.size() == 1,
                         //      "Somehow got multiple alternatives for known cast expression." );
                         // return finder.candidates.front()->expr;
 …
         /// Computes conversion cost for a given candidate
         Cost computeApplicationConversionCost(
                 CandidateRef cand, const ast::SymbolTable & symtab
+        Cost computeApplicationConversionCost(
+                CandidateRef cand, const ast::SymbolTable & symtab
         ) {
                 auto appExpr = cand->expr.strict_as< ast::ApplicationExpr >();
 …
                                 if ( function->isVarArgs ) {
                                         convCost.incUnsafe();
                                         PRINT( std::cerr << "end of params with varargs function: inc unsafe: "
+                                        PRINT( std::cerr << "end of params with varargs function: inc unsafe: "
                                                 << convCost << std::endl; ; )
                                         // convert reference-typed expressions into value-typed expressions
                                         cand->expr = ast::mutate_field_index(
                                                 appExpr, &ast::ApplicationExpr::args, i,
+                                        cand->expr = ast::mutate_field_index(
+                                                appExpr, &ast::ApplicationExpr::args, i,
                                                 referenceToRvalueConversion( args[i], convCost ) );
                                         continue;
 …
                                 // Default arguments should be free - don't include conversion cost.
                                 // Unwrap them here because they are not relevant to the rest of the system
                                 cand->expr = ast::mutate_field_index(
+                                cand->expr = ast::mutate_field_index(
                                         appExpr, &ast::ApplicationExpr::args, i, def->expr );
                                 ++param;
 …
                         // mark conversion cost and also specialization cost of param type
                         const ast::Type * paramType = (*param)->get_type();
                         cand->expr = ast::mutate_field_index(
                                 appExpr, &ast::ApplicationExpr::args, i,
                                 computeExpressionConversionCost(
                                         args[i], paramType, symtab, cand->env, convCost ) );
                         convCost.decSpec( specCost( paramType ) );
+                        // const ast::Type * paramType = (*param)->get_type();
+                        cand->expr = ast::mutate_field_index(
+                                appExpr, &ast::ApplicationExpr::args, i,
+                                computeExpressionConversionCost(
+                                        args[i], *param, symtab, cand->env, convCost ) );
+                        convCost.decSpec( specCost( *param ) );
                         ++param;  // can't be in for-loop update because of the continue
+                }
 …
                 if ( param != params.end() ) return Cost::infinity;
                 // specialization cost of return types can't be accounted for directly, it disables
+                // specialization cost of return types can't be accounted for directly, it disables
                 // otherwise-identical calls, like this example based on auto-newline in the I/O lib:
                 //
 …
                 // mark type variable and specialization cost of forall clause
                 convCost.incVar( function->forall.size() );
+                for ( const ast::TypeDecl * td : function->forall ) {
+                        convCost.decSpec( td->assertions.size() );
+                }
+                convCost.decSpec( function->assertions.size() );
                 return convCost;
+        }
         void makeUnifiableVars(
                 const ast::ParameterizedType * type, ast::OpenVarSet & unifiableVars,
                 ast::AssertionSet & need
+        void makeUnifiableVars(
+                const ast::FunctionType * type, ast::OpenVarSet & unifiableVars,
+                ast::AssertionSet & need
         ) {
                 for ( const ast::TypeDecl * tyvar : type->forall ) {
                         unifiableVars[ tyvar->name ] = ast::TypeDecl::Data{ tyvar };
                         for ( const ast::DeclWithType * assn : tyvar->assertions ) {
                                 need[ assn ].isUsed = true;
+                        }
+                for ( auto & tyvar : type->forall ) {
+                        unifiableVars[ *tyvar ] = ast::TypeDecl::Data{ tyvar->base };
+                }
+                for ( auto & assn : type->assertions ) {
+                        need[ assn ].isUsed = true;
+                }
+        }
 …
                 ArgPack()
                 : parent( 0 ), expr(), cost( Cost::zero ), env(), need(), have(), open(), nextArg( 0 ),
+                : parent( 0 ), expr(), cost( Cost::zero ), env(), need(), have(), open(), nextArg( 0 ),
                   tupleStart( 0 ), nextExpl( 0 ), explAlt( 0 ) {}
                 ArgPack(
                         const ast::TypeEnvironment & env, const ast::AssertionSet & need,
+                ArgPack(
+                        const ast::TypeEnvironment & env, const ast::AssertionSet & need,
                         const ast::AssertionSet & have, const ast::OpenVarSet & open )
                 : parent( 0 ), expr(), cost( Cost::zero ), env( env ), need( need ), have( have ),
+                : parent( 0 ), expr(), cost( Cost::zero ), env( env ), need( need ), have( have ),
                   open( open ), nextArg( 0 ), tupleStart( 0 ), nextExpl( 0 ), explAlt( 0 ) {}
                 ArgPack(
                         std::size_t parent, const ast::Expr * expr, ast::TypeEnvironment && env,
                         ast::AssertionSet && need, ast::AssertionSet && have, ast::OpenVarSet && open,
                         unsigned nextArg, unsigned tupleStart = 0, Cost cost = Cost::zero,
+                        std::size_t parent, const ast::Expr * expr, ast::TypeEnvironment && env,
+                        ast::AssertionSet && need, ast::AssertionSet && have, ast::OpenVarSet && open,
+                        unsigned nextArg, unsigned tupleStart = 0, Cost cost = Cost::zero,
                         unsigned nextExpl = 0, unsigned explAlt = 0 )
                 : parent(parent), expr( expr ), cost( cost ), env( move( env ) ), need( move( need ) ),
                   have( move( have ) ), open( move( open ) ), nextArg( nextArg ), tupleStart( tupleStart ),
                   nextExpl( nextExpl ), explAlt( explAlt ) {}
                 ArgPack(
                         const ArgPack & o, ast::TypeEnvironment && env, ast::AssertionSet && need,
+                        const ArgPack & o, ast::TypeEnvironment && env, ast::AssertionSet && need,
                         ast::AssertionSet && have, ast::OpenVarSet && open, unsigned nextArg, Cost added )
                 : parent( o.parent ), expr( o.expr ), cost( o.cost + added ), env( move( env ) ),
                   need( move( need ) ), have( move( have ) ), open( move( open ) ), nextArg( nextArg ),
+                : parent( o.parent ), expr( o.expr ), cost( o.cost + added ), env( move( env ) ),
+                  need( move( need ) ), have( move( have ) ), open( move( open ) ), nextArg( nextArg ),
                   tupleStart( o.tupleStart ), nextExpl( 0 ), explAlt( 0 ) {}
                 /// true if this pack is in the middle of an exploded argument
                 bool hasExpl() const { return nextExpl > 0; }
 …
                         return args[ nextArg-1 ][ explAlt ];
+                }
                 /// Ends a tuple expression, consolidating the appropriate args
                 void endTuple( const std::vector< ArgPack > & packs ) {
 …
         /// Instantiates an argument to match a parameter, returns false if no matching results left
         bool instantiateArgument(
                 const ast::Type * paramType, const ast::Init * init, const ExplodedArgs_new & args,
                 std::vector< ArgPack > & results, std::size_t & genStart, const ast::SymbolTable & symtab,
                 unsigned nTuples = 0
+        bool instantiateArgument(
+                const ast::Type * paramType, const ast::Init * init, const ExplodedArgs_new & args,
+                std::vector< ArgPack > & results, std::size_t & genStart, const ast::SymbolTable & symtab,
+                unsigned nTuples = 0
         ) {
                 if ( auto tupleType = dynamic_cast< const ast::TupleType * >( paramType ) ) {
 …
                                 // xxx - dropping initializer changes behaviour from previous, but seems correct
                                 // ^^^ need to handle the case where a tuple has a default argument
                                 if ( ! instantiateArgument(
+                                if ( ! instantiateArgument(
                                         type, nullptr, args, results, genStart, symtab, nTuples ) ) return false;
                                 nTuples = 0;
 …
                 } else if ( const ast::TypeInstType * ttype = Tuples::isTtype( paramType ) ) {
                         // paramType is a ttype, consumes all remaining arguments
                         // completed tuples; will be spliced to end of results to finish
                         std::vector< ArgPack > finalResults{};
 …
                                 for ( std::size_t i = genStart; i < genEnd; ++i ) {
                                         unsigned nextArg = results[i].nextArg;
                                         // use next element of exploded tuple if present
                                         if ( results[i].hasExpl() ) {
 …
                                                 results.emplace_back(
                                                         i, expl.exprs[ results[i].nextExpl ], copy( results[i].env ),
                                                         copy( results[i].need ), copy( results[i].have ),
+                                                        copy( results[i].need ), copy( results[i].have ),
                                                         copy( results[i].open ), nextArg, nTuples, Cost::zero, nextExpl,
                                                         results[i].explAlt );
 …
                                                         // push empty tuple expression
                                                         newResult.parent = i;
+                                                        std::vector< ast::ptr< ast::Expr > > emptyList;
+                                                        newResult.expr =
+                                                                new ast::TupleExpr{ CodeLocation{}, move( emptyList ) };
+                                                        newResult.expr = new ast::TupleExpr{ CodeLocation{}, {} };
                                                         argType = newResult.expr->result;
                                                 } else {
 …
                                                 // check unification for ttype before adding to final
                                                 if (
                                                         unify(
+                                                if (
+                                                        unify(
                                                                 ttype, argType, newResult.env, newResult.need, newResult.have,
                                                                 newResult.open, symtab )
+                                                                newResult.open, symtab )
                                                 ) {
                                                         finalResults.emplace_back( move( newResult ) );
 …
                                                 if ( expl.exprs.empty() ) {
                                                         results.emplace_back(
                                                                 results[i], move( env ), copy( results[i].need ),
+                                                                results[i], move( env ), copy( results[i].need ),
                                                                 copy( results[i].have ), move( open ), nextArg + 1, expl.cost );
                                                         continue;
+                                                }
 …
                                                 // add new result
                                                 results.emplace_back(
                                                         i, expl.exprs.front(), move( env ), copy( results[i].need ),
                                                         copy( results[i].have ), move( open ), nextArg + 1, nTuples,
+                                                        i, expl.exprs.front(), move( env ), copy( results[i].need ),
+                                                        copy( results[i].have ), move( open ), nextArg + 1, nTuples,
                                                         expl.cost, expl.exprs.size() == 1 ? 0 : 1, j );
+                                        }
 …
                                         results.emplace_back(
                                                 i, expr, move( env ), move( need ), move( have ), move( open ), nextArg,
+                                                i, expr, move( env ), move( need ), move( have ), move( open ), nextArg,
                                                 nTuples, Cost::zero, nextExpl, results[i].explAlt );
+                                }
 …
                                         if ( unify( paramType, cnst->result, env, need, have, open, symtab ) ) {
                                                 results.emplace_back(
                                                         i, new ast::DefaultArgExpr{ cnst->location, cnst }, move( env ),
+                                                        i, new ast::DefaultArgExpr{ cnst->location, cnst }, move( env ),
                                                         move( need ), move( have ), move( open ), nextArg, nTuples );
+                                        }
 …
                                 if ( expl.exprs.empty() ) {
                                         results.emplace_back(
                                                 results[i], move( env ), move( need ), move( have ), move( open ),
+                                                results[i], move( env ), move( need ), move( have ), move( open ),
                                                 nextArg + 1, expl.cost );
                                         continue;
+                                }
 …
                                         // add new result
                                         results.emplace_back(
                                                 i, expr, move( env ), move( need ), move( have ), move( open ),
+                                                i, expr, move( env ), move( need ), move( have ), move( open ),
                                                 nextArg + 1, nTuples, expl.cost, expl.exprs.size() == 1 ? 0 : 1, j );
+                                }
 …
                 genStart = genEnd;
                 return genEnd != results.size();
+                return genEnd != results.size();  // were any new results added?
+        }
         /// Generate a cast expression from `arg` to `toType`
         const ast::Expr * restructureCast(
+        const ast::Expr * restructureCast(
                 ast::ptr< ast::Expr > & arg, const ast::Type * toType, ast::GeneratedFlag isGenerated = ast::GeneratedCast
         ) {
                 if (
                         arg->result->size() > 1
                         && ! toType->isVoid()
                         && ! dynamic_cast< const ast::ReferenceType * >( toType )
+                if (
+                        arg->result->size() > 1
+                        && ! toType->isVoid()
+                        && ! dynamic_cast< const ast::ReferenceType * >( toType )
                 ) {
                         // Argument is a tuple and the target type is neither void nor a reference. Cast each
                         // member of the tuple to its corresponding target type, producing the tuple of those
                         // cast expressions. If there are more components of the tuple than components in the
                         // target type, then excess components do not come out in the result expression (but
+                        // Argument is a tuple and the target type is neither void nor a reference. Cast each
+                        // member of the tuple to its corresponding target type, producing the tuple of those
+                        // cast expressions. If there are more components of the tuple than components in the
+                        // target type, then excess components do not come out in the result expression (but
                         // UniqueExpr ensures that the side effects will still be produced)
                         if ( Tuples::maybeImpureIgnoreUnique( arg ) ) {
                                 // expressions which may contain side effects require a single unique instance of
+                                // expressions which may contain side effects require a single unique instance of
                                 // the expression
                                 arg = new ast::UniqueExpr{ arg->location, arg };
 …
                                 // cast each component
                                 ast::ptr< ast::Expr > idx = new ast::TupleIndexExpr{ arg->location, arg, i };
                                 components.emplace_back(
+                                components.emplace_back(
                                         restructureCast( idx, toType->getComponent( i ), isGenerated ) );
+                        }
 …
         /// Actually visits expressions to find their candidate interpretations
+        struct Finder final : public ast::WithShortCircuiting {
+        class Finder final : public ast::WithShortCircuiting {
+                const ast::SymbolTable & symtab;
+        public:
+                static size_t traceId;
                 CandidateFinder & selfFinder;
-                const ast::SymbolTable & symtab;
                 CandidateList & candidates;
                 const ast::TypeEnvironment & tenv;
                 ast::ptr< ast::Type > & targetType;
+                enum Errors {
+                        NotFound,
+                        NoMatch,
+                        ArgsToFew,
+                        ArgsToMany,
+                        RetsToFew,
+                        RetsToMany,
+                        NoReason
+                };
+                struct {
+                        Errors code = NotFound;
+                } reason;
                 Finder( CandidateFinder & f )
                 : selfFinder( f ), symtab( f.symtab ), candidates( f.candidates ), tenv( f.env ),
+                : symtab( f.localSyms ), selfFinder( f ), candidates( f.candidates ), tenv( f.env ),
                   targetType( f.targetType ) {}
                 void previsit( const ast::Node * ) { visit_children = false; }
 …
                 void addCandidate( Args &&... args ) {
                         candidates.emplace_back( new Candidate{ std::forward<Args>( args )... } );
+                        reason.code = NoReason;
+                }
 …
                 /// Completes a function candidate with arguments located
                 void validateFunctionCandidate(
                         const CandidateRef & func, ArgPack & result, const std::vector< ArgPack > & results,
                         CandidateList & out
+                void validateFunctionCandidate(
+                        const CandidateRef & func, ArgPack & result, const std::vector< ArgPack > & results,
+                        CandidateList & out
                 ) {
                         ast::ApplicationExpr * appExpr =
+                        ast::ApplicationExpr * appExpr =
                                 new ast::ApplicationExpr{ func->expr->location, func->expr };
                         // sum cost and accumulate arguments
 …
                         appExpr->args = move( vargs );
                         // build and validate new candidate
                         auto newCand =
+                        auto newCand =
                                 std::make_shared<Candidate>( appExpr, result.env, result.open, result.need, cost );
                         PRINT(
 …
                 /// Builds a list of candidates for a function, storing them in out
                 void makeFunctionCandidates(
                         const CandidateRef & func, const ast::FunctionType * funcType,
+                        const CandidateRef & func, const ast::FunctionType * funcType,
                         const ExplodedArgs_new & args, CandidateList & out
                 ) {
 …
                         ast::TypeEnvironment funcEnv{ func->env };
                         makeUnifiableVars( funcType, funcOpen, funcNeed );
                         // add all type variables as open variables now so that those not used in the parameter
                         // list are still considered open
+                        // add all type variables as open variables now so that those not used in the
+                        // parameter list are still considered open
                         funcEnv.add( funcType->forall );
                         if ( targetType && ! targetType->isVoid() && ! funcType->returns.empty() ) {
                                 // attempt to narrow based on expected target type
                                 const ast::Type * returnType = funcType->returns.front()->get_type();
                                 if ( ! unify(
                                         returnType, targetType, funcEnv, funcNeed, funcHave, funcOpen, symtab )
+                                const ast::Type * returnType = funcType->returns.front();
+                                if ( ! unify(
+                                        returnType, targetType, funcEnv, funcNeed, funcHave, funcOpen, symtab )
                                 ) {
                                         // unification failed, do not pursue this candidate
 …
                         std::size_t genStart = 0;
+                        for ( const ast::DeclWithType * param : funcType->params ) {
+                                auto obj = strict_dynamic_cast< const ast::ObjectDecl * >( param );
+                                // Try adding the arguments corresponding to the current parameter to the existing
+                        // xxx - how to handle default arg after change to ftype representation?
+                        if (const ast::VariableExpr * varExpr = func->expr.as<ast::VariableExpr>()) {
+                                if (const ast::FunctionDecl * funcDecl = varExpr->var.as<ast::FunctionDecl>()) {
+                                        // function may have default args only if directly calling by name
+                                        // must use types on candidate however, due to RenameVars substitution
+                                        auto nParams = funcType->params.size();
+                                        for (size_t i=0; i<nParams; ++i) {
+                                                auto obj = funcDecl->params[i].strict_as<ast::ObjectDecl>();
+                                                if (!instantiateArgument(
+                                                        funcType->params[i], obj->init, args, results, genStart, symtab)) return;
+                                        }
+                                        goto endMatch;
+                                }
+                        }
+                        for ( const auto & param : funcType->params ) {
+                                // Try adding the arguments corresponding to the current parameter to the existing
                                 // matches
+                                if ( ! instantiateArgument(
+                                        obj->type, obj->init, args, results, genStart, symtab ) ) return;
+                        }
+                                // no default args for indirect calls
+                                if ( ! instantiateArgument(
+                                        param, nullptr, args, results, genStart, symtab ) ) return;
+                        }
+                        endMatch:
                         if ( funcType->isVarArgs ) {
                                 // append any unused arguments to vararg pack
 …
                                                         if ( expl.exprs.empty() ) {
                                                                 results.emplace_back(
                                                                         results[i], move( env ), copy( results[i].need ),
                                                                         copy( results[i].have ), move( open ), nextArg + 1,
+                                                                        results[i], move( env ), copy( results[i].need ),
+                                                                        copy( results[i].have ), move( open ), nextArg + 1,
                                                                         expl.cost );
 …
                                                         results.emplace_back(
                                                                 i, expl.exprs.front(), move( env ), copy( results[i].need ),
                                                                 copy( results[i].have ), move( open ), nextArg + 1, 0, expl.cost,
+                                                                copy( results[i].have ), move( open ), nextArg + 1, 0, expl.cost,
                                                                 expl.exprs.size() == 1 ? 0 : 1, j );
+                                                }
 …
                 /// Adds implicit struct-conversions to the alternative list
                 void addAnonConversions( const CandidateRef & cand ) {
                         // adds anonymous member interpretations whenever an aggregate value type is seen.
                         // it's okay for the aggregate expression to have reference type -- cast it to the
+                        // adds anonymous member interpretations whenever an aggregate value type is seen.
+                        // it's okay for the aggregate expression to have reference type -- cast it to the
                         // base type to treat the aggregate as the referenced value
                         ast::ptr< ast::Expr > aggrExpr( cand->expr );
                         ast::ptr< ast::Type > & aggrType = aggrExpr.get_and_mutate()->result;
                         cand->env.apply( aggrType );
                         if ( aggrType.as< ast::ReferenceType >() ) {
                                 aggrExpr = new ast::CastExpr{ aggrExpr, aggrType->stripReferences() };
 …
                 /// Adds aggregate member interpretations
                 void addAggMembers(
                         const ast::ReferenceToType * aggrInst, const ast::Expr * expr,
                         const Candidate & cand, const Cost & addedCost, const std::string & name
+                void addAggMembers(
+                        const ast::BaseInstType * aggrInst, const ast::Expr * expr,
+                        const Candidate & cand, const Cost & addedCost, const std::string & name
                 ) {
                         for ( const ast::Decl * decl : aggrInst->lookup( name ) ) {
                                 auto dwt = strict_dynamic_cast< const ast::DeclWithType * >( decl );
                                 CandidateRef newCand = std::make_shared<Candidate>(
+                                CandidateRef newCand = std::make_shared<Candidate>(
                                         cand, new ast::MemberExpr{ expr->location, dwt, expr }, addedCost );
                                 // add anonymous member interpretations whenever an aggregate value type is seen
+                                // add anonymous member interpretations whenever an aggregate value type is seen
                                 // as a member expression
                                 addAnonConversions( newCand );
 …
                 /// Adds tuple member interpretations
                 void addTupleMembers(
                         const ast::TupleType * tupleType, const ast::Expr * expr, const Candidate & cand,
                         const Cost & addedCost, const ast::Expr * member
+                void addTupleMembers(
+                        const ast::TupleType * tupleType, const ast::Expr * expr, const Candidate & cand,
+                        const Cost & addedCost, const ast::Expr * member
                 ) {
                         if ( auto constantExpr = dynamic_cast< const ast::ConstantExpr * >( member ) ) {
                                 // get the value of the constant expression as an int, must be between 0 and the
+                                // get the value of the constant expression as an int, must be between 0 and the
                                 // length of the tuple to have meaning
                                 long long val = constantExpr->intValue();
                                 if ( val >= 0 && (unsigned long long)val < tupleType->size() ) {
                                         addCandidate(
                                                 cand, new ast::TupleIndexExpr{ expr->location, expr, (unsigned)val },
+                                                cand, new ast::TupleIndexExpr{ expr->location, expr, (unsigned)val },
                                                 addedCost );
+                                }
 …
                 void postvisit( const ast::UntypedExpr * untypedExpr ) {
+                        CandidateFinder funcFinder{ symtab, tenv };
+                        funcFinder.find( untypedExpr->func, ResolvMode::withAdjustment() );
+                        // short-circuit if no candidates
+                        if ( funcFinder.candidates.empty() ) return;
+                        std::vector< CandidateFinder > argCandidates =
+                        std::vector< CandidateFinder > argCandidates =
                                 selfFinder.findSubExprs( untypedExpr->args );
                         // take care of possible tuple assignments
                         // if not tuple assignment, handled as normal function call
                         Tuples::handleTupleAssignment( selfFinder, untypedExpr, argCandidates );
+                        CandidateFinder funcFinder{ symtab, tenv };
+                        if (auto nameExpr = untypedExpr->func.as<ast::NameExpr>()) {
+                                auto kind = ast::SymbolTable::getSpecialFunctionKind(nameExpr->name);
+                                if (kind != ast::SymbolTable::SpecialFunctionKind::NUMBER_OF_KINDS) {
+                                        assertf(!argCandidates.empty(), "special function call without argument");
+                                        for (auto & firstArgCand: argCandidates[0]) {
+                                                ast::ptr<ast::Type> argType = firstArgCand->expr->result;
+                                                firstArgCand->env.apply(argType);
+                                                // strip references
+                                                // xxx - is this correct?
+                                                while (argType.as<ast::ReferenceType>()) argType = argType.as<ast::ReferenceType>()->base;
+                                                // convert 1-tuple to plain type
+                                                if (auto tuple = argType.as<ast::TupleType>()) {
+                                                        if (tuple->size() == 1) {
+                                                                argType = tuple->types[0];
+                                                        }
+                                                }
+                                                // if argType is an unbound type parameter, all special functions need to be searched.
+                                                if (isUnboundType(argType)) {
+                                                        funcFinder.otypeKeys.clear();
+                                                        break;
+                                                }
+                                                if (argType.as<ast::PointerType>()) funcFinder.otypeKeys.insert(Mangle::Encoding::pointer);
+                                                else funcFinder.otypeKeys.insert(Mangle::mangle(argType, Mangle::NoGenericParams | Mangle::Type));
+                                        }
+                                }
+                        }
+                        // if candidates are already produced, do not fail
+                        // xxx - is it possible that handleTupleAssignment and main finder both produce candidates?
+                        // this means there exists ctor/assign functions with a tuple as first parameter.
+                        ResolvMode mode = {
+                                true, // adjust
+                                !untypedExpr->func.as<ast::NameExpr>(), // prune if not calling by name
+                                selfFinder.candidates.empty() // failfast if other options are not found
+                        };
+                        funcFinder.find( untypedExpr->func, mode );
+                        // short-circuit if no candidates
+                        // if ( funcFinder.candidates.empty() ) return;
+                        reason.code = NoMatch;
                         // find function operators
 …
                                                 if ( auto function = pointer->base.as< ast::FunctionType >() ) {
                                                         CandidateRef newFunc{ new Candidate{ *func } };
                                                         newFunc->expr =
+                                                        newFunc->expr =
                                                                 referenceToRvalueConversion( newFunc->expr, newFunc->cost );
                                                         makeFunctionCandidates( newFunc, function, argExpansions, found );
+                                                }
                                         } else if (
                                                 auto inst = dynamic_cast< const ast::TypeInstType * >( funcResult )
+                                        } else if (
+                                                auto inst = dynamic_cast< const ast::TypeInstType * >( funcResult )
                                         ) {
                                                 if ( const ast::EqvClass * clz = func->env.lookup( inst->name ) ) {
+                                                if ( const ast::EqvClass * clz = func->env.lookup( *inst ) ) {
                                                         if ( auto function = clz->bound.as< ast::FunctionType >() ) {
                                                                 CandidateRef newFunc{ new Candidate{ *func } };
                                                                 newFunc->expr =
+                                                                newFunc->expr =
                                                                         referenceToRvalueConversion( newFunc->expr, newFunc->cost );
                                                                 makeFunctionCandidates( newFunc, function, argExpansions, found );
 …
                                 std::vector< ExplodedArg > funcE;
                                 funcE.reserve( funcFinder.candidates.size() );
                                 for ( const CandidateRef & func : funcFinder ) {
+                                for ( const CandidateRef & func : funcFinder ) {
                                         funcE.emplace_back( *func, symtab );
+                                }
 …
                                                         if ( auto function = pointer->base.as< ast::FunctionType >() ) {
                                                                 CandidateRef newOp{ new Candidate{ *op} };
                                                                 newOp->expr =
+                                                                newOp->expr =
                                                                         referenceToRvalueConversion( newOp->expr, newOp->cost );
                                                                 makeFunctionCandidates( newOp, function, argExpansions, found );
 …
+                        }
                         // Implement SFINAE; resolution errors are only errors if there aren't any non-error
+                        // Implement SFINAE; resolution errors are only errors if there aren't any non-error
                         // candidates
                         if ( found.empty() && ! errors.isEmpty() ) { throw errors; }
 …
                                         auto pointer = appExpr->func->result.strict_as< ast::PointerType >();
                                         auto function = pointer->base.strict_as< ast::FunctionType >();
                                         std::cerr << "Case +++++++++++++ " << appExpr->func << std::endl;
                                         std::cerr << "parameters are:" << std::endl;
 …
                         promoteCvtCost( winners );
                         // function may return a struct/union value, in which case we need to add candidates
                         // for implicit conversions to each of the anonymous members, which must happen after
+                        // function may return a struct/union value, in which case we need to add candidates
+                        // for implicit conversions to each of the anonymous members, which must happen after
                         // `findMinCost`, since anon conversions are never the cheapest
                         for ( const CandidateRef & c : winners ) {
 …
                         if ( candidates.empty() && targetType && ! targetType->isVoid() ) {
                                 // If resolution is unsuccessful with a target type, try again without, since it
+                                // If resolution is unsuccessful with a target type, try again without, since it
                                 // will sometimes succeed when it wouldn't with a target type binding.
                                 // For example:
 …
                 /// true if expression is an lvalue
                 static bool isLvalue( const ast::Expr * x ) {
                         return x->result && ( x->result->is_lvalue() || x->result.as< ast::ReferenceType >() );
+                        return x->result && ( x->get_lvalue() || x->result.as< ast::ReferenceType >() );
+                }
 …
                         CandidateFinder finder{ symtab, tenv };
                         finder.find( addressExpr->arg );
+                        if( finder.candidates.empty() ) return;
+                        reason.code = NoMatch;
                         for ( CandidateRef & r : finder.candidates ) {
                                 if ( ! isLvalue( r->expr ) ) continue;
 …
                         assert( toType );
                         toType = resolveTypeof( toType, symtab );
                         toType = SymTab::validateType( castExpr->location, toType, symtab );
+                        // toType = SymTab::validateType( castExpr->location, toType, symtab );
                         toType = adjustExprType( toType, tenv, symtab );
                         CandidateFinder finder{ symtab, tenv, toType };
                         finder.find( castExpr->arg, ResolvMode::withAdjustment() );
+                        if( !finder.candidates.empty() ) reason.code = NoMatch;
                         CandidateList matches;
 …
                                 cand->env.extractOpenVars( open );
                                 // It is possible that a cast can throw away some values in a multiply-valued
                                 // expression, e.g. cast-to-void, one value to zero. Figure out the prefix of the
                                 // subexpression results that are cast directly. The candidate is invalid if it
+                                // It is possible that a cast can throw away some values in a multiply-valued
+                                // expression, e.g. cast-to-void, one value to zero. Figure out the prefix of the
+                                // subexpression results that are cast directly. The candidate is invalid if it
                                 // has fewer results than there are types to cast to.
                                 int discardedValues = cand->expr->result->size() - toType->size();
 …
                                 // unification run for side-effects
                                 unify( toType, cand->expr->result, cand->env, need, have, open, symtab );
+                                Cost thisCost = castCost( cand->expr->result, toType, symtab, cand->env );
+                                Cost thisCost =
+                                        (castExpr->isGenerated == ast::GeneratedFlag::GeneratedCast)
+                            ? conversionCost( cand->expr->result, toType, cand->expr->get_lvalue(), symtab, cand->env )
+                            : castCost( cand->expr->result, toType, cand->expr->get_lvalue(), symtab, cand->env );
                                 PRINT(
                                         std::cerr << "working on cast with result: " << toType << std::endl;
 …
                                         // count one safe conversion for each value that is thrown away
                                         thisCost.incSafe( discardedValues );
                                         CandidateRef newCand = std::make_shared<Candidate>(
                                                 restructureCast( cand->expr, toType, castExpr->isGenerated ),
                                                 copy( cand->env ), move( open ), move( need ), cand->cost,
+                                        CandidateRef newCand = std::make_shared<Candidate>(
+                                                restructureCast( cand->expr, toType, castExpr->isGenerated ),
+                                                copy( cand->env ), move( open ), move( need ), cand->cost,
                                                 cand->cost + thisCost );
                                         inferParameters( newCand, matches );
 …
                         finder.find( castExpr->arg, ResolvMode::withoutPrune() );
                         for ( CandidateRef & r : finder.candidates ) {
                                 addCandidate(
                                         *r,
+                                addCandidate(
+                                        *r,
                                         new ast::VirtualCastExpr{ castExpr->location, r->expr, castExpr->result } );
+                        }
+                }
+                void postvisit( const ast::KeywordCastExpr * castExpr ) { // Finds candidates for a keyword cast: interpretations of a rewritten argument whose struct type matches the cast's target
+                        const auto & loc = castExpr->location;
+                        assertf( castExpr->result, "Cast target should have been set in Validate." );
+                        auto ref = castExpr->result.strict_as<ast::ReferenceType>();
+                        auto inst = ref->base.strict_as<ast::StructInstType>(); // the cast result is expected to be a reference to a struct instance
+                        auto target = inst->base.get(); // what each candidate's struct base is compared against below
+                        CandidateFinder finder{ symtab, tenv };
+                        auto pick_alternatives = [target, this](CandidateList & found, bool expect_ref) { // moves each entry of `found` whose (optionally dereferenced) result type is a struct instance of `target` into `candidates`
+                                for(auto & cand : found) {
+                                        const ast::Type * expr = cand->expr->result.get(); // note: despite the name, this is the candidate's result *type*
+                                        if(expect_ref) {
+                                                auto res = dynamic_cast<const ast::ReferenceType*>(expr);
+                                                if(!res) { continue; } // a reference was required but the result is not one: skip this candidate
+                                                expr = res->base.get(); // look through the reference
+                                        }
+                                        if(auto insttype = dynamic_cast<const ast::TypeInstType*>(expr)) {
+                                                auto td = cand->env.lookup(*insttype);
+                                                if(!td) { continue; } // no entry for this type variable in the candidate's environment: skip
+                                                expr = td->bound.get(); // continue with whatever the type variable is bound to
+                                        }
+                                        if(auto base = dynamic_cast<const ast::StructInstType*>(expr)) {
+                                                if(base->base == target) {
+                                                        candidates.push_back( std::move(cand) ); // the entry in `found` is left moved-from
+                                                        reason.code = NoReason;
+                                                }
+                                        }
+                                }
+                        };
+                        try {
+                                // Attempt 1 : turn (thread&)X into ($thread&)X.__thrd
+                                // NOTE(review): no clone happens here; the unique_ptr only owns the temporary member expression
+                                std::unique_ptr<const ast::Expr> tech1 { new ast::UntypedMemberExpr(loc, new ast::NameExpr(loc, castExpr->concrete_target.field), castExpr->arg) };
+                                // don't prune here, since it's guaranteed all alternatives will have the same type
+                                finder.find( tech1.get(), ResolvMode::withoutPrune() );
+                                pick_alternatives(finder.candidates, false);
+                                return; // NOTE(review): returns even when no candidate matched; the fallback below runs only if the try block threw SemanticErrorException
+                        } catch(SemanticErrorException & ) {} // the error is swallowed so that control reaches the fallback
+                        // Fallback : turn (thread&)X into ($thread&)get_thread(X)
+                        std::unique_ptr<const ast::Expr> fallback { ast::UntypedExpr::createDeref(loc,  new ast::UntypedExpr(loc, new ast::NameExpr(loc, castExpr->concrete_target.getter), { castExpr->arg })) };
+                        // don't prune here, since it's guaranteed all alternatives will have the same type
+                        finder.find( fallback.get(), ResolvMode::withoutPrune() );
+                        pick_alternatives(finder.candidates, true); // here only candidates with a reference result are considered
+                        // Whatever happens here, we have no more fallbacks
+                }
 …
                         aggFinder.find( memberExpr->aggregate, ResolvMode::withAdjustment() );
                         for ( CandidateRef & agg : aggFinder.candidates ) {
                                 // it's okay for the aggregate expression to have reference type -- cast it to the
+                                // it's okay for the aggregate expression to have reference type -- cast it to the
                                 // base type to treat the aggregate as the referenced value
                                 Cost addedCost = Cost::zero;
 …
                                 // find member of the given type
                                 if ( auto structInst = agg->expr->result.as< ast::StructInstType >() ) {
                                         addAggMembers(
+                                        addAggMembers(
                                                 structInst, agg->expr, *agg, addedCost, getMemberName( memberExpr ) );
                                 } else if ( auto unionInst = agg->expr->result.as< ast::UnionInstType >() ) {
                                         addAggMembers(
+                                        addAggMembers(
                                                 unionInst, agg->expr, *agg, addedCost, getMemberName( memberExpr ) );
                                 } else if ( auto tupleType = agg->expr->result.as< ast::TupleType >() ) {
 …
                 void postvisit( const ast::NameExpr * nameExpr ) {
+                        std::vector< ast::SymbolTable::IdData > declList = symtab.lookupId( nameExpr->name );
+                        std::vector< ast::SymbolTable::IdData > declList;
+                        if (!selfFinder.otypeKeys.empty()) {
+                                auto kind = ast::SymbolTable::getSpecialFunctionKind(nameExpr->name);
+                                assertf(kind != ast::SymbolTable::SpecialFunctionKind::NUMBER_OF_KINDS, "special lookup with non-special target: %s", nameExpr->name.c_str());
+                                for (auto & otypeKey: selfFinder.otypeKeys) {
+                                        auto result = symtab.specialLookupId(kind, otypeKey);
+                                        declList.insert(declList.end(), std::make_move_iterator(result.begin()), std::make_move_iterator(result.end()));
+                                }
+                        }
+                        else {
+                                declList = symtab.lookupId( nameExpr->name );
+                        }
                         PRINT( std::cerr << "nameExpr is " << nameExpr->name << std::endl; )
+                        if( declList.empty() ) return;
+                        reason.code = NoMatch;
                         for ( auto & data : declList ) {
                                 Cost cost = Cost::zero;
 …
                                 CandidateRef newCand = std::make_shared<Candidate>(
                                         newExpr, copy( tenv ), ast::OpenVarSet{}, ast::AssertionSet{}, Cost::zero,
+                                        newExpr, copy( tenv ), ast::OpenVarSet{}, ast::AssertionSet{}, Cost::zero,
                                         cost );
                                 PRINT(
 …
                                         std::cerr << std::endl;
+                                )
                                 newCand->expr = ast::mutate_field(
                                         newCand->expr.get(), &ast::Expr::result,
+                                newCand->expr = ast::mutate_field(
+                                        newCand->expr.get(), &ast::Expr::result,
                                         renameTyVars( newCand->expr->result ) );
                                 // add anonymous member interpretations whenever an aggregate value type is seen
+                                // add anonymous member interpretations whenever an aggregate value type is seen
                                 // as a name expression
                                 addAnonConversions( newCand );
 …
                         // not sufficient to just pass `variableExpr` here, type might have changed since
                         // creation
                         addCandidate(
+                        addCandidate(
                                 new ast::VariableExpr{ variableExpr->location, variableExpr->var }, tenv );
+                }
 …
                 void postvisit( const ast::SizeofExpr * sizeofExpr ) {
                         if ( sizeofExpr->type ) {
                                 addCandidate(
                                         new ast::SizeofExpr{
                                                 sizeofExpr->location, resolveTypeof( sizeofExpr->type, symtab ) },
+                                addCandidate(
+                                        new ast::SizeofExpr{
+                                                sizeofExpr->location, resolveTypeof( sizeofExpr->type, symtab ) },
                                         tenv );
                         } else {
 …
                                 CandidateList winners = findMinCost( finder.candidates );
                                 if ( winners.size() != 1 ) {
                                         SemanticError(
+                                        SemanticError(
                                                 sizeofExpr->expr.get(), "Ambiguous expression in sizeof operand: " );
+                                }
 …
                 void postvisit( const ast::AlignofExpr * alignofExpr ) {
                         if ( alignofExpr->type ) {
                                 addCandidate(
                                         new ast::AlignofExpr{
                                                 alignofExpr->location, resolveTypeof( alignofExpr->type, symtab ) },
+                                addCandidate(
+                                        new ast::AlignofExpr{
+                                                alignofExpr->location, resolveTypeof( alignofExpr->type, symtab ) },
                                         tenv );
                         } else {
 …
                                 CandidateList winners = findMinCost( finder.candidates );
                                 if ( winners.size() != 1 ) {
                                         SemanticError(
+                                        SemanticError(
                                                 alignofExpr->expr.get(), "Ambiguous expression in alignof operand: " );
+                                }
 …
                                 choice->expr = referenceToRvalueConversion( choice->expr, choice->cost );
                                 choice->cost = Cost::zero;
                                 addCandidate(
+                                addCandidate(
                                         *choice, new ast::AlignofExpr{ alignofExpr->location, choice->expr } );
+                        }
 …
                 void postvisit( const ast::UntypedOffsetofExpr * offsetofExpr ) {
                         const ast::ReferenceToType * aggInst;
+                        const ast::BaseInstType * aggInst;
                         if (( aggInst = offsetofExpr->type.as< ast::StructInstType >() )) ;
                         else if (( aggInst = offsetofExpr->type.as< ast::UnionInstType >() )) ;
 …
                         for ( const ast::Decl * member : aggInst->lookup( offsetofExpr->member ) ) {
                                 auto dwt = strict_dynamic_cast< const ast::DeclWithType * >( member );
                                 addCandidate(
+                                addCandidate(
                                         new ast::OffsetofExpr{ offsetofExpr->location, aggInst, dwt }, tenv );
+                        }
 …
                         finder2.find( logicalExpr->arg2, ResolvMode::withAdjustment() );
                         if ( finder2.candidates.empty() ) return;
+                        reason.code = NoMatch;
                         for ( const CandidateRef & r1 : finder1.candidates ) {
 …
                                         addCandidate(
                                                 new ast::LogicalExpr{
+                                                new ast::LogicalExpr{
                                                         logicalExpr->location, r1->expr, r2->expr, logicalExpr->isAnd },
                                                 move( env ), move( open ), move( need ), r1->cost + r2->cost );
 …
                         finder3.find( conditionalExpr->arg3, ResolvMode::withAdjustment() );
                         if ( finder3.candidates.empty() ) return;
+                        reason.code = NoMatch;
                         for ( const CandidateRef & r1 : finder1.candidates ) {
 …
                                                 ast::AssertionSet have;
                                                 // unify true and false results, then infer parameters to produce new
+                                                // unify true and false results, then infer parameters to produce new
                                                 // candidates
                                                 ast::ptr< ast::Type > common;
                                                 if (
                                                         unify(
                                                                 r2->expr->result, r3->expr->result, env, need, have, open, symtab,
                                                                 common )
+                                                if (
+                                                        unify(
+                                                                r2->expr->result, r3->expr->result, env, need, have, open, symtab,
+                                                                common )
                                                 ) {
                                                         // generate typed expression
                                                         ast::ConditionalExpr * newExpr = new ast::ConditionalExpr{
+                                                        ast::ConditionalExpr * newExpr = new ast::ConditionalExpr{
                                                                 conditionalExpr->location, r1->expr, r2->expr, r3->expr };
                                                         newExpr->result = common ? common : r2->expr->result;
                                                         // convert both options to result type
                                                         Cost cost = r1->cost + r2->cost + r3->cost;
                                                         newExpr->arg2 = computeExpressionConversionCost(
+                                                        newExpr->arg2 = computeExpressionConversionCost(
                                                                 newExpr->arg2, newExpr->result, symtab, env, cost );
                                                         newExpr->arg3 = computeExpressionConversionCost(
 …
                         ast::TypeEnvironment env{ tenv };
                         ast::ptr< ast::Expr > arg1 = resolveInVoidContext( commaExpr->arg1, symtab, env );
                         CandidateFinder finder2{ symtab, env };
                         finder2.find( commaExpr->arg2, ResolvMode::withAdjustment() );
 …
                         finder2.find( rangeExpr->high, ResolvMode::withAdjustment() );
                         if ( finder2.candidates.empty() ) return;
+                        reason.code = NoMatch;
                         for ( const CandidateRef & r1 : finder1.candidates ) {
 …
                                         ast::ptr< ast::Type > common;
                                         if (
                                                 unify(
                                                         r1->expr->result, r2->expr->result, env, need, have, open, symtab,
                                                         common )
+                                        if (
+                                                unify(
+                                                        r1->expr->result, r2->expr->result, env, need, have, open, symtab,
+                                                        common )
                                         ) {
                                                 // generate new expression
                                                 ast::RangeExpr * newExpr =
+                                                ast::RangeExpr * newExpr =
                                                         new ast::RangeExpr{ rangeExpr->location, r1->expr, r2->expr };
                                                 newExpr->result = common ? common : r1->expr->result;
                                                 // add candidate
                                                 CandidateRef newCand = std::make_shared<Candidate>(
                                                         newExpr, move( env ), move( open ), move( need ),
+                                                        newExpr, move( env ), move( open ), move( need ),
                                                         r1->cost + r2->cost );
                                                 inferParameters( newCand, candidates );
 …
                 void postvisit( const ast::UntypedTupleExpr * tupleExpr ) {
                         std::vector< CandidateFinder > subCandidates =
+                        std::vector< CandidateFinder > subCandidates =
                                 selfFinder.findSubExprs( tupleExpr->exprs );
                         std::vector< CandidateList > possibilities;
 …
                                 addCandidate(
                                         new ast::TupleExpr{ tupleExpr->location, move( exprs ) },
+                                        new ast::TupleExpr{ tupleExpr->location, move( exprs ) },
                                         move( env ), move( open ), move( need ), sumCost( subs ) );
+                        }
 …
                                 // calculate target type
                                 const ast::Type * toType = resolveTypeof( initAlt.type, symtab );
                                 toType = SymTab::validateType( initExpr->location, toType, symtab );
+                                // toType = SymTab::validateType( initExpr->location, toType, symtab );
                                 toType = adjustExprType( toType, tenv, symtab );
                                 // The call to find must occur inside this loop, otherwise polymorphic return
                                 // types are not bound to the initialization type, since return type variables are
                                 // only open for the duration of resolving the UntypedExpr.
+                                // The call to find must occur inside this loop, otherwise polymorphic return
+                                // types are not bound to the initialization type, since return type variables are
+                                // only open for the duration of resolving the UntypedExpr.
                                 CandidateFinder finder{ symtab, tenv, toType };
                                 finder.find( initExpr->expr, ResolvMode::withAdjustment() );
                                 for ( CandidateRef & cand : finder.candidates ) {
+                                        if(reason.code == NotFound) reason.code = NoMatch;
                                         ast::TypeEnvironment env{ cand->env };
                                         ast::AssertionSet need( cand->need.begin(), cand->need.end() ), have;
 …
+                                        )
                                         // It is possible that a cast can throw away some values in a multiply-valued
                                         // expression, e.g. cast-to-void, one value to zero. Figure out the prefix of
                                         // the subexpression results that are cast directly. The candidate is invalid
+                                        // It is possible that a cast can throw away some values in a multiply-valued
+                                        // expression, e.g. cast-to-void, one value to zero. Figure out the prefix of
+                                        // the subexpression results that are cast directly. The candidate is invalid
                                         // if it has fewer results than there are types to cast to.
                                         int discardedValues = cand->expr->result->size() - toType->size();
 …
                                         // unification run for side-effects
+                                        unify( toType, cand->expr->result, env, need, have, open, symtab );
+                                        Cost thisCost = castCost( cand->expr->result, toType, symtab, env );
+                                        bool canUnify = unify( toType, cand->expr->result, env, need, have, open, symtab );
+                                        (void) canUnify;
+                                        Cost thisCost = computeConversionCost( cand->expr->result, toType, cand->expr->get_lvalue(),
+                                                symtab, env );
+                                        PRINT(
+                                                Cost legacyCost = castCost( cand->expr->result, toType, cand->expr->get_lvalue(),
+                                                        symtab, env );
+                                                std::cerr << "Considering initialization:";
+                                                std::cerr << std::endl << "  FROM: " << cand->expr->result << std::endl;
+                                                std::cerr << std::endl << "  TO: "   << toType             << std::endl;
+                                                std::cerr << std::endl << "  Unification " << (canUnify ? "succeeded" : "failed");
+                                                std::cerr << std::endl << "  Legacy cost " << legacyCost;
+                                                std::cerr << std::endl << "  New cost " << thisCost;
+                                                std::cerr << std::endl;
+                                        )
                                         if ( thisCost != Cost::infinity ) {
                                                 // count one safe conversion for each value that is thrown away
                                                 thisCost.incSafe( discardedValues );
                                                 CandidateRef newCand = std::make_shared<Candidate>(
                                                         new ast::InitExpr{
                                                                 initExpr->location, restructureCast( cand->expr, toType ),
                                                                 initAlt.designation },
                                                         copy( cand->env ), move( open ), move( need ), cand->cost, thisCost );
+                                                CandidateRef newCand = std::make_shared<Candidate>(
+                                                        new ast::InitExpr{
+                                                                initExpr->location, restructureCast( cand->expr, toType ),
+                                                                initAlt.designation },
+                                                        move(env), move( open ), move( need ), cand->cost, thisCost );
                                                 inferParameters( newCand, matches );
+                                        }
+                                }
+                        }
 …
         };
+        /// Prunes a list of candidates down to those that have the minimum conversion cost for a given
+        // size_t Finder::traceId = Stats::Heap::new_stacktrace_id("Finder");
+        /// Prunes a list of candidates down to those that have the minimum conversion cost for a given
         /// return type. Skips ambiguous candidates.
+        CandidateList pruneCandidates( CandidateList & candidates ) {
+                struct PruneStruct {
+                        CandidateRef candidate;
+                        bool ambiguous;
+                        PruneStruct() = default;
+                        PruneStruct( const CandidateRef & c ) : candidate( c ), ambiguous( false ) {}
+                };
+                // find lowest-cost candidate for each type
+                std::unordered_map< std::string, PruneStruct > selected;
+                for ( CandidateRef & candidate : candidates ) {
+                        std::string mangleName;
+} // anonymous namespace
+bool CandidateFinder::pruneCandidates( CandidateList & candidates, CandidateList & out, std::vector<std::string> & errors ) {
+        struct PruneStruct {
+                CandidateRef candidate;
+                bool ambiguous;
+                PruneStruct() = default;
+                PruneStruct( const CandidateRef & c ) : candidate( c ), ambiguous( false ) {}
+        };
+        // find lowest-cost candidate for each type
+        std::unordered_map< std::string, PruneStruct > selected;
+        // attempt to skip satisfyAssertions on more expensive alternatives if better options have been found
+        std::sort(candidates.begin(), candidates.end(), [](const CandidateRef & x, const CandidateRef & y){return x->cost < y->cost;});
+        for ( CandidateRef & candidate : candidates ) {
+                std::string mangleName;
+                {
+                        ast::ptr< ast::Type > newType = candidate->expr->result;
+                        assertf(candidate->expr->result, "Result of expression %p for candidate is null", candidate->expr.get());
+                        candidate->env.apply( newType );
+                        mangleName = Mangle::mangle( newType );
+                }
+                auto found = selected.find( mangleName );
+                if (found != selected.end() && found->second.candidate->cost < candidate->cost) {
+                        PRINT(
+                                std::cerr << "cost " << candidate->cost << " loses to "
+                                        << found->second.candidate->cost << std::endl;
+                        )
+                        continue;
+                }
+                // xxx - when do satisfyAssertions produce more than 1 result?
+                // this should only happen when initial result type contains
+                // unbound type parameters, then it should never be pruned by
+                // the previous step, since renameTyVars guarantees the mangled name
+                // is unique.
+                CandidateList satisfied;
+                bool needRecomputeKey = false;
+                if (candidate->need.empty()) {
+                        satisfied.emplace_back(candidate);
+                }
+                else {
+                        satisfyAssertions(candidate, localSyms, satisfied, errors);
+                        needRecomputeKey = true;
+                }
+                for (auto & newCand : satisfied) {
+                        // recomputes type key, if satisfyAssertions changed it
+                        if (needRecomputeKey)
+                        {
+                                ast::ptr< ast::Type > newType = candidate->expr->result;
+                                candidate->env.apply( newType );
+                                ast::ptr< ast::Type > newType = newCand->expr->result;
+                                assertf(newCand->expr->result, "Result of expression %p for candidate is null", newCand->expr.get());
+                                newCand->env.apply( newType );
                                 mangleName = Mangle::mangle( newType );
+                        }
                         auto found = selected.find( mangleName );
                         if ( found != selected.end() ) {
                                 if ( candidate->cost < found->second.candidate->cost ) {
+                                if ( newCand->cost < found->second.candidate->cost ) {
                                         PRINT(
                                                 std::cerr << "cost " << candidate->cost << " beats "
+                                                std::cerr << "cost " << newCand->cost << " beats "
                                                         << found->second.candidate->cost << std::endl;
+                                        )
                                         found->second = PruneStruct{ candidate };
                                 } else if ( candidate->cost == found->second.candidate->cost ) {
                                         // if one of the candidates contains a deleted identifier, can pick the other,
                                         // since deleted expressions should not be ambiguous if there is another option
+                                        found->second = PruneStruct{ newCand };
+                                } else if ( newCand->cost == found->second.candidate->cost ) {
+                                        // if one of the candidates contains a deleted identifier, can pick the other,
+                                        // since deleted expressions should not be ambiguous if there is another option
                                         // that is at least as good
                                         if ( findDeletedExpr( candidate->expr ) ) {
+                                        if ( findDeletedExpr( newCand->expr ) ) {
                                                 // do nothing
                                                 PRINT( std::cerr << "candidate is deleted" << std::endl; )
                                         } else if ( findDeletedExpr( found->second.candidate->expr ) ) {
                                                 PRINT( std::cerr << "current is deleted" << std::endl; )
                                                 found->second = PruneStruct{ candidate };
+                                                found->second = PruneStruct{ newCand };
                                         } else {
                                                 PRINT( std::cerr << "marking ambiguous" << std::endl; )
                                                 found->second.ambiguous = true;
+                                        }
+                                } else {
+                                } else {
+                                        // xxx - can satisfyAssertions increase the cost?
                                         PRINT(
                                                 std::cerr << "cost " << candidate->cost << " loses to "
+                                                std::cerr << "cost " << newCand->cost << " loses to "
                                                         << found->second.candidate->cost << std::endl;
+                                        )
+                                        )
+                                }
                         } else {
+                                selected.emplace_hint( found, mangleName, candidate );
+                        }
+                }
+                // report unambiguous min-cost candidates
+                CandidateList out;
+                for ( auto & target : selected ) {
+                        if ( target.second.ambiguous ) continue;
+                        CandidateRef cand = target.second.candidate;
+                        ast::ptr< ast::Type > newResult = cand->expr->result;
+                        cand->env.applyFree( newResult );
+                        cand->expr = ast::mutate_field(
+                                cand->expr.get(), &ast::Expr::result, move( newResult ) );
+                        out.emplace_back( cand );
+                }
+                return out;
+                                selected.emplace_hint( found, mangleName, newCand );
+                        }
+                }
+        }
+} // anonymous namespace
+        // report unambiguous min-cost candidates
+        // CandidateList out;
+        for ( auto & target : selected ) {
+                if ( target.second.ambiguous ) continue;
+                CandidateRef cand = target.second.candidate;
+                ast::ptr< ast::Type > newResult = cand->expr->result;
+                cand->env.applyFree( newResult );
+                cand->expr = ast::mutate_field(
+                        cand->expr.get(), &ast::Expr::result, move( newResult ) );
+                out.emplace_back( cand );
+        }
+        // if everything is lost in satisfyAssertions, report the error
+        return !selected.empty();
+}
 void CandidateFinder::find( const ast::Expr * expr, ResolvMode mode ) {
 …
         if ( mode.failFast && candidates.empty() ) {
+                SemanticError( expr, "No reasonable alternatives for expression " );
+                switch(finder.core.reason.code) {
+                case Finder::NotFound:
+                        { SemanticError( expr, "No alternatives for expression " ); break; }
+                case Finder::NoMatch:
+                        { SemanticError( expr, "Invalid application of existing declaration(s) in expression " ); break; }
+                case Finder::ArgsToFew:
+                case Finder::ArgsToMany:
+                case Finder::RetsToFew:
+                case Finder::RetsToMany:
+                case Finder::NoReason:
+                default:
+                        { SemanticError( expr->location, "No reasonable alternatives for expression : reasons unkown" ); }
+                }
+        }
+        /*
         if ( mode.satisfyAssns || mode.prune ) {
                 // trim candidates to just those where the assertions are satisfiable
 …
                 std::vector< std::string > errors;
                 for ( CandidateRef & candidate : candidates ) {
                         satisfyAssertions( candidate, symtab, satisfied, errors );
+                        satisfyAssertions( candidate, localSyms, satisfied, errors );
+                }
 …
                 candidates = move( satisfied );
+        }
+        */
         if ( mode.prune ) {
 …
+                )
+                CandidateList pruned = pruneCandidates( candidates );
+                CandidateList pruned;
+                std::vector<std::string> errors;
+                bool found = pruneCandidates( candidates, pruned, errors );
                 if ( mode.failFast && pruned.empty() ) {
                         std::ostringstream stream;
+                        CandidateList winners = findMinCost( candidates );
+                        stream << "Cannot choose between " << winners.size() << " alternatives for "
+                                "expression\n";
+                        ast::print( stream, expr );
+                        stream << " Alternatives are:\n";
+                        print( stream, winners, 1 );
+                        SemanticError( expr->location, stream.str() );
+                        if (found) {
+                                CandidateList winners = findMinCost( candidates );
+                                stream << "Cannot choose between " << winners.size() << " alternatives for "
+                                        "expression\n";
+                                ast::print( stream, expr );
+                                stream << " Alternatives are:\n";
+                                print( stream, winners, 1 );
+                                SemanticError( expr->location, stream.str() );
+                        }
+                        else {
+                                stream << "No alternatives with satisfiable assertions for " << expr << "\n";
+                                for ( const auto& err : errors ) {
+                                        stream << err;
+                                }
+                                SemanticError( expr->location, stream.str() );
+                        }
+                }
 …
+                )
                 PRINT(
                         std::cerr << "there are " << candidates.size() << " alternatives after elimination"
+                        std::cerr << "there are " << candidates.size() << " alternatives after elimination"
                                 << std::endl;
+                )
+        }
         // adjust types after pruning so that types substituted by pruneAlternatives are correctly
+        // adjust types after pruning so that types substituted by pruneAlternatives are correctly
         // adjusted
         if ( mode.adjust ) {
                 for ( CandidateRef & r : candidates ) {
                         r->expr = ast::mutate_field(
                                 r->expr.get(), &ast::Expr::result,
                                 adjustExprType( r->expr->result, r->env, symtab ) );
+                        r->expr = ast::mutate_field(
+                                r->expr.get(), &ast::Expr::result,
+                                adjustExprType( r->expr->result, r->env, localSyms ) );
+                }
+        }
 …
+}
 std::vector< CandidateFinder > CandidateFinder::findSubExprs(
         const std::vector< ast::ptr< ast::Expr > > & xs
+std::vector< CandidateFinder > CandidateFinder::findSubExprs(
+        const std::vector< ast::ptr< ast::Expr > > & xs
 ) {
         std::vector< CandidateFinder > out;
         for ( const auto & x : xs ) {
                 out.emplace_back( symtab, env );
+                out.emplace_back( localSyms, env );
                 out.back().find( x, ResolvMode::withAdjustment() );
                 PRINT(
                         std::cerr << "findSubExprs" << std::endl;

src/ResolvExpr/CandidateFinder.hpp

-              r3c64c668
+              r58fe85a
 // Author           : Aaron B. Moss
 // Created On       : Wed Jun 5 14:30:00 2019
 // Last Modified By : Aaron B. Moss
 // Last Modified On : Wed Jun 5 14:30:00 2019
 // Update Count     : 1
+// Last Modified By : Andrew Beach
+// Last Modified On : Tue Oct  1  9:51:00 2019
+// Update Count     : 2
 //
 …
 struct CandidateFinder {
         CandidateList candidates;          ///< List of candidate resolutions
         const ast::SymbolTable & symtab;   ///< Symbol table to lookup candidates
+        const ast::SymbolTable & localSyms;   ///< Symbol table to lookup candidates
         const ast::TypeEnvironment & env;  ///< Substitutions performed in this resolution
         ast::ptr< ast::Type > targetType;  ///< Target type for resolution
+        std::set< std::string > otypeKeys;  /// different type may map to same key
         CandidateFinder(
                 const ast::SymbolTable & symtab, const ast::TypeEnvironment & env,
+        CandidateFinder(
+                const ast::SymbolTable & syms, const ast::TypeEnvironment & env,
                 const ast::Type * tt = nullptr )
         : candidates(), symtab( symtab ), env( env ), targetType( tt ) {}
+        : candidates(), localSyms( syms ), env( env ), targetType( tt ) {}
         /// Fill candidates with feasible resolutions for `expr`
         void find( const ast::Expr * expr, ResolvMode mode = {} );
+        bool pruneCandidates( CandidateList & candidates, CandidateList & out, std::vector<std::string> & errors );
         /// Runs new candidate finder on each element in xs, returning the list of finders
 …
         iterator begin() { return candidates.begin(); }
         const_iterator begin() const { return candidates.begin(); }
         iterator end() { return candidates.end(); }
         const_iterator end() const { return candidates.end(); }
 …
 /// Computes conversion cost between two types
 Cost computeConversionCost(
         const ast::Type * argType, const ast::Type * paramType, const ast::SymbolTable & symtab,
         const ast::TypeEnvironment & env );
+Cost computeConversionCost(
+        const ast::Type * argType, const ast::Type * paramType, bool argIsLvalue,
+        const ast::SymbolTable & symtab, const ast::TypeEnvironment & env );
 } // namespace ResolvExpr

src/ResolvExpr/CastCost.cc

-              r3c64c668
+              r58fe85a
 // Created On       : Sun May 17 06:57:43 2015
 // Last Modified By : Andrew Beach
 // Last Modified On : Thu Aug  8 16:12:00 2019
 // Update Count     : 8
+// Last Modified On : Tue Oct  4 15:00:00 2019
+// Update Count     : 9
 //
 …
                 CastCost_new(
                         const ast::Type * dst, const ast::SymbolTable & symtab,
+                        const ast::Type * dst, bool srcIsLvalue, const ast::SymbolTable & symtab,
                         const ast::TypeEnvironment & env, CostCalculation costFunc )
                 : ConversionCost_new( dst, symtab, env, costFunc ) {}
+                : ConversionCost_new( dst, srcIsLvalue, symtab, env, costFunc ) {}
                 void postvisit( const ast::BasicType * basicType ) {
 …
                                 cost = Cost::unsafe;
                         } else {
                                 cost = conversionCost( basicType, dst, symtab, env );
+                                cost = conversionCost( basicType, dst, srcIsLvalue, symtab, env );
+                        }
+                }
 …
                                 } else {
                                         ast::TypeEnvironment newEnv{ env };
                                         if ( auto wParams = pointerType->base.as< ast::ParameterizedType >() ) {
+                                        if ( auto wParams = pointerType->base.as< ast::FunctionType >() ) {
                                                 newEnv.add( wParams->forall );
+                                        }
 …
+                }
         };
+        #warning For overload resolution between the two versions.
+        int localPtrsCastable(const ast::Type * t1, const ast::Type * t2,
+                        const ast::SymbolTable & symtab, const ast::TypeEnvironment & env ) {
+                return ptrsCastable( t1, t2, symtab, env );
+        }
+        Cost localCastCost(
+                const ast::Type * src, const ast::Type * dst, bool srcIsLvalue,
+                const ast::SymbolTable & symtab, const ast::TypeEnvironment & env
+        ) { return castCost( src, dst, srcIsLvalue, symtab, env ); }
 } // anonymous namespace
 Cost castCost(
         const ast::Type * src, const ast::Type * dst, const ast::SymbolTable & symtab,
         const ast::TypeEnvironment & env
+        const ast::Type * src, const ast::Type * dst, bool srcIsLvalue,
+        const ast::SymbolTable & symtab, const ast::TypeEnvironment & env
 ) {
         if ( auto typeInst = dynamic_cast< const ast::TypeInstType * >( dst ) ) {
                 if ( const ast::EqvClass * eqvClass = env.lookup( typeInst->name ) ) {
+                if ( const ast::EqvClass * eqvClass = env.lookup( *typeInst ) ) {
                         // check cast cost against bound type, if present
                         if ( eqvClass->bound ) {
                                 return castCost( src, eqvClass->bound, symtab, env );
+                                return castCost( src, eqvClass->bound, srcIsLvalue, symtab, env );
                         } else {
                                 return Cost::infinity;
 …
                         auto type = strict_dynamic_cast< const ast::TypeDecl * >( named );
                         if ( type->base ) {
                                 return castCost( src, type->base, symtab, env ) + Cost::safe;
+                                return castCost( src, type->base, srcIsLvalue, symtab, env ) + Cost::safe;
+                        }
+                }
 …
                 #warning cast on ptrsCastable artifact of having two functions, remove when port done
                 return convertToReferenceCost(
+                        src, refType, symtab, env,
+                        ( int (*)(
+                                const ast::Type *, const ast::Type *, const ast::SymbolTable &,
+                                const ast::TypeEnvironment & )
+                        ) ptrsCastable );
+                        src, refType, srcIsLvalue, symtab, env, localPtrsCastable );
         } else {
                 #warning cast on castCost artifact of having two functions, remove when port done
+                ast::Pass< CastCost_new > converter{
+                        dst, symtab, env,
+                        ( Cost (*)(
+                                const ast::Type *, const ast::Type *, const ast::SymbolTable &,
+                                const ast::TypeEnvironment & )
+                        ) castCost };
+                ast::Pass< CastCost_new > converter(
+                        dst, srcIsLvalue, symtab, env, localCastCost );
                 src->accept( converter );
                 return converter.pass.cost;
+                return converter.core.cost;
+        }
+}

src/ResolvExpr/CommonType.cc

-              r3c64c668
+              r58fe85a
                 const ast::OpenVarSet & open;
         public:
+                static size_t traceId;
                 ast::ptr< ast::Type > result;
 …
                         const ast::Type * base = oPtr->base;
                         if ( auto var = dynamic_cast< const ast::TypeInstType * >( base ) ) {
                                 auto entry = open.find( var->name );
+                                auto entry = open.find( *var );
                                 if ( entry != open.end() ) {
                                         ast::AssertionSet need, have;
 …
         };
+        // size_t CommonType_new::traceId = Stats::Heap::new_stacktrace_id("CommonType_new");
         namespace {
                 ast::ptr< ast::Type > handleReference(
 …
                         ast::ptr< ast::Type > result;
                         const ast::ReferenceType * ref1 = type1.as< ast::ReferenceType >();
                         const ast::ReferenceType * ref2 = type1.as< ast::ReferenceType >();
+                        const ast::ReferenceType * ref2 = type2.as< ast::ReferenceType >();
                         if ( depth1 > depth2 ) {
 …
                 ast::Pass<CommonType_new> visitor{ type2, widen, symtab, env, open };
                 type1->accept( visitor );
                 ast::ptr< ast::Type > result = visitor.pass.result;
+                ast::ptr< ast::Type > result = visitor.core.result;
                 // handling for opaque type declarations (?)

src/ResolvExpr/ConversionCost.cc

-              r3c64c668
+              r58fe85a
 // Created On       : Sun May 17 07:06:19 2015
 // Last Modified By : Andrew Beach
 // Last Modified On : Mon Aug 12 10:21:00 2019
 // Update Count     : 27
+// Last Modified On : Wed Jul 29 16:11:00 2020
+// Update Count     : 28
 //
 …
         void ConversionCost::postvisit( const FunctionType * ) {}
-        void ConversionCost::postvisit( const StructInstType * inst ) {
-                if ( const StructInstType * destAsInst = dynamic_cast< const StructInstType * >( dest ) ) {
-                        if ( inst->name == destAsInst->name ) {
-                                cost = Cost::zero;
-                        } // if
-                } // if
+        }
-        void ConversionCost::postvisit( const UnionInstType * inst ) {
-                if ( const UnionInstType * destAsInst = dynamic_cast< const UnionInstType * >( dest ) ) {
-                        if ( inst->name == destAsInst->name ) {
-                                cost = Cost::zero;
-                        } // if
-                } // if
+        }
         void ConversionCost::postvisit( const EnumInstType * ) {
                 static Type::Qualifiers q;
 …
+        }
+static int localPtrsAssignable(const ast::Type * t1, const ast::Type * t2,
                 const ast::SymbolTable &, const ast::TypeEnvironment & env ) {
         return ptrsAssignable( t1, t2, env );
+}
+// TODO: This is used for overload resolution. It might be able to be dropped once the old system
+// is removed.
+static Cost localConversionCost(
         const ast::Type * src, const ast::Type * dst, const ast::SymbolTable & symtab,
         const ast::TypeEnvironment & env
 ) { return conversionCost( src, dst, symtab, env ); }
+namespace {
+        # warning For overload resolution between the two versions.
+        int localPtrsAssignable(const ast::Type * t1, const ast::Type * t2,
+                        const ast::SymbolTable &, const ast::TypeEnvironment & env ) {
+                return ptrsAssignable( t1, t2, env );
+        }
+        Cost localConversionCost(
+                const ast::Type * src, const ast::Type * dst, bool srcIsLvalue,
+                const ast::SymbolTable & symtab, const ast::TypeEnvironment & env
+        ) { return conversionCost( src, dst, srcIsLvalue, symtab, env ); }
+}
 Cost conversionCost(
         const ast::Type * src, const ast::Type * dst, const ast::SymbolTable & symtab,
         const ast::TypeEnvironment & env
+        const ast::Type * src, const ast::Type * dst, bool srcIsLvalue,
+        const ast::SymbolTable & symtab, const ast::TypeEnvironment & env
 ) {
         if ( const ast::TypeInstType * inst = dynamic_cast< const ast::TypeInstType * >( dst ) ) {
                 if ( const ast::EqvClass * eqv = env.lookup( inst->name ) ) {
+                if ( const ast::EqvClass * eqv = env.lookup( *inst ) ) {
                         if ( eqv->bound ) {
                                 return conversionCost(src, eqv->bound, symtab, env );
+                                return conversionCost(src, eqv->bound, srcIsLvalue, symtab, env );
                         } else {
                                 return Cost::infinity;
 …
                         assertf( type, "Unexpected typedef." );
                         if ( type->base ) {
                                 return conversionCost( src, type->base, symtab, env ) + Cost::safe;
+                                return conversionCost( src, type->base, srcIsLvalue, symtab, env ) + Cost::safe;
+                        }
+                }
 …
         } else if ( const ast::ReferenceType * refType =
                          dynamic_cast< const ast::ReferenceType * >( dst ) ) {
                 return convertToReferenceCost( src, refType, symtab, env, localPtrsAssignable );
+                return convertToReferenceCost( src, refType, srcIsLvalue, symtab, env, localPtrsAssignable );
         } else {
+                ast::Pass<ConversionCost_new> converter( dst, symtab, env, localConversionCost );
+                src->accept( converter );
+                return converter.pass.cost;
+        }
+}
+static Cost convertToReferenceCost( const ast::Type * src, const ast::Type * dst,
+                return ast::Pass<ConversionCost_new>::read( src, dst, srcIsLvalue, symtab, env, localConversionCost );
+        }
+}
+static Cost convertToReferenceCost( const ast::Type * src, const ast::Type * dst, bool srcIsLvalue,
                 int diff, const ast::SymbolTable & symtab, const ast::TypeEnvironment & env,
                 NumCostCalculation func ) {
+                PtrsCalculation func ) {
         if ( 0 < diff ) {
                 Cost cost = convertToReferenceCost(
                         strict_dynamic_cast< const ast::ReferenceType * >( src )->base,
                         dst, (diff - 1), symtab, env, func );
+                        strict_dynamic_cast< const ast::ReferenceType * >( src )->base, dst,
+                        srcIsLvalue, (diff - 1), symtab, env, func );
                 cost.incReference();
                 return cost;
 …
                 Cost cost = convertToReferenceCost(
                         src, strict_dynamic_cast< const ast::ReferenceType * >( dst )->base,
                         (diff + 1), symtab, env, func );
+                        srcIsLvalue, (diff + 1), symtab, env, func );
                 cost.incReference();
                 return cost;
 …
+                        }
                 } else {
+                        ast::Pass<ConversionCost_new> converter( dst, symtab, env, localConversionCost );
+                        src->accept( converter );
+                        return converter.pass.cost;
+                        return ast::Pass<ConversionCost_new>::read( src, dst, srcIsLvalue, symtab, env, localConversionCost );
+                }
         } else {
 …
                 assert( dstAsRef );
                 if ( typesCompatibleIgnoreQualifiers( src, dstAsRef->base, symtab, env ) ) {
                         if ( src->is_lvalue() ) {
+                        if ( srcIsLvalue ) {
                                 if ( src->qualifiers == dstAsRef->base->qualifiers ) {
                                         return Cost::reference;
 …
 Cost convertToReferenceCost( const ast::Type * src, const ast::ReferenceType * dst,
             const ast::SymbolTable & symtab, const ast::TypeEnvironment & env,
                 NumCostCalculation func ) {
+                bool srcIsLvalue, const ast::SymbolTable & symtab, const ast::TypeEnvironment & env,
+                PtrsCalculation func ) {
         int sdepth = src->referenceDepth(), ddepth = dst->referenceDepth();
         return convertToReferenceCost( src, dst, sdepth - ddepth, symtab, env, func );
+        return convertToReferenceCost( src, dst, srcIsLvalue, sdepth - ddepth, symtab, env, func );
+}
 …
         assert( nullptr == dynamic_cast< const ast::ReferenceType * >( dst ) );
         cost = costCalc( refType->base, dst, symtab, env );
+        cost = costCalc( refType->base, dst, srcIsLvalue, symtab, env );
         if ( refType->base->qualifiers == dst->qualifiers ) {
                 cost.incReference();
 …
+}
-void ConversionCost_new::postvisit( const ast::StructInstType * structInstType ) {
-        if ( const ast::StructInstType * dstAsInst =
-                        dynamic_cast< const ast::StructInstType * >( dst ) ) {
-                if ( structInstType->name == dstAsInst->name ) {
-                        cost = Cost::zero;
+                }
+        }
+}
-void ConversionCost_new::postvisit( const ast::UnionInstType * unionInstType ) {
-        if ( const ast::UnionInstType * dstAsInst =
-                        dynamic_cast< const ast::UnionInstType * >( dst ) ) {
-                if ( unionInstType->name == dstAsInst->name ) {
-                        cost = Cost::zero;
+                }
+        }
+}
 void ConversionCost_new::postvisit( const ast::EnumInstType * enumInstType ) {
         (void)enumInstType;
         static const ast::BasicType integer( ast::BasicType::SignedInt );
         cost = costCalc( &integer, dst, symtab, env );
+        static ast::ptr<ast::BasicType> integer = { new ast::BasicType( ast::BasicType::SignedInt ) };
+        cost = costCalc( integer, dst, srcIsLvalue, symtab, env );
         if ( cost < Cost::unsafe ) {
                 cost.incSafe();
 …
 void ConversionCost_new::postvisit( const ast::TypeInstType * typeInstType ) {
         if ( const ast::EqvClass * eqv = env.lookup( typeInstType->name ) ) {
                 cost = costCalc( eqv->bound, dst, symtab, env );
+        if ( const ast::EqvClass * eqv = env.lookup( *typeInstType ) ) {
+                cost = costCalc( eqv->bound, dst, srcIsLvalue, symtab, env );
         } else if ( const ast::TypeInstType * dstAsInst =
                         dynamic_cast< const ast::TypeInstType * >( dst ) ) {
                 if ( typeInstType->name == dstAsInst->name ) {
+                if ( *typeInstType == *dstAsInst ) {
                         cost = Cost::zero;
+                }
 …
                 assertf( type, "Unexpected typedef.");
                 if ( type->base ) {
                         cost = costCalc( type->base, dst, symtab, env ) + Cost::safe;
+                        cost = costCalc( type->base, dst, srcIsLvalue, symtab, env ) + Cost::safe;
+                }
+        }
 …
                 auto dstEnd = dstAsTuple->types.end();
                 while ( srcIt != srcEnd && dstIt != dstEnd ) {
                         Cost newCost = costCalc( * srcIt++, * dstIt++, symtab, env );
+                        Cost newCost = costCalc( * srcIt++, * dstIt++, srcIsLvalue, symtab, env );
                         if ( newCost == Cost::infinity ) {
                                 return;
 …
                         cost.incSign( signMatrix[ ast::BasicType::SignedInt ][ dstAsBasic->kind ] );
+                }
+        } else if ( dynamic_cast< const ast::PointerType * >( dst ) ) {
+                cost = Cost::zero;
+                // +1 for zero_t ->, +1 for disambiguation
+                cost.incSafe( maxIntCost + 2 );
+        }
+}
 …
                         cost.incSign( signMatrix[ ast::BasicType::SignedInt ][ dstAsBasic->kind ] );
+                }
+        } else if ( dynamic_cast< const ast::PointerType * >( dst ) ) {
+                cost = Cost::zero;
+                cost.incSafe( maxIntCost + 2 );
+        }
+}
+        }
+}
+// size_t ConversionCost_new::traceId = Stats::Heap::new_stacktrace_id("ConversionCost");
 } // namespace ResolvExpr

src/ResolvExpr/ConversionCost.h

-              r3c64c668
+              r58fe85a
 // Created On       : Sun May 17 09:37:28 2015
 // Last Modified By : Andrew Beach
 // Last Modified On : Thu Aug  8 16:13:00 2019
 // Update Count     : 6
+// Last Modified On : Wed Jul 29 16:12:00 2020
+// Update Count     : 7
 //
 …
                 void postvisit( const ReferenceType * refType );
                 void postvisit( const FunctionType * functionType );
-                void postvisit( const StructInstType * aggregateUseType );
-                void postvisit( const UnionInstType * aggregateUseType );
                 void postvisit( const EnumInstType * aggregateUseType );
                 void postvisit( const TraitInstType * aggregateUseType );
 …
 // Some function pointer types, differ in return type.
 using CostCalculation = std::function<Cost(const ast::Type *, const ast::Type *,
+using CostCalculation = std::function<Cost(const ast::Type *, const ast::Type *, bool,
         const ast::SymbolTable &, const ast::TypeEnvironment &)>;
 using NumCostCalculation = std::function<int(const ast::Type *, const ast::Type *,
+using PtrsCalculation = std::function<int(const ast::Type *, const ast::Type *,
         const ast::SymbolTable &, const ast::TypeEnvironment &)>;
 …
 protected:
         const ast::Type * dst;
+        bool srcIsLvalue;
         const ast::SymbolTable & symtab;
         const ast::TypeEnvironment & env;
         CostCalculation costCalc;
 public:
+        static size_t traceId;
         Cost cost;
+        Cost result() { return cost; }
         ConversionCost_new( const ast::Type * dst, const ast::SymbolTable & symtab,
+        ConversionCost_new( const ast::Type * dst, bool srcIsLvalue, const ast::SymbolTable & symtab,
                         const ast::TypeEnvironment & env, CostCalculation costCalc ) :
+                dst( dst ), symtab( symtab ), env( env ), costCalc( costCalc ), cost( Cost::infinity )
+                dst( dst ), srcIsLvalue( srcIsLvalue ), symtab( symtab ), env( env ),
+                costCalc( costCalc ), cost( Cost::infinity )
         {}
 …
         void postvisit( const ast::ReferenceType * refType );
         void postvisit( const ast::FunctionType * functionType );
-        void postvisit( const ast::StructInstType * structInstType );
-        void postvisit( const ast::UnionInstType * unionInstType );
         void postvisit( const ast::EnumInstType * enumInstType );
         void postvisit( const ast::TraitInstType * traitInstType );
 …
 Cost convertToReferenceCost( const ast::Type * src, const ast::ReferenceType * dest,
+        const ast::SymbolTable & indexer, const ast::TypeEnvironment & env, NumCostCalculation func );
+        bool srcIsLvalue, const ast::SymbolTable & indexer, const ast::TypeEnvironment & env,
+        PtrsCalculation func );
 } // namespace ResolvExpr

src/ResolvExpr/CurrentObject.cc

-              r3c64c668
+              r58fe85a
 #include <string>                      // for string, operator<<, allocator
+#include "AST/Copy.hpp"                // for shallowCopy
 #include "AST/Expr.hpp"                // for InitAlternative
 #include "AST/GenericSubstitution.hpp" // for genericSubstitution
 #include "AST/Init.hpp"                // for Designation
 #include "AST/Node.hpp"                // for readonly
+#include "AST/Print.hpp"                // for readonly
 #include "AST/Type.hpp"
 #include "Common/Indenter.h"           // for Indenter, operator<<
 …
         class SimpleIterator final : public MemberIterator {
                 CodeLocation location;
                 readonly< Type > type = nullptr;
+                const Type * type = nullptr;
         public:
                 SimpleIterator( const CodeLocation & loc, const Type * t ) : location( loc ), type( t ) {}
                 void setPosition(
                         std::deque< ptr< Expr > >::const_iterator begin,
+                void setPosition(
+                        std::deque< ptr< Expr > >::const_iterator begin,
                         std::deque< ptr< Expr > >::const_iterator end
                 ) override {
 …
         class ArrayIterator final : public MemberIterator {
                 CodeLocation location;
                 readonly< ArrayType > array = nullptr;
                 readonly< Type > base = nullptr;
+                const ArrayType * array = nullptr;
+                const Type * base = nullptr;
                 size_t index = 0;
                 size_t size = 0;
 …
                         auto res = eval(expr);
                         if ( ! res.second ) {
                                 SemanticError( location,
+                                SemanticError( location,
                                         toString("Array designator must be a constant expression: ", expr ) );
+                        }
 …
         public:
                 ArrayIterator( const CodeLocation & loc, const ArrayType * at )
+                ArrayIterator( const CodeLocation & loc, const ArrayType * at )
                 : location( loc ), array( at ), base( at->base ) {
                         PRINT( std::cerr << "Creating array iterator: " << at << std::endl; )
 …
                 void setPosition( const Expr * expr ) {
                         // need to permit integer-constant-expressions, including: integer constants,
                         // enumeration constants, character constants, sizeof expressions, alignof expressions,
+                        // need to permit integer-constant-expressions, including: integer constants,
+                        // enumeration constants, character constants, sizeof expressions, alignof expressions,
                         // cast expressions
                         if ( auto constExpr = dynamic_cast< const ConstantExpr * >( expr ) ) {
 …
                                         index = constExpr->intValue();
                                 } catch ( SemanticErrorException & ) {
                                         SemanticError( expr,
+                                        SemanticError( expr,
                                                 "Constant expression of non-integral type in array designator: " );
+                                }
                         } else if ( auto castExpr = dynamic_cast< const CastExpr * >( expr ) ) {
                                 setPosition( castExpr->arg );
                         } else if (
                                 dynamic_cast< const SizeofExpr * >( expr )
                                 || dynamic_cast< const AlignofExpr * >( expr )
+                        } else if (
+                                dynamic_cast< const SizeofExpr * >( expr )
+                                || dynamic_cast< const AlignofExpr * >( expr )
                         ) {
                                 index = 0;
                         } else {
                                 assertf( false,
+                                assertf( false,
                                         "bad designator given to ArrayIterator: %s", toString( expr ).c_str() );
+                        }
+                }
                 void setPosition(
                         std::deque< ptr< Expr > >::const_iterator begin,
+                void setPosition(
+                        std::deque< ptr< Expr > >::const_iterator begin,
                         std::deque< ptr< Expr > >::const_iterator end
                 ) override {
 …
+                }
                 AggregateIterator(
                         const CodeLocation & loc, const std::string k, const std::string & n, const Type * i,
+                AggregateIterator(
+                        const CodeLocation & loc, const std::string k, const std::string & n, const Type * i,
                         const MemberList & ms )
                 : location( loc ), kind( k ), name( n ), inst( i ), members( ms ), curMember( ms.begin() ),
+                : location( loc ), kind( k ), name( n ), inst( i ), members( ms ), curMember( ms.begin() ),
                   sub( genericSubstitution( i ) ) {
                         PRINT( std::cerr << "Creating " << kind << "(" << name << ")"; )
 …
         public:
                 void setPosition(
                         std::deque< ptr< Expr > >::const_iterator begin,
+                void setPosition(
+                        std::deque< ptr< Expr > >::const_iterator begin,
                         std::deque< ptr< Expr > >::const_iterator end
                 ) final {
 …
                                         return;
+                                }
                                 assertf( false,
+                                assertf( false,
                                         "could not find member in %s: %s", kind.c_str(), toString( varExpr ).c_str() );
                         } else {
                                 assertf( false,
+                                assertf( false,
                                         "bad designator given to %s: %s", kind.c_str(), toString( *begin ).c_str() );
+                        }
 …
                                                 new VariableExpr{ location, curMember->strict_as< ObjectDecl >() } );
                                         // need to substitute for generic types so that casts are to concrete types
+                                        alt.type = shallowCopy(alt.type.get());
                                         PRINT( std::cerr << "  type is: " << alt.type; )
                                         sub.apply( alt.type ); // also apply to designation??
 …
                                 for ( InitAlternative & alt : ret ) {
                                         PRINT( std::cerr << "iterating and adding designators" << std::endl; )
                                         alt.designation.get_and_mutate()->designators.emplace_front(
+                                        alt.designation.get_and_mutate()->designators.emplace_front(
                                                 new VariableExpr{ location, curMember->strict_as< ObjectDecl >() } );
+                                }
 …
         class TupleIterator final : public AggregateIterator {
         public:
                 TupleIterator( const CodeLocation & loc, const TupleType * inst )
                 : AggregateIterator(
                         loc, "TupleIterator", toString("Tuple", inst->size()), inst, inst->members
+                TupleIterator( const CodeLocation & loc, const TupleType * inst )
+                : AggregateIterator(
+                        loc, "TupleIterator", toString("Tuple", inst->size()), inst, inst->members
                 ) {}
 …
         MemberIterator * createMemberIterator( const CodeLocation & loc, const Type * type ) {
                 if ( auto aggr = dynamic_cast< const ReferenceToType * >( type ) ) {
+                if ( auto aggr = dynamic_cast< const BaseInstType * >( type ) ) {
                         if ( auto sit = dynamic_cast< const StructInstType * >( aggr ) ) {
                                 return new StructIterator{ loc, sit };
 …
                                 return new UnionIterator{ loc, uit };
                         } else {
                                 assertf(
                                         dynamic_cast< const EnumInstType * >( aggr )
                                                 || dynamic_cast< const TypeInstType * >( aggr ),
                                         "Encountered unhandled ReferenceToType in createMemberIterator: %s",
+                                assertf(
+                                        dynamic_cast< const EnumInstType * >( type )
+                                                || dynamic_cast< const TypeInstType * >( type ),
+                                        "Encountered unhandled BaseInstType in createMemberIterator: %s",
                                                 toString( type ).c_str() );
                                 return new SimpleIterator{ loc, type };
 …
                 using DesignatorChain = std::deque< ptr< Expr > >;
                 PRINT( std::cerr << "___findNext" << std::endl; )
                 // find all the d's
                 std::vector< DesignatorChain > desigAlts{ {} }, newDesigAlts;
 …
                                         DesignatorChain & d = *dit;
                                         PRINT( std::cerr << "____actual: " << t << std::endl; )
                                         if ( auto refType = dynamic_cast< const ReferenceToType * >( t ) ) {
+                                        if ( auto refType = dynamic_cast< const BaseInstType * >( t ) ) {
                                                 // concatenate identical field names
                                                 for ( const Decl * mem : refType->lookup( nexpr->name ) ) {
 …
                 // set new designators
                 assertf( ! objStack.empty(), "empty object stack when setting designation" );
                 Designation * actualDesignation =
+                Designation * actualDesignation =
                         new Designation{ designation->location, DesignatorChain{d} };
                 objStack.back()->setPosition( d ); // destroys d

src/ResolvExpr/FindOpenVars.cc

-              r3c64c668
+              r58fe85a
                                 // mark open/closed variables
                                 if ( nextIsOpen ) {
                                         for ( const ast::TypeDecl * decl : type->forall ) {
                                                 open[ decl->name ] = ast::TypeDecl::Data{ decl };
                                                 for ( const ast::DeclWithType * assert : decl->assertions ) {
                                                         need[ assert ].isUsed = false;
+                                                }
+                                        for ( auto & decl : type->forall ) {
+                                                open[ *decl ] = ast::TypeDecl::Data{ decl->base };
+                                        }
+                                        for ( auto & assert : type->assertions ) {
+                                                need[ assert ].isUsed = false;
+                                        }
                                 } else {
                                         for ( const ast::TypeDecl * decl : type->forall ) {
                                                 closed[ decl->name ] = ast::TypeDecl::Data{ decl };
                                                 for ( const ast::DeclWithType * assert : decl->assertions ) {
                                                         have[ assert ].isUsed = false;
+                                                }
+                                        for ( auto & decl : type->forall ) {
+                                                closed[ *decl ] = ast::TypeDecl::Data{ decl->base };
+                                        }
+                                        for ( auto & assert : type->assertions ) {
+                                                have[ assert ].isUsed = false;
+                                        }
+                                }

src/ResolvExpr/PolyCost.cc

-              r3c64c668
+              r58fe85a
 // TODO: When the old PolyCost is torn out get rid of the _new suffix.
+struct PolyCost_new {
+class PolyCost_new {
+        const ast::SymbolTable &symtab;
+public:
         int result;
-        const ast::SymbolTable &symtab;
         const ast::TypeEnvironment &env_;
         PolyCost_new( const ast::SymbolTable & symtab, const ast::TypeEnvironment & env ) :
                 result( 0 ), symtab( symtab ), env_( env ) {}
+        PolyCost_new( const ast::SymbolTable & symtab, const ast::TypeEnvironment & env )
+        : symtab( symtab ), result( 0 ), env_( env ) {}
         void previsit( const ast::TypeInstType * type ) {
                 if ( const ast::EqvClass * eqv = env_.lookup( type->name ) ) /* && */ if ( eqv->bound ) {
+                if ( const ast::EqvClass * eqv = env_.lookup( *type ) ) /* && */ if ( eqv->bound ) {
                         if ( const ast::TypeInstType * otherType = eqv->bound.as< ast::TypeInstType >() ) {
                                 if ( symtab.lookupType( otherType->name ) ) {
 …
         ast::Pass<PolyCost_new> costing( symtab, env );
         type->accept( costing );
         return costing.pass.result;
+        return costing.core.result;
+}

src/ResolvExpr/PtrsAssignable.cc

-              r3c64c668
+              r58fe85a
+        }
         void postvisit( const ast::TypeInstType * inst ) {
                 if ( const ast::EqvClass * eqv = typeEnv.lookup( inst->name ) ) {
+                if ( const ast::EqvClass * eqv = typeEnv.lookup( *inst ) ) {
                         if ( eqv->bound ) {
                                 // T * = S * for any S depends on the type bound to T
 …
                 const ast::TypeEnvironment & env ) {
         if ( const ast::TypeInstType * dstAsInst = dynamic_cast< const ast::TypeInstType * >( dst ) ) {
                 if ( const ast::EqvClass * eqv = env.lookup( dstAsInst->name ) ) {
+                if ( const ast::EqvClass * eqv = env.lookup( *dstAsInst ) ) {
                         return ptrsAssignable( src, eqv->bound, env );
+                }
 …
                 ast::Pass<PtrsAssignable_new> visitor( dst, env );
                 src->accept( visitor );
                 return visitor.pass.result;
+                return visitor.core.result;
+        }

src/ResolvExpr/PtrsCastable.cc

r3c64c668	r58fe85a
180	180	}
181	181	}
182		} else if ( const ast::EqvClass * eqvClass = env.lookup( ~~inst->name~~ ) ) {
	182	} else if ( const ast::EqvClass * eqvClass = env.lookup( *inst ) ) {
183	183	if ( eqvClass->data.kind == ast::TypeDecl::Ftype ) {
184	184	return -1;
…	…
283	283	) {
284	284	if ( auto inst = dynamic_cast< const ast::TypeInstType * >( dst ) ) {
285		if ( const ast::EqvClass * eqvClass = env.lookup( ~~inst->name~~ ) ) {
	285	if ( const ast::EqvClass * eqvClass = env.lookup( *inst ) ) {
286	286	return ptrsAssignable( src, eqvClass->bound, env );
287	287	}
…	…
293	293	ast::Pass< PtrsCastable_new > ptrs{ dst, env, symtab };
294	294	src->accept( ptrs );
295		return ptrs.~~pass~~.result;
	295	return ptrs.core.result;
296	296	}
297	297	}

src/ResolvExpr/RenameVars.cc

-              r3c64c668
+              r58fe85a
 #include "SynTree/Visitor.h"       // for acceptAll, maybeAccept
+#include "AST/Copy.hpp"
 namespace ResolvExpr {
 …
                 int level = 0;
                 int resetCount = 0;
+                int next_expr_id = 1;
+                int next_usage_id = 1;
                 ScopedMap< std::string, std::string > nameMap;
+                ScopedMap< std::string, ast::TypeInstType::TypeEnvKey > idMap;
         public:
                 void reset() {
 …
+                }
-                using mapConstIterator = ScopedMap< std::string, std::string >::const_iterator;
                 void rename( TypeInstType * type ) {
                         mapConstIterator it = nameMap.find( type->name );
+                        auto it = nameMap.find( type->name );
                         if ( it != nameMap.end() ) {
                                 type->name = it->second;
+                        }
+                }
+                void nextUsage() {
+                        ++next_usage_id;
+                }
 …
                                         // ditto for assertion names, the next level in
                                         level++;
+                                        // acceptAll( td->assertions, *this );
+                                } // for
+                        } // if
+                                }
+                        }
+                }
 …
                 const ast::TypeInstType * rename( const ast::TypeInstType * type ) {
+                        mapConstIterator it = nameMap.find( type->name );
+                        if ( it != nameMap.end() ) {
+                                ast::TypeInstType * mutType = ast::mutate( type );
+                                mutType->name = it->second;
+                    type = mutType;
+                        }
+                        // rename
+                        auto it = idMap.find( type->name );
+                        if ( it != idMap.end() ) {
+                                // unconditionally mutate because map will *always* have different name
+                                ast::TypeInstType * mut = ast::shallowCopy( type );
+                                // reconcile base node since some copies might have been made
+                                mut->base = it->second.base;
+                                mut->formal_usage = it->second.formal_usage;
+                                mut->expr_id = it->second.expr_id;
+                    type = mut;
+                        }
                         return type;
+                }
+                template<typename NodeT>
+                const NodeT * openLevel( const NodeT * type ) {
+                        if ( !type->forall.empty() ) {
+                                nameMap.beginScope();
+                                // Load new names from this forall clause and perform renaming.
+                                NodeT * mutType = ast::mutate( type );
+                                for ( ast::ptr< ast::TypeDecl > & td : mutType->forall ) {
+                                        std::ostringstream output;
+                                        output << "_" << resetCount << "_" << level << "_" << td->name;
+                                        std::string newname( output.str() );
+                                        nameMap[ td->name ] = newname;
+                                        ++level;
+                                        ast::TypeDecl * decl = ast::mutate( td.get() );
+                                        decl->name = newname;
+                                        td = decl;
+                                }
+                        }
+                        return type;
+                }
+                template<typename NodeT>
+                const NodeT * closeLevel( const NodeT * type ) {
+                        if ( !type->forall.empty() ) {
+                                nameMap.endScope();
+                        }
+                        return type;
+                const ast::FunctionType * openLevel( const ast::FunctionType * type, RenameMode mode ) { // Opens an idMap scope for a function type with a non-empty forall clause and returns a shallow copy whose forall entries are tagged according to `mode`; returns `type` itself when forall is empty.
+                        if ( type->forall.empty() ) return type;
+                        idMap.beginScope(); // matched by idMap.endScope() in closeLevel, which uses the same forall.empty() guard
+                        // Load new names from this forall clause and perform renaming.
+                        auto mutType = ast::shallowCopy( type );
+                        // assert( type == mutType && "mutated type must be unique from ForallSubstitutor" );
+                        for ( auto & td : mutType->forall ) {
+                                auto mut = ast::shallowCopy( td.get() );
+                                // assert( td == mutDecl && "mutated decl must be unique from ForallSubstitutor" );
+                                if (mode == GEN_EXPR_ID) { // assign the next expr_id and set formal_usage to -1
+                                        mut->expr_id = next_expr_id;
+                                        mut->formal_usage = -1;
+                                        ++next_expr_id;
+                                }
+                                else if (mode == GEN_USAGE) { // expr_id must already be non-zero; stamp the current usage id
+                                        assertf(mut->expr_id, "unfilled expression id in generating candidate type");
+                                        mut->formal_usage = next_usage_id;
+                                }
+                                else {
+                                        assert(false); // no other mode is handled here
+                                }
+                                idMap[ td->name ] = ast::TypeInstType::TypeEnvKey(*mut); // map the parameter's name to a key built from the tagged copy
+                                td = mut; // the copied type's forall entry now refers to the tagged copy
+                        }
+                        return mutType;
+                }
+                void closeLevel( const ast::FunctionType * type ) { // Ends the idMap scope for a function type; does nothing when the forall clause is empty, mirroring the guard in openLevel.
+                        if ( type->forall.empty() ) return;
+                        idMap.endScope();
+                }
         };
 …
         RenamingData renaming;
         struct RenameVars {
+        struct RenameVars_old {
                 void previsit( TypeInstType * instType ) {
                         renaming.openLevel( (Type*)instType );
 …
                         renaming.closeLevel( type );
+                }
+        };
+        struct RenameVars_new : public ast::PureVisitor /*: public ast::WithForallSubstitutor*/ {
+                RenameMode mode;
                 const ast::FunctionType * previsit( const ast::FunctionType * type ) {
+                        return renaming.openLevel( type );
+                }
+                        return renaming.openLevel( type, mode );
+                }
+                /*
                 const ast::StructInstType * previsit( const ast::StructInstType * type ) {
                         return renaming.openLevel( type );
 …
                         return renaming.openLevel( type );
+                }
+                */
                 const ast::TypeInstType * previsit( const ast::TypeInstType * type ) {
+                        return renaming.rename( renaming.openLevel( type ) );
+                }
+                const ast::ParameterizedType * postvisit( const ast::ParameterizedType * type ) {
+                        return renaming.closeLevel( type );
+                        if (mode == GEN_USAGE && !type->formal_usage) return type; // do not rename an actual type
+                        return renaming.rename( type );
+                }
+                void postvisit( const ast::FunctionType * type ) {
+                        renaming.closeLevel( type );
+                }
         };
 …
 void renameTyVars( Type * t ) {
         PassVisitor<RenameVars> renamer;
+        PassVisitor<RenameVars_old> renamer;
         t->accept( renamer );
+}
+const ast::Type * renameTyVars( const ast::Type * t ) {
+        ast::Pass<RenameVars> renamer;
+const ast::Type * renameTyVars( const ast::Type * t, RenameMode mode, bool reset ) {
+        // ast::Type *tc = ast::deepCopy(t);
+        ast::Pass<RenameVars_new> renamer;
+        renamer.core.mode = mode;
+        if (mode == GEN_USAGE && reset) {
+                renaming.nextUsage();
+        }
         return t->accept( renamer );
+}
 …
 void resetTyVarRenaming() {
         renaming.reset();
+        renaming.nextUsage();
+}

src/ResolvExpr/RenameVars.h

-              r3c64c668
+              r58fe85a
         /// Provides a consistent renaming of forall type names in a hierarchy by prefixing them with a unique "level" ID
         void renameTyVars( Type * );
+        const ast::Type * renameTyVars( const ast::Type * );
+        enum RenameMode {
+                GEN_USAGE, // for type in VariableExpr
+                GEN_EXPR_ID // for type in decl
+        };
+        const ast::Type * renameTyVars( const ast::Type *, RenameMode mode = GEN_USAGE, bool reset = true );
         /// resets internal state of renamer to avoid overflow
         void resetTyVarRenaming();
 } // namespace ResolvExpr

src/ResolvExpr/ResolvMode.h

-              r3c64c668
+              r58fe85a
                 const bool prune;            ///< Prune alternatives to min-cost per return type? [true]
                 const bool failFast;         ///< Fail on no resulting alternatives? [true]
-                const bool satisfyAssns;     ///< Satisfy assertions? [false]
+        private:
+                constexpr ResolvMode(bool a, bool p, bool ff, bool sa)
+                : adjust(a), prune(p), failFast(ff), satisfyAssns(sa) {}
+                constexpr ResolvMode(bool a, bool p, bool ff)
+                : adjust(a), prune(p), failFast(ff) {}
-        public:
                 /// Default settings
                 constexpr ResolvMode() : adjust(false), prune(true), failFast(true), satisfyAssns(false) {}
+                constexpr ResolvMode() : adjust(false), prune(true), failFast(true) {}
                 /// With adjust flag set; turns array and function types into equivalent pointers
                 static constexpr ResolvMode withAdjustment() { return { true, true, true, false }; }
+                static constexpr ResolvMode withAdjustment() { return { true, true, true }; }
                 /// With adjust flag set but prune unset; pruning ensures there is at least one alternative
                 /// per result type
                 static constexpr ResolvMode withoutPrune() { return { true, false, true, false }; }
+                static constexpr ResolvMode withoutPrune() { return { true, false, true }; }
                 /// With adjust and prune flags set but failFast unset; failFast ensures there is at least
                 /// one resulting alternative
                 static constexpr ResolvMode withoutFailFast() { return { true, true, false, false }; }
+                static constexpr ResolvMode withoutFailFast() { return { true, true, false }; }
                 /// The same mode, but with satisfyAssns turned on; for top-level calls
                 ResolvMode atTopLevel() const { return { adjust, prune, failFast, true }; }
+                ResolvMode atTopLevel() const { return { adjust, true, failFast }; }
         };
 } // namespace ResolvExpr

src/ResolvExpr/ResolveAssertions.cc

-              r3c64c668
+              r58fe85a
                         const DeclarationWithType * candidate = cdata.id;
+                        // build independent unification context for candidate
+                        // ignore deleted candidates.
+                        // NOTE: this behavior is different from main resolver.
+                        // further investigations might be needed to determine
+                        // if we should implement the same rule here
+                        // (i.e. error if unique best match is deleted)
+                        if (candidate->isDeleted) continue;
+                        // build independent unification context. for candidate
                         AssertionSet have, newNeed;
                         TypeEnvironment newEnv{ resn.alt.env };
 …
         /// Limit to depth of recursion of assertion satisfaction
         static const int recursionLimit = 4;
+        static const int recursionLimit = 7;
         /// Maximum number of simultaneously-deferred assertions to attempt concurrent satisfaction of
         static const int deferLimit = 10;

src/ResolvExpr/ResolveTypeof.cc

-              r3c64c668
+              r58fe85a
 #include "ResolveTypeof.h"
+#include "RenameVars.h"
 #include <cassert>               // for assert
 …
 #include "SynTree/Mutator.h"     // for Mutator
 #include "SynTree/Type.h"        // for TypeofType, Type
+#include "SymTab/Mangler.h"
+#include "InitTweak/InitTweak.h" // for isConstExpr
 namespace SymTab {
 …
                         // replace basetypeof(<enum>) by int
                         if ( dynamic_cast<EnumInstType*>(newType) ) {
                                 Type* newerType =
                                         new BasicType{ newType->get_qualifiers(), BasicType::SignedInt,
+                                Type* newerType =
+                                        new BasicType{ newType->get_qualifiers(), BasicType::SignedInt,
                                         newType->attributes };
                                 delete newType;
                                 newType = newerType;
+                        }
                         newType->get_qualifiers().val
+                        newType->get_qualifiers().val
                                 = ( newType->get_qualifiers().val & ~Type::Qualifiers::Mask ) | oldQuals;
                 } else {
                         newType->get_qualifiers().val |= oldQuals;
+                }
                 return newType;
+        }
 …
                 ResolveTypeof_new( const ast::SymbolTable & syms ) : localSymtab( syms ) {}
                 void premutate( const ast::TypeofType * ) { visit_children = false; }
                 const ast::Type * postmutate( const ast::TypeofType * typeofType ) {
+                void previsit( const ast::TypeofType * ) { visit_children = false; }
+                const ast::Type * postvisit( const ast::TypeofType * typeofType ) {
                         // pass on null expression
                         if ( ! typeofType->expr ) return typeofType;
 …
                                 // typeof wrapping expression
                                 ast::TypeEnvironment dummy;
                                 ast::ptr< ast::Expr > newExpr =
+                                ast::ptr< ast::Expr > newExpr =
                                         resolveInVoidContext( typeofType->expr, localSymtab, dummy );
                                 assert( newExpr->result && ! newExpr->result->isVoid() );
 …
                                 // replace basetypeof(<enum>) by int
                                 if ( newType.as< ast::EnumInstType >() ) {
                                         newType = new ast::BasicType{
+                                        newType = new ast::BasicType{
                                                 ast::BasicType::SignedInt, newType->qualifiers, copy(newType->attributes) };
+                                }
                                 reset_qualifiers(
                                         newType,
+                                reset_qualifiers(
+                                        newType,
                                         ( newType->qualifiers & ~ast::CV::EquivQualifiers ) | typeofType->qualifiers );
                         } else {
 …
+                        }
                         return newType;
+                        return newType.release();
+                }
         };
 …
         ast::Pass< ResolveTypeof_new > mutator{ symtab };
         return type->accept( mutator );
+}
+struct FixArrayDimension { // Pass core (used through ast::Pass in fixArrayType) that resolves array dimension expressions and sets each array type's isVarLen flag.
+        // should not require a mutable symbol table - prevent pass template instantiation
+        const ast::SymbolTable & _symtab;
+        FixArrayDimension(const ast::SymbolTable & symtab): _symtab(symtab) {}
+        const ast::ArrayType * previsit (const ast::ArrayType * arrayType) {
+                if (!arrayType->dimension) return arrayType; // no dimension expression: return the type unchanged
+                auto mutType = mutate(arrayType);
+                ast::ptr<ast::Type> sizetype = ast::sizeType ? ast::sizeType : new ast::BasicType(ast::BasicType::LongUnsignedInt); // use ast::sizeType when it is set, otherwise long unsigned int
+                mutType->dimension = findSingleExpression(arrayType->dimension, sizetype, _symtab); // resolve the dimension expression against the size type
+                if (InitTweak::isConstExpr(mutType->dimension)) { // dimension accepted by isConstExpr: mark as fixed length
+                        mutType->isVarLen = ast::LengthFlag::FixedLen;
+                }
+                else { // otherwise mark as variable length
+                        mutType->isVarLen = ast::LengthFlag::VariableLen;
+                }
+                return mutType;
+        }
+};
+const ast::Type * fixArrayType( const ast::Type * type, const ast::SymbolTable & symtab) { // Runs a FixArrayDimension pass over `type` using `symtab` and returns the result of the traversal.
+        ast::Pass<FixArrayDimension> visitor {symtab};
+        return type->accept(visitor);
+}
+const ast::ObjectDecl * fixObjectType( const ast::ObjectDecl * decl , const ast::SymbolTable & symtab ) { // Resolves typeof and array dimensions in decl's type, sets the mangled name, renames type variables and marks the type as fixed; returns `decl` unchanged when isTypeFixed is already set.
+        if (!decl->isTypeFixed) { // only process a declaration once
+                auto mutDecl = mutate(decl);
+                auto resolvedType = resolveTypeof(decl->type, symtab);
+                resolvedType = fixArrayType(resolvedType, symtab); // array dimensions are fixed after typeof has been resolved
+                mutDecl->type = resolvedType;
+                // check variable length if object is an array.
+                // xxx - should this be part of fixObjectType?
+                /*
+                if (auto arrayType = dynamic_cast<const ast::ArrayType *>(resolvedType)) {
+                        auto dimExpr = findSingleExpression(arrayType->dimension, ast::sizeType, symtab);
+                        if (auto varexpr = arrayType->dimension.as<ast::VariableExpr>()) {// hoisted previously
+                                if (InitTweak::isConstExpr(varexpr->var.strict_as<ast::ObjectDecl>()->init)) {
+                                        auto mutType = mutate(arrayType);
+                                        mutType->isVarLen = ast::LengthFlag::VariableLen;
+                                        mutDecl->type = mutType;
+                                }
+                        }
+                }
+                */
+                if (!mutDecl->name.empty())
+                        mutDecl->mangleName = Mangle::mangle(mutDecl); // do not mangle unnamed variables
+                mutDecl->type = renameTyVars(mutDecl->type, RenameMode::GEN_EXPR_ID); // renaming happens after mangling; GEN_EXPR_ID is the mode for a type in a declaration
+                mutDecl->isTypeFixed = true;
+                return mutDecl;
+        }
+        return decl;
+}

src/ResolvExpr/ResolveTypeof.h

-              r3c64c668
+              r58fe85a
         class Type;
         class SymbolTable;
+        class ObjectDecl;
+}
 …
         Type *resolveTypeof( Type*, const SymTab::Indexer &indexer );
         const ast::Type * resolveTypeof( const ast::Type *, const ast::SymbolTable & );
+        const ast::ObjectDecl * fixObjectType( const ast::ObjectDecl * decl , const ast::SymbolTable & symtab );
 } // namespace ResolvExpr

src/ResolvExpr/Resolver.cc

-              r3c64c668
+              r58fe85a
 // Author           : Aaron B. Moss
 // Created On       : Sun May 17 12:17:01 2015
 // Last Modified By : Aaron B. Moss
 // Last Modified On : Wed May 29 11:00:00 2019
 // Update Count     : 241
+// Last Modified By : Andrew Beach
+// Last Modified On : Fri Mar 27 11:58:00 2020
+// Update Count     : 242
 //
 …
 #include "RenameVars.h"                  // for RenameVars, global_renamer
 #include "Resolver.h"
+#include "ResolveTypeof.h"
 #include "ResolvMode.h"                  // for ResolvMode
 #include "typeops.h"                     // for extractResultType
 #include "Unify.h"                       // for unify
+#include "CompilationState.h"
 #include "AST/Chain.hpp"
 #include "AST/Decl.hpp"
 …
 #include "Common/PassVisitor.h"          // for PassVisitor
 #include "Common/SemanticError.h"        // for SemanticError
+#include "Common/Stats/ResolveTime.h"    // for ResolveTime::start(), ResolveTime::stop()
 #include "Common/utility.h"              // for ValueGuard, group_iterate
 #include "InitTweak/GenInit.h"
 …
 #include "SymTab/Autogen.h"              // for SizeType
 #include "SymTab/Indexer.h"              // for Indexer
+#include "SymTab/Mangler.h"              // for Mangler
 #include "SynTree/Declaration.h"         // for ObjectDecl, TypeDecl, Declar...
 #include "SynTree/Expression.h"          // for Expression, CastExpr, InitExpr
 …
                 // TODO: Replace *exception type with &exception type.
                 if ( throwStmt->get_expr() ) {
                         const StructDecl * exception_decl = indexer.lookupStruct( "__cfaabi_ehm__base_exception_t" );
+                        const StructDecl * exception_decl = indexer.lookupStruct( "__cfaehm_base_exception_t" );
                         assert( exception_decl );
                         Type * exceptType = new PointerType( noQualifiers, new StructInstType( noQualifiers, const_cast<StructDecl *>(exception_decl) ) );
 …
         namespace {
                 /// Finds deleted expressions in an expression tree
                 struct DeleteFinder_new final : public ast::WithShortCircuiting {
                         const ast::DeletedExpr * delExpr = nullptr;
+                struct DeleteFinder_new final : public ast::WithShortCircuiting, public ast::WithVisitorRef<DeleteFinder_new> {
+                        const ast::DeletedExpr * result = nullptr;
                         void previsit( const ast::DeletedExpr * expr ) {
+                                if ( delExpr ) { visit_children = false; }
+                                else { delExpr = expr; }
+                        }
+                        void previsit( const ast::Expr * ) {
+                                if ( delExpr ) { visit_children = false; }
+                                if ( result ) { visit_children = false; }
+                                else { result = expr; }
+                        }
+                        void previsit( const ast::Expr * expr ) {
+                                if ( result ) { visit_children = false; }
+                                if (expr->inferred.hasParams()) {
+                                        for (auto & imp : expr->inferred.inferParams() ) {
+                                                imp.second.expr->accept(*visitor);
+                                        }
+                                }
+                        }
                 };
         } // anonymous namespace
         /// Check if this expression is or includes a deleted expression
         const ast::DeletedExpr * findDeletedExpr( const ast::Expr * expr ) {
+                ast::Pass<DeleteFinder_new> finder;
+                expr->accept( finder );
+                return finder.pass.delExpr;
+                return ast::Pass<DeleteFinder_new>::read( expr );
+        }
 …
                 /// Strips extraneous casts out of an expression
                 struct StripCasts_new final {
                         const ast::Expr * postmutate( const ast::CastExpr * castExpr ) {
+                        const ast::Expr * postvisit( const ast::CastExpr * castExpr ) {
                                 if (
                                         castExpr->isGenerated
+                                        castExpr->isGenerated == ast::GeneratedCast
                                         && typesCompatible( castExpr->arg->result, castExpr->result )
                                 ) {
 …
+                }
+                /// Establish post-resolver invariants for expressions
+        } // anonymous namespace
+/// Establish post-resolver invariants for expressions
                 void finishExpr(
                         ast::ptr< ast::Expr > & expr, const ast::TypeEnvironment & env,
 …
                         StripCasts_new::strip( expr );
+                }
-        } // anonymous namespace
         ast::ptr< ast::Expr > resolveInVoidContext(
 …
                 // set up and resolve expression cast to void
                 ast::CastExpr * untyped = new ast::CastExpr{ expr };
+                ast::ptr< ast::CastExpr > untyped = new ast::CastExpr{ expr };
                 CandidateRef choice = findUnfinishedKindExpression(
                         untyped, symtab, "", anyCandidate, ResolvMode::withAdjustment() );
 …
+        }
+        namespace {
+                /// Resolve `untyped` to the expression whose candidate is the best match for a `void`
+        /// Resolve `untyped` to the expression whose candidate is the best match for a `void`
                 /// context.
                 ast::ptr< ast::Expr > findVoidExpression(
                         const ast::Expr * untyped, const ast::SymbolTable & symtab
                 ) {
-                        resetTyVarRenaming();
                         ast::TypeEnvironment env;
                         ast::ptr< ast::Expr > newExpr = resolveInVoidContext( untyped, symtab, env );
 …
                         return newExpr;
+                }
+        namespace {
                 /// resolve `untyped` to the expression whose candidate satisfies `pred` with the
 …
                         CandidateRef choice =
                                 findUnfinishedKindExpression( untyped, symtab, kind, pred, mode );
                         finishExpr( choice->expr, choice->env, untyped->env );
+                        ResolvExpr::finishExpr( choice->expr, choice->env, untyped->env );
                         return std::move( choice->expr );
+                }
 …
                         const ast::Expr * untyped, const ast::SymbolTable & symtab
                 ) {
+                        return findKindExpression( untyped, symtab );
+                        Stats::ResolveTime::start( untyped );
+                        auto res = findKindExpression( untyped, symtab );
+                        Stats::ResolveTime::stop();
+                        return res;
+                }
         } // anonymous namespace
                 ast::ptr< ast::Expr > findSingleExpression(
                         const ast::Expr * untyped, const ast::Type * type, const ast::SymbolTable & symtab
                 ) {
                         assert( untyped && type );
                         ast::ptr< ast::Expr > castExpr = new ast::CastExpr{ untyped, type };
                         ast::ptr< ast::Expr > newExpr = findSingleExpression( castExpr, symtab );
                         removeExtraneousCast( newExpr, symtab );
                         return newExpr;
+                }
+        ast::ptr< ast::Expr > findSingleExpression(
+                const ast::Expr * untyped, const ast::Type * type, const ast::SymbolTable & symtab
+        ) {
+                assert( untyped && type );
+                ast::ptr< ast::Expr > castExpr = new ast::CastExpr{ untyped, type };
+                ast::ptr< ast::Expr > newExpr = findSingleExpression( castExpr, symtab );
+                removeExtraneousCast( newExpr, symtab );
+                return newExpr;
+        }
         namespace {
+                bool structOrUnion( const Candidate & i ) {
+                        const ast::Type * t = i.expr->result->stripReferences();
+                        return dynamic_cast< const ast::StructInstType * >( t ) || dynamic_cast< const ast::UnionInstType * >( t );
+                }
                 /// Predicate for "Candidate has integral type"
                 bool hasIntegralType( const Candidate & i ) {
 …
                 template<typename Iter>
                 inline bool nextMutex( Iter & it, const Iter & end ) {
                         while ( it != end && ! (*it)->get_type()->is_mutex() ) { ++it; }
+                        while ( it != end && ! (*it)->is_mutex() ) { ++it; }
                         return it != end;
+                }
 …
                 ast::ptr< ast::Type > functionReturn = nullptr;
                 ast::CurrentObject currentObject;
+                // for work previously in GenInit
+                static InitTweak::ManagedTypes_new managedTypes;
                 bool inEnumDecl = false;
         public:
+                static size_t traceId;
                 Resolver_new() = default;
                 Resolver_new( const ast::SymbolTable & syms ) { symtab = syms; }
                 void previsit( const ast::FunctionDecl * );
+                const ast::FunctionDecl * previsit( const ast::FunctionDecl * );
                 const ast::FunctionDecl * postvisit( const ast::FunctionDecl * );
+                void previsit( const ast::ObjectDecl * );
+                const ast::ObjectDecl * previsit( const ast::ObjectDecl * );
+                void previsit( const ast::AggregateDecl * );
+                void previsit( const ast::StructDecl * );
                 void previsit( const ast::EnumDecl * );
                 const ast::StaticAssertDecl * previsit( const ast::StaticAssertDecl * );
 …
                 const ast::ThrowStmt *       previsit( const ast::ThrowStmt * );
                 const ast::CatchStmt *       previsit( const ast::CatchStmt * );
+                const ast::CatchStmt *       postvisit( const ast::CatchStmt * );
                 const ast::WaitForStmt *     previsit( const ast::WaitForStmt * );
+                const ast::WithStmt *        previsit( const ast::WithStmt * );
                 const ast::SingleInit *      previsit( const ast::SingleInit * );
                 const ast::ListInit *        previsit( const ast::ListInit * );
                 const ast::ConstructorInit * previsit( const ast::ConstructorInit * );
+                void resolveWithExprs(std::vector<ast::ptr<ast::Expr>> & exprs, std::list<ast::ptr<ast::Stmt>> & stmtsToAdd);
+                void beginScope() { managedTypes.beginScope(); }
+                void endScope() { managedTypes.endScope(); }
+                bool on_error(ast::ptr<ast::Decl> & decl);
         };
+        void resolve( std::list< ast::ptr<ast::Decl> >& translationUnit ) {
+                ast::Pass< Resolver_new > resolver;
+                accept_all( translationUnit, resolver );
+        // size_t Resolver_new::traceId = Stats::Heap::new_stacktrace_id("Resolver");
+        InitTweak::ManagedTypes_new Resolver_new::managedTypes;
+        void resolve( ast::TranslationUnit& translationUnit ) {
+                ast::Pass< Resolver_new >::run( translationUnit );
+        }
 …
+        }
         ast::ptr< ast::Expr > resolveStmtExpr(
+        const ast::Expr * resolveStmtExpr(
                 const ast::StmtExpr * stmtExpr, const ast::SymbolTable & symtab
         ) {
                 assert( stmtExpr );
                 ast::Pass< Resolver_new > resolver{ symtab };
+                ast::ptr< ast::Expr > ret = stmtExpr;
+                ret = ret->accept( resolver );
+                strict_dynamic_cast< ast::StmtExpr * >( ret.get_and_mutate() )->computeResult();
+                auto ret = mutate(stmtExpr->accept(resolver));
+                strict_dynamic_cast< ast::StmtExpr * >( ret )->computeResult();
                 return ret;
+        }
+        void Resolver_new::previsit( const ast::FunctionDecl * functionDecl ) {
+        namespace {
+                const ast::Attribute * handleAttribute(const CodeLocation & loc, const ast::Attribute * attr, const ast::SymbolTable & symtab) { // Resolves the single priority argument of a constructor/destructor attribute and emits GccAttributes warnings for bad priorities or argument counts; any other attribute is returned unchanged.
+                        std::string name = attr->normalizedName();
+                        if (name == "constructor" || name == "destructor") {
+                                if (attr->params.size() == 1) {
+                                        auto arg = attr->params.front();
+                                        auto resolved = ResolvExpr::findSingleExpression( arg, new ast::BasicType( ast::BasicType::LongLongSignedInt ), symtab ); // resolve the argument against long long signed int
+                                        auto result = eval(arg); // NOTE(review): evaluates the original `arg`, not `resolved` - confirm this is intended
+                                        auto mutAttr = mutate(attr);
+                                        mutAttr->params.front() = resolved; // the returned attribute carries the resolved argument
+                                        if (! result.second) { // presumably result.second is false when eval could not produce a value - confirm against eval
+                                                SemanticWarning(loc, Warning::GccAttributes,
+                                                        toCString( name, " priorities must be integers from 0 to 65535 inclusive: ", arg ) );
+                                        }
+                                        else {
+                                                auto priority = result.first; // NOTE(review): only the reserved low ranges are checked below; no check against 65535 is visible here
+                                                if (priority < 101) {
+                                                        SemanticWarning(loc, Warning::GccAttributes,
+                                                                toCString( name, " priorities from 0 to 100 are reserved for the implementation" ) );
+                                                } else if (priority < 201 && ! buildingLibrary()) { // this range is not warned about when buildingLibrary() is true
+                                                        SemanticWarning(loc, Warning::GccAttributes,
+                                                                toCString( name, " priorities from 101 to 200 are reserved for the implementation" ) );
+                                                }
+                                        }
+                                        return mutAttr;
+                                } else if (attr->params.size() > 1) {
+                                        SemanticWarning(loc, Warning::GccAttributes, toCString( "too many arguments to ", name, " attribute" ) );
+                                } else {
+                                        SemanticWarning(loc, Warning::GccAttributes, toCString( "too few arguments to ", name, " attribute" ) );
+                                }
+                        }
+                        return attr; // unchanged: not a constructor/destructor attribute, or wrong argument count
+                }
+        }
+        const ast::FunctionDecl * Resolver_new::previsit( const ast::FunctionDecl * functionDecl ) { // On the first visit of a function declaration, fixes up its attributes, type parameters, assertions, parameters, returns, with-expressions and mangled name; then sets functionReturn from the declaration's type and returns the (possibly mutated) declaration.
                 GuardValue( functionReturn );
+                assert (functionDecl->unique());
+                if (!functionDecl->has_body() && !functionDecl->withExprs.empty()) { // with-expressions on a function without a body are reported as an error
+                        SemanticError(functionDecl->location, functionDecl, "Function without body has with declarations");
+                }
+                if (!functionDecl->isTypeFixed) { // fix-up runs only while isTypeFixed is unset; it is set at the end of this branch
+                        auto mutDecl = mutate(functionDecl);
+                        auto mutType = mutDecl->type.get_and_mutate();
+                        for (auto & attr: mutDecl->attributes) {
+                                attr = handleAttribute(mutDecl->location, attr, symtab);
+                        }
+                        // handle assertions
+                        symtab.enterScope();
+                        mutType->forall.clear(); // forall and assertions on the type are rebuilt below from the declaration's type_params and assertions
+                        mutType->assertions.clear();
+                        for (auto & typeParam : mutDecl->type_params) {
+                                symtab.addType(typeParam);
+                                mutType->forall.emplace_back(new ast::TypeInstType(typeParam->name, typeParam));
+                        }
+                        for (auto & asst : mutDecl->assertions) {
+                                asst = fixObjectType(asst.strict_as<ast::ObjectDecl>(), symtab);
+                                symtab.addId(asst);
+                                mutType->assertions.emplace_back(new ast::VariableExpr(functionDecl->location, asst));
+                        }
+                        // temporarily adds params to symbol table.
+                        // actual scoping rules for params and withexprs differ - see Pass::visit(FunctionDecl)
+                        std::vector<ast::ptr<ast::Type>> paramTypes;
+                        std::vector<ast::ptr<ast::Type>> returnTypes;
+                        for (auto & param : mutDecl->params) {
+                                param = fixObjectType(param.strict_as<ast::ObjectDecl>(), symtab);
+                                symtab.addId(param);
+                                paramTypes.emplace_back(param->get_type());
+                        }
+                        for (auto & ret : mutDecl->returns) { // returns are fixed but, unlike params, not added to the symbol table here
+                                ret = fixObjectType(ret.strict_as<ast::ObjectDecl>(), symtab);
+                                returnTypes.emplace_back(ret->get_type());
+                        }
+                        // since function type in decl is just a view of param types, need to update that as well
+                        mutType->params = std::move(paramTypes);
+                        mutType->returns = std::move(returnTypes);
+                        auto renamedType = strict_dynamic_cast<const ast::FunctionType *>(renameTyVars(mutType, RenameMode::GEN_EXPR_ID)); // GEN_EXPR_ID is the mode for a type in a declaration
+                        std::list<ast::ptr<ast::Stmt>> newStmts;
+                        resolveWithExprs (mutDecl->withExprs, newStmts);
+                        if (mutDecl->stmts) {
+                                auto mutStmt = mutDecl->stmts.get_and_mutate();
+                                mutStmt->kids.splice(mutStmt->kids.begin(), std::move(newStmts)); // statements collected by resolveWithExprs go at the front of the body
+                                mutDecl->stmts = mutStmt;
+                        }
+                        symtab.leaveScope(); // closes the scope entered before the type parameters were added
+                        mutDecl->type = renamedType;
+                        mutDecl->mangleName = Mangle::mangle(mutDecl);
+                        mutDecl->isTypeFixed = true;
+                        functionDecl = mutDecl; // the code below and the caller see the fixed-up declaration
+                }
+                managedTypes.handleDWT(functionDecl);
                 functionReturn = extractResultType( functionDecl->type );
+                return functionDecl;
+        }
 …
                 // default value expressions have an environment which shouldn't be there and trips up
                 // later passes.
                 ast::ptr< ast::FunctionDecl > ret = functionDecl;
                 for ( unsigned i = 0; i < functionDecl->type->params.size(); ++i ) {
+                        const ast::ptr<ast::DeclWithType> & d = functionDecl->type->params[i];
                         if ( const ast::ObjectDecl * obj = d.as< ast::ObjectDecl >() ) {
+                assert( functionDecl->unique() );
+                ast::FunctionType * mutType = mutate( functionDecl->type.get() );
+                for ( unsigned i = 0 ; i < mutType->params.size() ; ++i ) {
+                        if ( const ast::ObjectDecl * obj = mutType->params[i].as< ast::ObjectDecl >() ) {
                                 if ( const ast::SingleInit * init = obj->init.as< ast::SingleInit >() ) {
                                         if ( init->value->env == nullptr ) continue;
                                         // clone initializer minus the initializer environment
+                                        ast::chain_mutate( ret )
+                                                ( &ast::FunctionDecl::type )
+                                                        ( &ast::FunctionType::params )[i]
+                                                                ( &ast::ObjectDecl::init )
+                                                                        ( &ast::SingleInit::value )->env = nullptr;
+                                        assert( functionDecl != ret.get() || functionDecl->unique() );
+                                        assert( ! ret->type->params[i].strict_as< ast::ObjectDecl >()->init.strict_as< ast::SingleInit >()->value->env );
+                                        auto mutParam = mutate( mutType->params[i].strict_as< ast::ObjectDecl >() );
+                                        auto mutInit = mutate( mutParam->init.strict_as< ast::SingleInit >() );
+                                        auto mutValue = mutate( mutInit->value.get() );
+                                        mutValue->env = nullptr;
+                                        mutInit->value = mutValue;
+                                        mutParam->init = mutInit;
+                                        mutType->params[i] = mutParam;
+                                        assert( ! mutType->params[i].strict_as< ast::ObjectDecl >()->init.strict_as< ast::SingleInit >()->value->env);
+                                }
+                        }
+                }
+                return ret.get();
+        }
+        void Resolver_new::previsit( const ast::ObjectDecl * objectDecl ) {
+                mutate_field(functionDecl, &ast::FunctionDecl::type, mutType);
+                return functionDecl;
+        }
+        const ast::ObjectDecl * Resolver_new::previsit( const ast::ObjectDecl * objectDecl ) {
                 // To handle initialization of routine pointers [e.g. int (*fp)(int) = foo()],
                 // class-variable `initContext` is changed multiple times because the LHS is analyzed
 …
                 // selecting the RHS.
                 GuardValue( currentObject );
+                currentObject = ast::CurrentObject{ objectDecl->location, objectDecl->get_type() };
                 if ( inEnumDecl && dynamic_cast< const ast::EnumInstType * >( objectDecl->get_type() ) ) {
                         // enumerator initializers should not use the enum type to initialize, since the
                         // enum type is still incomplete at this point. Use `int` instead.
+                        objectDecl = fixObjectType(objectDecl, symtab);
                         currentObject = ast::CurrentObject{
                                 objectDecl->location, new ast::BasicType{ ast::BasicType::SignedInt } };
+                }
+                else {
+                        if (!objectDecl->isTypeFixed) {
+                                auto newDecl = fixObjectType(objectDecl, symtab);
+                                auto mutDecl = mutate(newDecl);
+                                // generate CtorInit wrapper when necessary.
+                                // in certain cases, fixObjectType is called before reaching
+                                // this object in visitor pass, thus disabling CtorInit codegen.
+                                // this happens on aggregate members and function parameters.
+                                if ( InitTweak::tryConstruct( mutDecl ) && ( managedTypes.isManaged( mutDecl ) || ((! isInFunction() || mutDecl->storage.is_static ) && ! InitTweak::isConstExpr( mutDecl->init ) ) ) ) {
+                                        // constructed objects cannot be designated
+                                        if ( InitTweak::isDesignated( mutDecl->init ) ) SemanticError( mutDecl, "Cannot include designations in the initializer for a managed Object. If this is really what you want, then initialize with @=.\n" );
+                                        // constructed objects should not have initializers nested too deeply
+                                        if ( ! InitTweak::checkInitDepth( mutDecl ) ) SemanticError( mutDecl, "Managed object's initializer is too deep " );
+                                        mutDecl->init = InitTweak::genCtorInit( mutDecl->location, mutDecl );
+                                }
+                                objectDecl = mutDecl;
+                        }
+                        currentObject = ast::CurrentObject{ objectDecl->location, objectDecl->get_type() };
+                }
+                return objectDecl;
+        }
+        void Resolver_new::previsit( const ast::AggregateDecl * _aggDecl ) { // runs fixObjectType on every ObjectDecl member of the aggregate, updating the member list in place
+                auto aggDecl = mutate(_aggDecl);
+                assertf(aggDecl == _aggDecl, "type declarations must be unique"); // this visit returns void, so the member updates below are only kept if mutate() handed back the node itself
+                for (auto & member: aggDecl->members) {
+                        // only object members are handled here; nested type decls are hoisted already, so they need no work
+                        if (auto obj = member.as<ast::ObjectDecl>()) {
+                                member = fixObjectType(obj, symtab); // store the fixed declaration back into the member slot
+                        }
+                }
+        }
+        void Resolver_new::previsit( const ast::StructDecl * structDecl ) { // struct entry point: shared member fix-up first, then the struct is handed to managedTypes
+                previsit(static_cast<const ast::AggregateDecl *>(structDecl)); // the cast selects the AggregateDecl overload instead of recursing into this one
+                managedTypes.handleStruct(structDecl);
+        }
 …
                 // in case we decide to allow nested enums
                 GuardValue( inEnumDecl );
+                inEnumDecl = false;
+        }
+                inEnumDecl = true;
+                // don't need to fix types for enum fields
+        }
         const ast::StaticAssertDecl * Resolver_new::previsit(
 …
         const PtrType * handlePtrType( const PtrType * type, const ast::SymbolTable & symtab ) {
                 if ( type->dimension ) {
+                        #warning should use new equivalent to Validate::SizeType rather than sizeType here
+                        ast::ptr< ast::Type > sizeType = new ast::BasicType{ ast::BasicType::LongUnsignedInt };
+                        ast::ptr< ast::Type > sizeType = ast::sizeType;
                         ast::mutate_field(
                                 type, &PtrType::dimension,
 …
                 if ( throwStmt->expr ) {
                         const ast::StructDecl * exceptionDecl =
                                 symtab.lookupStruct( "__cfaabi_ehm__base_exception_t" );
+                                symtab.lookupStruct( "__cfaehm_base_exception_t" );
                         assert( exceptionDecl );
                         ast::ptr< ast::Type > exceptType =
 …
         const ast::CatchStmt * Resolver_new::previsit( const ast::CatchStmt * catchStmt ) {
+                // TODO: This will need a fix for the decl/cond scoping problem.
+                // Until we are very sure this invariant (ifs that move between passes have thenPart)
+                // holds, check it. This allows a check for when to decode the mangling.
+                if ( auto ifStmt = catchStmt->body.as<ast::IfStmt>() ) {
+                        assert( ifStmt->thenPart );
+                }
+                // Encode the catchStmt so the condition can see the declaration.
                 if ( catchStmt->cond ) {
+                        ast::ptr< ast::Type > boolType = new ast::BasicType{ ast::BasicType::Bool };
+                        catchStmt = ast::mutate_field(
+                                catchStmt, &ast::CatchStmt::cond,
+                                findSingleExpression( catchStmt->cond, boolType, symtab ) );
+                        ast::CatchStmt * stmt = mutate( catchStmt );
+                        stmt->body = new ast::IfStmt( stmt->location, stmt->cond, nullptr, stmt->body ); // cond becomes the if-condition and the old body the else-part; the null thenPart is the marker the assert above guards
+                        stmt->cond = nullptr; // the condition now lives only inside the wrapping IfStmt
+                        return stmt;
+                }
+                return catchStmt; // no condition: nothing to encode
+        }
+        const ast::CatchStmt * Resolver_new::postvisit( const ast::CatchStmt * catchStmt ) {
+                // Decode the catchStmt so everything is stored properly.
+                const ast::IfStmt * ifStmt = catchStmt->body.as<ast::IfStmt>();
+                if ( nullptr != ifStmt && nullptr == ifStmt->thenPart ) {
+                        assert( ifStmt->cond );
+                        assert( ifStmt->elsePart );
+                        ast::CatchStmt * stmt = ast::mutate( catchStmt );
+                        stmt->cond = ifStmt->cond;
+                        stmt->body = ifStmt->elsePart;
+                        // ifStmt should be implicitly deleted here.
+                        return stmt;
+                }
                 return catchStmt;
 …
                                                                 // Check if the argument matches the parameter type in the current
                                                                 // scope
                                                                 ast::ptr< ast::Type > paramType = (*param)->get_type();
+                                                                // ast::ptr< ast::Type > paramType = (*param)->get_type();
                                                                 if (
                                                                         ! unify(
                                                                                 arg->expr->result, paramType, resultEnv, need, have, open,
+                                                                                arg->expr->result, *param, resultEnv, need, have, open,
                                                                                 symtab )
                                                                 ) {
 …
                                                                         ss << "candidate function not viable: no known conversion "
                                                                                 "from '";
                                                                         ast::print( ss, (*param)->get_type() );
+                                                                        ast::print( ss, *param );
                                                                         ss << "' to '";
                                                                         ast::print( ss, arg->expr->result );
 …
+        }
+        const ast::WithStmt * Resolver_new::previsit( const ast::WithStmt * withStmt ) { // resolves the with-statement's expressions and returns the mutated statement
+                auto mutStmt = mutate(withStmt);
+                resolveWithExprs(mutStmt->exprs, stmtsToAddBefore); // statements created while resolving are appended to stmtsToAddBefore
+                return mutStmt;
+        }
+        void Resolver_new::resolveWithExprs(std::vector<ast::ptr<ast::Expr>> & exprs, std::list<ast::ptr<ast::Stmt>> & stmtsToAdd) { // resolves each entry of exprs in place; declarations of any temporaries are appended to stmtsToAdd
+                for (auto & expr : exprs) {
+                        // only struct- and union-typed expressions are viable candidates
+                        expr = findKindExpression( expr, symtab, structOrUnion, "with expression" );
+                        // if with expression might be impure, create a temporary so that it is evaluated once
+                        if ( Tuples::maybeImpure( expr ) ) {
+                                static UniqueName tmpNamer( "_with_tmp_" ); // function-local static, so one namer is shared by every call
+                                const CodeLocation loc = expr->location;
+                                auto tmp = new ast::ObjectDecl(loc, tmpNamer.newName(), expr->result, new ast::SingleInit(loc, expr ) ); // temporary has the expression's result type and is initialized from the expression
+                                expr = new ast::VariableExpr( loc, tmp ); // from here on the with-expression refers to the temporary
+                                stmtsToAdd.push_back( new ast::DeclStmt(loc, tmp ) );
+                                if ( InitTweak::isConstructable( tmp->type ) ) {
+                                        // generate ctor/dtor and resolve them
+                                        tmp->init = InitTweak::genCtorInit( loc, tmp );
+                                }
+                                // since tmp is freshly created, this should modify tmp in-place
+                                tmp->accept( *visitor );
+                        }
+                }
+        }
 …
+        }
+        // suppress error on autogen functions and mark invalid autogen as deleted: returns false for an AutoGen-linkage FunctionDecl (after replacing decl with a deleted, body-less copy) and true for every other declaration.
+        bool Resolver_new::on_error(ast::ptr<ast::Decl> & decl) {
+                if (auto functionDecl = decl.as<ast::FunctionDecl>()) {
+                        // xxx - can intrinsic gen ever fail?
+                        if (functionDecl->linkage == ast::Linkage::AutoGen) {
+                                auto mutDecl = mutate(functionDecl);
+                                mutDecl->isDeleted = true;
+                                mutDecl->stmts = nullptr; // drop the body of the function that failed
+                                decl = mutDecl; // write the updated node back through the caller's reference
+                                return false; // NOTE(review): presumably tells the caller the error was handled here - confirm against the code that calls on_error
+                        }
+                }
+                return true;
+        }
 } // namespace ResolvExpr

src/ResolvExpr/Resolver.h

-              r3c64c668
+              r58fe85a
         class StmtExpr;
         class SymbolTable;
+        struct TranslationUnit;
         class Type;
         class TypeEnvironment;
 …
         /// Checks types and binds syntactic constructs to typed representations
         void resolve( std::list< ast::ptr<ast::Decl> >& translationUnit );
+        void resolve( ast::TranslationUnit& translationUnit );
         /// Searches expr and returns the first DeletedExpr found, otherwise nullptr
         const ast::DeletedExpr * findDeletedExpr( const ast::Expr * expr );
 …
         ast::ptr< ast::Expr > resolveInVoidContext(
                 const ast::Expr * expr, const ast::SymbolTable & symtab, ast::TypeEnvironment & env );
         /// Resolve `untyped` to the single expression whose candidate is the best match for the
+        /// Resolve `untyped` to the single expression whose candidate is the best match for the
         /// given type.
         ast::ptr< ast::Expr > findSingleExpression(
                 const ast::Expr * untyped, const ast::Type * type, const ast::SymbolTable & symtab );
+        ast::ptr< ast::Expr > findVoidExpression(
+                const ast::Expr * untyped, const ast::SymbolTable & symtab);
         /// Resolves a constructor init expression
         ast::ptr< ast::Init > resolveCtorInit(
+        ast::ptr< ast::Init > resolveCtorInit(
                 const ast::ConstructorInit * ctorInit, const ast::SymbolTable & symtab );
         /// Resolves a statement expression
         ast::ptr< ast::Expr > resolveStmtExpr(
+        /// Resolves a statement expression
+        const ast::Expr * resolveStmtExpr(
                 const ast::StmtExpr * stmtExpr, const ast::SymbolTable & symtab );
 } // namespace ResolvExpr

src/ResolvExpr/SatisfyAssertions.cpp

-              r3c64c668
+              r58fe85a
 // Author           : Aaron B. Moss
 // Created On       : Mon Jun 10 17:45:00 2019
 // Last Modified By : Aaron B. Moss
 // Last Modified On : Mon Jun 10 17:45:00 2019
 // Update Count     : 1
+// Last Modified By : Andrew Beach
+// Last Modified On : Tue Oct  1 13:56:00 2019
+// Update Count     : 2
 //
 …
                 ast::UniqueId resnSlot;          ///< Slot for any recursive assertion IDs
                 AssnCandidate(
                         const ast::SymbolTable::IdData c, const ast::Type * at, ast::TypeEnvironment && e,
+                AssnCandidate(
+                        const ast::SymbolTable::IdData c, const ast::Type * at, ast::TypeEnvironment && e,
                         ast::AssertionSet && h, ast::AssertionSet && n, ast::OpenVarSet && o, ast::UniqueId rs )
                 : cdata( c ), adjType( at ), env( std::move( e ) ), have( std::move( h ) ),
+                : cdata( c ), adjType( at ), env( std::move( e ) ), have( std::move( h ) ),
                   need( std::move( n ) ), open( std::move( o ) ), resnSlot( rs ) {}
         };
 …
         /// Reference to a single deferred item
         struct DeferRef {
                 const ast::DeclWithType * decl;
+                const ast::VariableExpr * expr;
                 const ast::AssertionSetValue & info;
                 const AssnCandidate & match;
         };
         /// Wrapper for the deferred items from a single assertion satisfaction.
+        /// Wrapper for the deferred items from a single assertion satisfaction.
         /// Acts like an indexed list of DeferRef
         struct DeferItem {
                 const ast::DeclWithType * decl;
+                const ast::VariableExpr * expr;
                 const ast::AssertionSetValue & info;
                 AssnCandidateList matches;
                 DeferItem(
                         const ast::DeclWithType * d, const ast::AssertionSetValue & i, AssnCandidateList && ms )
                 : decl( d ), info( i ), matches( std::move( ms ) ) {}
+                DeferItem(
+                        const ast::VariableExpr * d, const ast::AssertionSetValue & i, AssnCandidateList && ms )
+                : expr( d ), info( i ), matches( std::move( ms ) ) {}
                 bool empty() const { return matches.empty(); }
 …
                 AssnCandidateList::size_type size() const { return matches.size(); }
                 DeferRef operator[] ( unsigned i ) const { return { decl, info, matches[i] }; }
+                DeferRef operator[] ( unsigned i ) const { return { expr, info, matches[i] }; }
         };
 …
                 /// Initial satisfaction state for a candidate
                 SatState( CandidateRef & c, const ast::SymbolTable & syms )
                 : cand( c ), need(), newNeed(), deferred(), inferred(), costs{ Cost::zero },
+                : cand( c ), need(), newNeed(), deferred(), inferred(), costs{ Cost::zero },
                   symtab( syms ) { need.swap( c->need ); }
                 /// Update satisfaction state for next step after previous state
                 SatState( SatState && o, IterateFlag )
                 : cand( std::move( o.cand ) ), need( o.newNeed.begin(), o.newNeed.end() ), newNeed(),
                   deferred(), inferred( std::move( o.inferred ) ), costs( std::move( o.costs ) ),
+                : cand( std::move( o.cand ) ), need( o.newNeed.begin(), o.newNeed.end() ), newNeed(),
+                  deferred(), inferred( std::move( o.inferred ) ), costs( std::move( o.costs ) ),
                   symtab( o.symtab ) { costs.emplace_back( Cost::zero ); }
                 /// Field-wise next step constructor
                 SatState(
                         CandidateRef && c, ast::AssertionSet && nn, InferCache && i, CostVec && cs,
+                        CandidateRef && c, ast::AssertionSet && nn, InferCache && i, CostVec && cs,
                         ast::SymbolTable && syms )
                 : cand( std::move( c ) ), need( nn.begin(), nn.end() ), newNeed(), deferred(),
+                : cand( std::move( c ) ), need( nn.begin(), nn.end() ), newNeed(), deferred(),
                   inferred( std::move( i ) ), costs( std::move( cs ) ), symtab( std::move( syms ) )
                   { costs.emplace_back( Cost::zero ); }
 …
         void addToSymbolTable( const ast::AssertionSet & have, ast::SymbolTable & symtab ) {
                 for ( auto & i : have ) {
                         if ( i.second.isUsed ) { symtab.addId( i.first ); }
+                        if ( i.second.isUsed ) { symtab.addId( i.first->var ); }
+                }
+        }
         /// Binds a single assertion, updating satisfaction state
         void bindAssertion(
                 const ast::DeclWithType * decl, const ast::AssertionSetValue & info, CandidateRef & cand,
+        void bindAssertion(
+                const ast::VariableExpr * expr, const ast::AssertionSetValue & info, CandidateRef & cand,
                 AssnCandidate & match, InferCache & inferred
         ) {
                 const ast::DeclWithType * candidate = match.cdata.id;
                 assertf( candidate->uniqueId,
+                assertf( candidate->uniqueId,
                         "Assertion candidate does not have a unique ID: %s", toString( candidate ).c_str() );
                 ast::Expr * varExpr = match.cdata.combine( cand->expr->location, cand->cvtCost );
                 varExpr->result = match.adjType;
 …
                 // place newly-inferred assertion in proper location in cache
                 inferred[ info.resnSlot ][ decl->uniqueId ] = ast::ParamEntry{
                         candidate->uniqueId, candidate, match.adjType, decl->get_type(), varExpr };
+                inferred[ info.resnSlot ][ expr->var->uniqueId ] = ast::ParamEntry{
+                        candidate->uniqueId, candidate, match.adjType, expr->result, varExpr };
+        }
 …
                 // find candidates that unify with the desired type
                 AssnCandidateList matches;
+                for ( const ast::SymbolTable::IdData & cdata : sat.symtab.lookupId( assn.first->name ) ) {
+                std::vector<ast::SymbolTable::IdData> candidates;
+                auto kind = ast::SymbolTable::getSpecialFunctionKind(assn.first->var->name);
+                if (kind != ast::SymbolTable::SpecialFunctionKind::NUMBER_OF_KINDS) {
+                        // prefilter special decls by argument type, if already known
+                        ast::ptr<ast::Type> thisArgType = assn.first->result.strict_as<ast::PointerType>()->base
+                                .strict_as<ast::FunctionType>()->params[0]
+                                .strict_as<ast::ReferenceType>()->base;
+                        sat.cand->env.apply(thisArgType);
+                        std::string otypeKey = "";
+                        if (thisArgType.as<ast::PointerType>()) otypeKey = Mangle::Encoding::pointer;
+                        else if (!isUnboundType(thisArgType)) otypeKey = Mangle::mangle(thisArgType, Mangle::Type | Mangle::NoGenericParams);
+                        candidates = sat.symtab.specialLookupId(kind, otypeKey);
+                }
+                else {
+                        candidates = sat.symtab.lookupId(assn.first->var->name);
+                }
+                for ( const ast::SymbolTable::IdData & cdata : candidates ) {
                         const ast::DeclWithType * candidate = cdata.id;
+                        // ignore deleted candidates.
+                        // NOTE: this behavior is different from main resolver.
+                        // further investigations might be needed to determine
+                        // if we should implement the same rule here
+                        // (i.e. error if unique best match is deleted)
+                        if (candidate->isDeleted && candidate->linkage == ast::Linkage::AutoGen) continue;
                         // build independent unification context for candidate
 …
                         ast::TypeEnvironment newEnv{ sat.cand->env };
                         ast::OpenVarSet newOpen{ sat.cand->open };
                         ast::ptr< ast::Type > toType = assn.first->get_type();
                         ast::ptr< ast::Type > adjType =
                                 renameTyVars( adjustExprType( candidate->get_type(), newEnv, sat.symtab ) );
+                        ast::ptr< ast::Type > toType = assn.first->result;
+                        ast::ptr< ast::Type > adjType =
+                                renameTyVars( adjustExprType( candidate->get_type(), newEnv, sat.symtab ), GEN_USAGE, false );
                         // only keep candidates which unify
 …
+                                }
                                 matches.emplace_back(
                                         cdata, adjType, std::move( newEnv ), std::move( newNeed ), std::move( have ),
+                                matches.emplace_back(
+                                        cdata, adjType, std::move( newEnv ), std::move( have ), std::move( newNeed ),
                                         std::move( newOpen ), crntResnSlot );
+                        }
 …
                 InferMatcher( InferCache & inferred ) : inferred( inferred ) {}
                 const ast::Expr * postmutate( const ast::Expr * expr ) {
+                const ast::Expr * postvisit( const ast::Expr * expr ) {
                         // Skip if no slots to find
+                        if ( expr->inferred.mode != ast::Expr::InferUnion::Slots ) return expr;
+                        if ( !expr->inferred.hasSlots() ) return expr;
+                        // if ( expr->inferred.mode != ast::Expr::InferUnion::Slots ) return expr;
+                        std::vector<UniqueId> missingSlots;
                         // find inferred parameters for resolution slots
                         ast::InferredParams newInferred;
+                        ast::InferredParams * newInferred = new ast::InferredParams();
                         for ( UniqueId slot : expr->inferred.resnSlots() ) {
                                 // fail if no matching assertions found
                                 auto it = inferred.find( slot );
                                 if ( it == inferred.end() ) {
+                                        assert(!"missing assertion");
+                                        // std::cerr << "missing assertion " << slot << std::endl;
+                                        missingSlots.push_back(slot);
+                                        continue;
+                                }
 …
                                 for ( auto & entry : it->second ) {
                                         // recurse on inferParams of resolved expressions
                                         entry.second.expr = postmutate( entry.second.expr );
                                         auto res = newInferred.emplace( entry );
+                                        entry.second.expr = postvisit( entry.second.expr );
+                                        auto res = newInferred->emplace( entry );
                                         assert( res.second && "all assertions newly placed" );
+                                }
 …
                         ast::Expr * ret = mutate( expr );
+                        ret->inferred.set_inferParams( std::move( newInferred ) );
+                        ret->inferred.set_inferParams( newInferred );
+                        if (!missingSlots.empty()) ret->inferred.resnSlots() = missingSlots;
                         return ret;
+                }
         };
         /// Replace ResnSlots with InferParams and add alternative to output list, if it meets pruning
+        /// Replace ResnSlots with InferParams and add alternative to output list, if it meets pruning
         /// threshold.
         void finalizeAssertions(
                 CandidateRef & cand, InferCache & inferred, PruneMap & thresholds, CostVec && costs,
                 CandidateList & out
+        void finalizeAssertions(
+                CandidateRef & cand, InferCache & inferred, PruneMap & thresholds, CostVec && costs,
+                CandidateList & out
         ) {
                 // prune if cheaper alternative for same key has already been generated
 …
+        }
         /// Combo iterator that combines candidates into an output list, merging their environments.
         /// Rejects an appended candidate if environments cannot be merged. See `Common/FilterCombos.h`
+        /// Combo iterator that combines candidates into an output list, merging their environments.
+        /// Rejects an appended candidate if environments cannot be merged. See `Common/FilterCombos.h`
         /// for description of "combo iterator".
         class CandidateEnvMerger {
 …
                         Cost cost;
                         OutType(
                                 const ast::TypeEnvironment & e, const ast::OpenVarSet & o,
+                        OutType(
+                                const ast::TypeEnvironment & e, const ast::OpenVarSet & o,
                                 const std::vector< DeferRef > & as, const ast::SymbolTable & symtab )
                         : env( e ), open( o ), assns( as ), cost( Cost::zero ) {
 …
                                 for ( const DeferRef & assn : assns ) {
                                         // compute conversion cost from satisfying decl to assertion
                                         cost += computeConversionCost(
                                                 assn.match.adjType, assn.decl->get_type(), symtab, env );
+                                        cost += computeConversionCost(
+                                                assn.match.adjType, assn.expr->result, false, symtab, env );
                                         // mark vars+specialization on function-type assertions
                                         const ast::FunctionType * func =
+                                        const ast::FunctionType * func =
                                                 GenPoly::getFunctionType( assn.match.cdata.id->get_type() );
                                         if ( ! func ) continue;
                                         for ( const ast::DeclWithType * param : func->params ) {
                                                 cost.decSpec( specCost( param->get_type() ) );
+                                        for ( const auto & param : func->params ) {
+                                                cost.decSpec( specCost( param ) );
+                                        }
                                         cost.incVar( func->forall.size() );
+                                        for ( const ast::TypeDecl * td : func->forall ) {
+                                                cost.decSpec( td->assertions.size() );
+                                        }
+                                        cost.decSpec( func->assertions.size() );
+                                }
+                        }
 …
                 };
                 CandidateEnvMerger(
                         const ast::TypeEnvironment & env, const ast::OpenVarSet & open,
+                CandidateEnvMerger(
+                        const ast::TypeEnvironment & env, const ast::OpenVarSet & open,
                         const ast::SymbolTable & syms )
                 : crnt(), envs{ env }, opens{ open }, symtab( syms ) {}
 …
         /// Limit to depth of recursion of assertion satisfaction
         static const int recursionLimit = 4;
+        static const int recursionLimit = 8;
         /// Maximum number of simultaneously-deferred assertions to attempt concurrent satisfaction of
         static const int deferLimit = 10;
 } // anonymous namespace
 void satisfyAssertions(
         CandidateRef & cand, const ast::SymbolTable & symtab, CandidateList & out,
+void satisfyAssertions(
+        CandidateRef & cand, const ast::SymbolTable & symtab, CandidateList & out,
         std::vector<std::string> & errors
 ) {
 …
                         if ( it != thresholds.end() && it->second < sat.costs ) goto nextSat;
+                        // make initial pass at matching assertions
+                        for ( auto & assn : sat.need ) {
+                                // fail early if any assertion is not satisfiable
+                                if ( ! satisfyAssertion( assn, sat ) ) {
+                        // should a limit be imposed? worst case here is O(n^2) but very unlikely to happen.
+                        for (unsigned resetCount = 0; ; ++resetCount) {
+                                ast::AssertionList next;
+                                resetTyVarRenaming();
+                                // make initial pass at matching assertions
+                                for ( auto & assn : sat.need ) {
+                                        // fail early if any assertion is not satisfiable
+                                        if ( ! satisfyAssertion( assn, sat ) ) {
+                                                next.emplace_back(assn);
+                                                // goto nextSat;
+                                        }
+                                }
+                                // success
+                                if (next.empty()) break;
+                                // fail if nothing resolves
+                                else if (next.size() == sat.need.size()) {
                                         Indenter tabs{ 3 };
                                         std::ostringstream ss;
 …
                                         print( ss, *sat.cand, ++tabs );
                                         ss << (tabs-1) << "Could not satisfy assertion:\n";
                                         ast::print( ss, assn.first, tabs );
+                                        ast::print( ss, next[0].first, tabs );
                                         errors.emplace_back( ss.str() );
                                         goto nextSat;
+                                }
+                                sat.need = std::move(next);
+                        }
 …
                                 // either add successful match or push back next state
                                 if ( sat.newNeed.empty() ) {
                                         finalizeAssertions(
+                                        finalizeAssertions(
                                                 sat.cand, sat.inferred, thresholds, std::move( sat.costs ), out );
                                 } else {
 …
                                 ss << (tabs-1) << "Too many non-unique satisfying assignments for assertions:\n";
                                 for ( const auto & d : sat.deferred ) {
                                         ast::print( ss, d.decl, tabs );
+                                        ast::print( ss, d.expr, tabs );
+                                }
 …
                                 std::vector< CandidateEnvMerger::OutType > compatible = filterCombos(
                                         sat.deferred, CandidateEnvMerger{ sat.cand->env, sat.cand->open, sat.symtab } );
                                 // fail early if no mutually-compatible assertion satisfaction
                                 if ( compatible.empty() ) {
 …
                                         ss << (tabs-1) << "No mutually-compatible satisfaction for assertions:\n";
                                         for ( const auto& d : sat.deferred ) {
                                                 ast::print( ss, d.decl, tabs );
+                                                ast::print( ss, d.expr, tabs );
+                                        }
 …
                                         // set up next satisfaction state
                                         CandidateRef nextCand = std::make_shared<Candidate>(
                                                 sat.cand->expr, std::move( compat.env ), std::move( compat.open ),
+                                                sat.cand->expr, std::move( compat.env ), std::move( compat.open ),
                                                 ast::AssertionSet{} /* need moved into satisfaction state */,
                                                 sat.cand->cost, sat.cand->cvtCost );
 …
                                         ast::AssertionSet nextNewNeed{ sat.newNeed };
                                         InferCache nextInferred{ sat.inferred };
                                         CostVec nextCosts{ sat.costs };
                                         nextCosts.back() += compat.cost;
                                         ast::SymbolTable nextSymtab{ sat.symtab };
 …
                                                 nextNewNeed.insert( match.need.begin(), match.need.end() );
                                                 bindAssertion( r.decl, r.info, nextCand, match, nextInferred );
+                                                bindAssertion( r.expr, r.info, nextCand, match, nextInferred );
+                                        }
                                         // either add successful match or push back next state
                                         if ( nextNewNeed.empty() ) {
                                                 finalizeAssertions(
+                                                finalizeAssertions(
                                                         nextCand, nextInferred, thresholds, std::move( nextCosts ), out );
                                         } else {
                                                 nextSats.emplace_back(
                                                         std::move( nextCand ), std::move( nextNewNeed ),
                                                         std::move( nextInferred ), std::move( nextCosts ),
+                                                nextSats.emplace_back(
+                                                        std::move( nextCand ), std::move( nextNewNeed ),
+                                                        std::move( nextInferred ), std::move( nextCosts ),
                                                         std::move( nextSymtab ) );
+                                        }
 …
                 nextSats.clear();
+        }
         // exceeded recursion limit if reaches here
         if ( out.empty() ) {

src/ResolvExpr/SatisfyAssertions.hpp

-              r3c64c668
+              r58fe85a
 namespace ResolvExpr {
+/// Recursively satisfies all assertions provided in a candidate; returns true if succeeds
+void satisfyAssertions(
+        CandidateRef & cand, const ast::SymbolTable & symtab, CandidateList & out,
+/// Recursively satisfies all assertions provided in a candidate
+/// returns true if it has been run (candidate has any assertions)
+void satisfyAssertions(
+        CandidateRef & cand, const ast::SymbolTable & symtab, CandidateList & out,
         std::vector<std::string> & errors );

src/ResolvExpr/SpecCost.cc

-              r3c64c668
+              r58fe85a
 // Created On       : Tue Oct 02 15:50:00 2018
 // Last Modified By : Andrew Beach
+// Last Modified On : Wed Jun 19 10:43:00 2019
+// Update Count     : 2
+//
+// Last Modified On : Wed Jul  3 11:07:00 2019
+// Update Count     : 3
+//
+#include <cassert>
 #include <limits>
 #include <list>
 …
                         typename std::add_pointer<ast::Type const *(typename T::value_type const &)>::type;
+                #warning Should use a standard maybe_accept
+                void maybe_accept( ast::Type const * type ) {
+                        if ( type ) {
+                                auto node = type->accept( *visitor );
+                                assert( node == nullptr || node == type );
+                        }
+                }
                 // Update the minimum to the new lowest non-none value.
                 template<typename T>
 …
                         for ( const auto & node : list ) {
                                 count = -1;
                                 mapper( node )->accept( *visitor );
+                                maybe_accept( mapper( node ) );
                                 if ( count != -1 && count < minimum ) minimum = count;
+                        }
 …
                 void previsit( const ast::FunctionType * fty ) {
                         int minCount = std::numeric_limits<int>::max();
                         updateMinimumPresent( minCount, fty->params, decl_type );
                         updateMinimumPresent( minCount, fty->returns, decl_type );
+                        updateMinimumPresent( minCount, fty->params, type_deref );
+                        updateMinimumPresent( minCount, fty->returns, type_deref );
                         // Add another level to minCount if set.
                         count = toNoneOrInc( minCount );
 …
+        }
         ast::Pass<SpecCounter> counter;
         type->accept( *counter.pass.visitor );
         return counter.pass.get_count();
+        type->accept( counter );
+        return counter.core.get_count();
+}

src/ResolvExpr/TypeEnvironment.cc

-              r3c64c668
+              r58fe85a
 #include <utility>                     // for pair, move
+#include "CompilationState.h"          // for deterministic_output
 #include "Common/utility.h"            // for maybeClone
 #include "SynTree/Type.h"              // for Type, FunctionType, Type::Fora...
 …
         void EqvClass::print( std::ostream &os, Indenter indent ) const {
+                os << "( ";
+                std::copy( vars.begin(), vars.end(), std::ostream_iterator< std::string >( os, " " ) );
+                os << "(";
+                bool first = true;
+                for(const auto & var : vars) {
+                        if(first) first = false;
+                        else os << " ";
+                        if( deterministic_output && isUnboundType(var) ) os << "[unbound]";
+                        else os << var;
+                }
                 os << ")";
                 if ( type ) {
 …
                 // check safely bindable
                 if ( r.type && occursIn( r.type, s.vars.begin(), s.vars.end(), *this ) ) return false;
                 // merge classes in
                 r.vars.insert( s.vars.begin(), s.vars.end() );

src/ResolvExpr/TypeEnvironment.h

r3c64c668	r58fe85a
149	149	iterator end() const { return env.end(); }
150	150
	151	auto size() const { return env.size(); }
	152
151	153	private:
152	154	ClassList env;

src/ResolvExpr/Unify.cc

-              r3c64c668
+              r58fe85a
 #include <vector>
+#include "AST/Copy.hpp"
 #include "AST/Decl.hpp"
 #include "AST/Node.hpp"
 #include "AST/Pass.hpp"
+#include "AST/Print.hpp"
 #include "AST/Type.hpp"
 #include "AST/TypeEnvironment.hpp"
 …
                 findOpenVars( newSecond, open, closed, need, have, FirstOpen );
+                return unifyExact(
+                        newFirst, newSecond, newEnv, need, have, open, noWiden(), symtab );
+                return unifyExact(newFirst, newSecond, newEnv, need, have, open, noWiden(), symtab );
+        }
 …
                 newFirst->get_qualifiers() = Type::Qualifiers();
                 newSecond->get_qualifiers() = Type::Qualifiers();
+///   std::cerr << "first is ";
+///   first->print( std::cerr );
+///   std::cerr << std::endl << "second is ";
+///   second->print( std::cerr );
+///   std::cerr << std::endl << "newFirst is ";
+///   newFirst->print( std::cerr );
+///   std::cerr << std::endl << "newSecond is ";
+///   newSecond->print( std::cerr );
+///   std::cerr << std::endl;
                 bool result = unifyExact( newFirst, newSecond, newEnv, needAssertions, haveAssertions, openVars, WidenMode( false, false ), indexer );
                 delete newFirst;
 …
                 ast::AssertionSet need, have;
+                ast::ptr<ast::Type> newFirst{ first }, newSecond{ second };
+                env.apply( newFirst );
+                env.apply( newSecond );
+                reset_qualifiers( newFirst );
+                reset_qualifiers( newSecond );
+                ast::Type * newFirst  = shallowCopy( first  );
+                ast::Type * newSecond = shallowCopy( second );
+                newFirst ->qualifiers = {};
+                newSecond->qualifiers = {};
+                ast::ptr< ast::Type > t1_(newFirst );
+                ast::ptr< ast::Type > t2_(newSecond);
+                ast::ptr< ast::Type > subFirst = env.apply(newFirst).node;
+                ast::ptr< ast::Type > subSecond = env.apply(newSecond).node;
                 return unifyExact(
+                        newFirst, newSecond, newEnv, need, have, open, noWiden(), symtab );
+                        subFirst,
+                        subSecond,
+                        newEnv, need, have, open, noWiden(), symtab );
+        }
 …
         void markAssertionSet( AssertionSet &assertions, DeclarationWithType *assert ) {
-///   std::cerr << "assertion set is" << std::endl;
-///   printAssertionSet( assertions, std::cerr, 8 );
-///   std::cerr << "looking for ";
-///   assert->print( std::cerr );
-///   std::cerr << std::endl;
                 AssertionSet::iterator i = assertions.find( assert );
                 if ( i != assertions.end() ) {
-///     std::cerr << "found it!" << std::endl;
                         i->second.isUsed = true;
                 } // if
 …
         template< typename Iterator1, typename Iterator2 >
         bool unifyDeclList( Iterator1 list1Begin, Iterator1 list1End, Iterator2 list2Begin, Iterator2 list2End, TypeEnvironment &env, AssertionSet &needAssertions, AssertionSet &haveAssertions, const OpenVarSet &openVars, const SymTab::Indexer &indexer ) {
+        bool unifyTypeList( Iterator1 list1Begin, Iterator1 list1End, Iterator2 list2Begin, Iterator2 list2End, TypeEnvironment &env, AssertionSet &needAssertions, AssertionSet &haveAssertions, const OpenVarSet &openVars, const SymTab::Indexer &indexer ) {
                 auto get_type = [](DeclarationWithType * dwt){ return dwt->get_type(); };
                 for ( ; list1Begin != list1End && list2Begin != list2End; ++list1Begin, ++list2Begin ) {
 …
                                         || flatOther->isTtype()
                         ) {
                                 if ( unifyDeclList( flatFunc->parameters.begin(), flatFunc->parameters.end(), flatOther->parameters.begin(), flatOther->parameters.end(), env, needAssertions, haveAssertions, openVars, indexer ) ) {
                                         if ( unifyDeclList( flatFunc->returnVals.begin(), flatFunc->returnVals.end(), flatOther->returnVals.begin(), flatOther->returnVals.end(), env, needAssertions, haveAssertions, openVars, indexer ) ) {
+                                if ( unifyTypeList( flatFunc->parameters.begin(), flatFunc->parameters.end(), flatOther->parameters.begin(), flatOther->parameters.end(), env, needAssertions, haveAssertions, openVars, indexer ) ) {
+                                        if ( unifyTypeList( flatFunc->returnVals.begin(), flatFunc->returnVals.end(), flatOther->returnVals.begin(), flatOther->returnVals.end(), env, needAssertions, haveAssertions, openVars, indexer ) ) {
                                                 // the original types must be used in mark assertions, since pointer comparisons are used
 …
                 const ast::SymbolTable & symtab;
         public:
+                static size_t traceId;
                 bool result;
 …
                 /// If this isn't done when satisfying ttype assertions, then argument lists can have
                 /// different size and structure when they should be compatible.
                 struct TtypeExpander_new : public ast::WithShortCircuiting {
+                struct TtypeExpander_new : public ast::WithShortCircuiting, public ast::PureVisitor {
                         ast::TypeEnvironment & tenv;
 …
                         const ast::Type * postvisit( const ast::TypeInstType * typeInst ) {
                                 if ( const ast::EqvClass * clz = tenv.lookup( typeInst->name ) ) {
+                                if ( const ast::EqvClass * clz = tenv.lookup( *typeInst ) ) {
                                         // expand ttype parameter into its actual type
                                         if ( clz->data.kind == ast::TypeDecl::Ttype && clz->bound ) {
 …
                 /// returns flattened version of `src`
                 static std::vector< ast::ptr< ast::DeclWithType > > flattenList(
                         const std::vector< ast::ptr< ast::DeclWithType > > & src, ast::TypeEnvironment & env
+                static std::vector< ast::ptr< ast::Type > > flattenList(
+                        const std::vector< ast::ptr< ast::Type > > & src, ast::TypeEnvironment & env
                 ) {
                         std::vector< ast::ptr< ast::DeclWithType > > dst;
+                        std::vector< ast::ptr< ast::Type > > dst;
                         dst.reserve( src.size() );
                         for ( const ast::DeclWithType * d : src ) {
+                        for ( const auto & d : src ) {
                                 ast::Pass<TtypeExpander_new> expander{ env };
+                                d = d->accept( expander );
+                                auto types = flatten( d->get_type() );
+                                // TtypeExpander pass is impure (may mutate nodes in place)
+                                // need to make nodes shared to prevent accidental mutation
+                                ast::ptr<ast::Type> dc = d->accept(expander);
+                                auto types = flatten( dc );
                                 for ( ast::ptr< ast::Type > & t : types ) {
                                         // outermost const, volatile, _Atomic qualifiers in parameters should not play
 …
                                         // requirements than a non-mutex function
                                         remove_qualifiers( t, ast::CV::Const | ast::CV::Volatile | ast::CV::Atomic );
                                         dst.emplace_back( new ast::ObjectDecl{ d->location, "", t } );
+                                        dst.emplace_back( t );
+                                }
+                        }
 …
                 /// Creates a tuple type based on a list of DeclWithType
                 template< typename Iter >
                 static ast::ptr< ast::Type > tupleFromDecls( Iter crnt, Iter end ) {
+                static const ast::Type * tupleFromTypes( Iter crnt, Iter end ) {
                         std::vector< ast::ptr< ast::Type > > types;
                         while ( crnt != end ) {
                                 // it is guaranteed that a ttype variable will be bound to a flat tuple, so ensure
                                 // that this results in a flat tuple
                                 flatten( (*crnt)->get_type(), types );
+                                flatten( *crnt, types );
                                 ++crnt;
+                        }
                         return { new ast::TupleType{ std::move(types) } };
+                        return new ast::TupleType{ std::move(types) };
+                }
                 template< typename Iter >
                 static bool unifyDeclList(
+                static bool unifyTypeList(
                         Iter crnt1, Iter end1, Iter crnt2, Iter end2, ast::TypeEnvironment & env,
                         ast::AssertionSet & need, ast::AssertionSet & have, const ast::OpenVarSet & open,
 …
                 ) {
                         while ( crnt1 != end1 && crnt2 != end2 ) {
                                 const ast::Type * t1 = (*crnt1)->get_type();
                                 const ast::Type * t2 = (*crnt2)->get_type();
+                                const ast::Type * t1 = *crnt1;
+                                const ast::Type * t2 = *crnt2;
                                 bool isTuple1 = Tuples::isTtype( t1 );
                                 bool isTuple2 = Tuples::isTtype( t2 );
 …
                                         // combine remainder of list2, then unify
                                         return unifyExact(
                                                 t1, tupleFromDecls( crnt2, end2 ), env, need, have, open,
+                                                t1, tupleFromTypes( crnt2, end2 ), env, need, have, open,
                                                 noWiden(), symtab );
                                 } else if ( ! isTuple1 && isTuple2 ) {
                                         // combine remainder of list1, then unify
                                         return unifyExact(
                                                 tupleFromDecls( crnt1, end1 ), t2, env, need, have, open,
+                                                tupleFromTypes( crnt1, end1 ), t2, env, need, have, open,
                                                 noWiden(), symtab );
+                                }
 …
                         if ( crnt1 != end1 ) {
                                 // try unifying empty tuple with ttype
                                 const ast::Type * t1 = (*crnt1)->get_type();
+                                const ast::Type * t1 = *crnt1;
                                 if ( ! Tuples::isTtype( t1 ) ) return false;
                                 return unifyExact(
                                         t1, tupleFromDecls( crnt2, end2 ), env, need, have, open,
+                                        t1, tupleFromTypes( crnt2, end2 ), env, need, have, open,
                                         noWiden(), symtab );
                         } else if ( crnt2 != end2 ) {
                                 // try unifying empty tuple with ttype
                                 const ast::Type * t2 = (*crnt2)->get_type();
+                                const ast::Type * t2 = *crnt2;
                                 if ( ! Tuples::isTtype( t2 ) ) return false;
                                 return unifyExact(
                                         tupleFromDecls( crnt1, end1 ), t2, env, need, have, open,
+                                        tupleFromTypes( crnt1, end1 ), t2, env, need, have, open,
                                         noWiden(), symtab );
+                        }
 …
+                }
                 static bool unifyDeclList(
                         const std::vector< ast::ptr< ast::DeclWithType > > & list1,
                         const std::vector< ast::ptr< ast::DeclWithType > > & list2,
+                static bool unifyTypeList(
+                        const std::vector< ast::ptr< ast::Type > > & list1,
+                        const std::vector< ast::ptr< ast::Type > > & list2,
                         ast::TypeEnvironment & env, ast::AssertionSet & need, ast::AssertionSet & have,
                         const ast::OpenVarSet & open, const ast::SymbolTable & symtab
                 ) {
                         return unifyDeclList(
+                        return unifyTypeList(
                                 list1.begin(), list1.end(), list2.begin(), list2.end(), env, need, have, open,
                                 symtab );
+                }
                 static void markAssertionSet( ast::AssertionSet & assns, const ast::DeclWithType * assn ) {
+                static void markAssertionSet( ast::AssertionSet & assns, const ast::VariableExpr * assn ) {
                         auto i = assns.find( assn );
                         if ( i != assns.end() ) {
 …
                 static void markAssertions(
                         ast::AssertionSet & assn1, ast::AssertionSet & assn2,
                         const ast::ParameterizedType * type
+                        const ast::FunctionType * type
                 ) {
+                        for ( const auto & tyvar : type->forall ) {
+                                for ( const ast::DeclWithType * assert : tyvar->assertions ) {
+                                        markAssertionSet( assn1, assert );
+                                        markAssertionSet( assn2, assert );
+                                }
+                        for ( auto & assert : type->assertions ) {
+                                markAssertionSet( assn1, assert );
+                                markAssertionSet( assn2, assert );
+                        }
+                }
 …
                         ) return;
                         if ( ! unifyDeclList( params, params2, tenv, need, have, open, symtab ) ) return;
                         if ( ! unifyDeclList(
+                        if ( ! unifyTypeList( params, params2, tenv, need, have, open, symtab ) ) return;
+                        if ( ! unifyTypeList(
                                 func->returns, func2->returns, tenv, need, have, open, symtab ) ) return;
 …
         private:
+                template< typename RefType >
+                const RefType * handleRefType( const RefType * inst, const ast::Type * other ) {
+                // Returns: other, cast as XInstType
+                // Assigns this->result: whether types are compatible (up to generic parameters)
+                template< typename XInstType >
+                const XInstType * handleRefType( const XInstType * inst, const ast::Type * other ) {
                         // check that the other type is compatible and named the same
                         auto otherInst = dynamic_cast< const RefType * >( other );
                         result = otherInst && inst->name == otherInst->name;
+                        auto otherInst = dynamic_cast< const XInstType * >( other );
+                        this->result = otherInst && inst->name == otherInst->name;
                         return otherInst;
+                }
 …
+                }
                 template< typename RefType >
                 void handleGenericRefType( const RefType * inst, const ast::Type * other ) {
+                template< typename XInstType >
+                void handleGenericRefType( const XInstType * inst, const ast::Type * other ) {
                         // check that other type is compatible and named the same
                         const RefType * inst2 = handleRefType( inst, other );
                         if ( ! inst2 ) return;
+                        const XInstType * otherInst = handleRefType( inst, other );
+                        if ( ! this->result ) return;
                         // check that parameters of types unify, if any
                         const std::vector< ast::ptr< ast::Expr > > & params = inst->params;
                         const std::vector< ast::ptr< ast::Expr > > & params2 = inst2->params;
+                        const std::vector< ast::ptr< ast::Expr > > & params2 = otherInst->params;
                         auto it = params.begin();
 …
                 void postvisit( const ast::TypeInstType * typeInst ) {
                         assert( open.find( typeInst->name ) == open.end() );
+                        assert( open.find( *typeInst ) == open.end() );
                         handleRefType( typeInst, type2 );
+                }
 …
         private:
                 /// Creates a tuple type based on a list of Type
                 static ast::ptr< ast::Type > tupleFromTypes(
+                static const ast::Type * tupleFromTypes(
                         const std::vector< ast::ptr< ast::Type > > & tys
                 ) {
 …
                         ast::Pass<TtypeExpander_new> expander{ tenv };
                         const ast::Type * flat = tuple->accept( expander );
                         const ast::Type * flat2 = tuple2->accept( expander );
 …
         };
+        // size_t Unify_new::traceId = Stats::Heap::new_stacktrace_id("Unify_new");
         bool unify(
                         const ast::ptr<ast::Type> & type1, const ast::ptr<ast::Type> & type2,
 …
                 auto var2 = dynamic_cast< const ast::TypeInstType * >( type2 );
                 ast::OpenVarSet::const_iterator
                         entry1 = var1 ? open.find( var1->name ) : open.end(),
                         entry2 = var2 ? open.find( var2->name ) : open.end();
+                        entry1 = var1 ? open.find( *var1 ) : open.end(),
+                        entry2 = var2 ? open.find( *var2 ) : open.end();
                 bool isopen1 = entry1 != open.end();
                 bool isopen2 = entry2 != open.end();
 …
                         ast::Pass<Unify_new> comparator{ type2, env, need, have, open, widen, symtab };
                         type1->accept( comparator );
                         return comparator.pass.result;
+                        return comparator.core.result;
+                }
+        }
 …
                 // force t1 and t2 to be cloned if their qualifiers must be stripped, so that type1 and
                 // type2 are left unchanged; calling convention forces type{1,2}->strong_ref >= 1
+                ast::ptr<ast::Type> t1{ type1 }, t2{ type2 };
+                reset_qualifiers( t1 );
+                reset_qualifiers( t2 );
+                ast::Type * t1 = shallowCopy(type1.get());
+                ast::Type * t2 = shallowCopy(type2.get());
+                t1->qualifiers = {};
+                t2->qualifiers = {};
+                ast::ptr< ast::Type > t1_(t1);
+                ast::ptr< ast::Type > t2_(t2);
                 if ( unifyExact( t1, t2, env, need, have, open, widen, symtab ) ) {
-                        t1 = nullptr; t2 = nullptr; // release t1, t2 to avoid spurious clones
                         // if exact unification on unqualified types, try to merge qualifiers
                         if ( q1 == q2 || ( ( q1 > q2 || widen.first ) && ( q2 > q1 || widen.second ) ) ) {
                                 common = type1;
                                 reset_qualifiers( common, q1 | q2 );
+                                t1->qualifiers = q1 | q2;
+                                common = t1;
                                 return true;
                         } else {
 …
                 } else if (( common = commonType( t1, t2, widen, symtab, env, open ) )) {
-                        t1 = nullptr; t2 = nullptr; // release t1, t2 to avoid spurious clones
                         // no exact unification, but common type
+                        reset_qualifiers( common, q1 | q2 );
+                        auto c = shallowCopy(common.get());
+                        c->qualifiers = q1 | q2;
+                        common = c;
                         return true;
                 } else {
 …
         ast::ptr<ast::Type> extractResultType( const ast::FunctionType * func ) {
                 if ( func->returns.empty() ) return new ast::VoidType{};
                 if ( func->returns.size() == 1 ) return func->returns[0]->get_type();
+                if ( func->returns.size() == 1 ) return func->returns[0];
                 std::vector<ast::ptr<ast::Type>> tys;
                 for ( const ast::DeclWithType * decl : func->returns ) {
                         tys.emplace_back( decl->get_type() );
+                for ( const auto & decl : func->returns ) {
+                        tys.emplace_back( decl );
+                }
                 return new ast::TupleType{ std::move(tys) };

src/ResolvExpr/module.mk

-              r3c64c668
+              r58fe85a
       ResolvExpr/Alternative.cc \
       ResolvExpr/AlternativeFinder.cc \
+      ResolvExpr/AlternativeFinder.h \
+      ResolvExpr/Alternative.h \
       ResolvExpr/Candidate.cpp \
       ResolvExpr/CandidateFinder.cpp \
+      ResolvExpr/CandidateFinder.hpp \
+      ResolvExpr/Candidate.hpp \
       ResolvExpr/CastCost.cc \
       ResolvExpr/CommonType.cc \
       ResolvExpr/ConversionCost.cc \
+      ResolvExpr/ConversionCost.h \
+      ResolvExpr/Cost.h \
       ResolvExpr/CurrentObject.cc \
+      ResolvExpr/CurrentObject.h \
       ResolvExpr/ExplodedActual.cc \
+      ResolvExpr/ExplodedActual.h \
       ResolvExpr/ExplodedArg.cpp \
+      ResolvExpr/ExplodedArg.hpp \
       ResolvExpr/FindOpenVars.cc \
+      ResolvExpr/FindOpenVars.h \
       ResolvExpr/Occurs.cc \
       ResolvExpr/PolyCost.cc \
 …
       ResolvExpr/PtrsCastable.cc \
       ResolvExpr/RenameVars.cc \
+      ResolvExpr/RenameVars.h \
       ResolvExpr/ResolveAssertions.cc \
+      ResolvExpr/ResolveAssertions.h \
       ResolvExpr/Resolver.cc \
+      ResolvExpr/Resolver.h \
       ResolvExpr/ResolveTypeof.cc \
+      ResolvExpr/ResolveTypeof.h \
+      ResolvExpr/ResolvMode.h \
       ResolvExpr/SatisfyAssertions.cpp \
+      ResolvExpr/SatisfyAssertions.hpp \
       ResolvExpr/SpecCost.cc \
       ResolvExpr/TypeEnvironment.cc \
+      ResolvExpr/Unify.cc
+      ResolvExpr/TypeEnvironment.h \
+      ResolvExpr/typeops.h \
+      ResolvExpr/Unify.cc \
+      ResolvExpr/Unify.h \
+      ResolvExpr/WidenMode.h
+SRC += $(SRC_RESOLVEXPR) ResolvExpr/AlternativePrinter.cc
+SRC += $(SRC_RESOLVEXPR) ResolvExpr/AlternativePrinter.cc ResolvExpr/AlternativePrinter.h
 SRCDEMANGLE += $(SRC_RESOLVEXPR)

src/ResolvExpr/typeops.h

-              r3c64c668
+              r58fe85a
 // Created On       : Sun May 17 07:28:22 2015
 // Last Modified By : Andrew Beach
 // Last Modified On : Thu Aug  8 16:36:00 2019
 // Update Count     : 5
+// Last Modified On : Tue Oct  1 09:45:00 2019
+// Update Count     : 6
 //
 …
                 const SymTab::Indexer & indexer, const TypeEnvironment & env );
         Cost castCost(
                 const ast::Type * src, const ast::Type * dst, const ast::SymbolTable & symtab,
                 const ast::TypeEnvironment & env );
+                const ast::Type * src, const ast::Type * dst, bool srcIsLvalue,
+                const ast::SymbolTable & symtab, const ast::TypeEnvironment & env );
         // in ConversionCost.cc
 …
                 const SymTab::Indexer & indexer, const TypeEnvironment & env );
         Cost conversionCost(
                 const ast::Type * src, const ast::Type * dst, const ast::SymbolTable & symtab,
                 const ast::TypeEnvironment & env );
+                const ast::Type * src, const ast::Type * dst, bool srcIsLvalue,
+                const ast::SymbolTable & symtab, const ast::TypeEnvironment & env );
         // in AlternativeFinder.cc

src/SymTab/Autogen.cc

-              r3c64c668
+              r58fe85a
 #include "SynTree/Type.h"          // for FunctionType, Type, TypeInstType
 #include "SynTree/Visitor.h"       // for maybeAccept, Visitor, acceptAll
+#include "CompilationState.h"
 class Attribute;
 …
+        }
+        // Returns, by value, the `params` list of the base declaration behind a struct or union instance type (a shallow copy: only the ast::ptr handles are copied); any other type yields an empty vector.
+        std::vector<ast::ptr<ast::TypeDecl>> getGenericParams (const ast::Type * t) {
+                if (auto structInst = dynamic_cast<const ast::StructInstType*>(t)) {
+                        return structInst->base->params; // NOTE(review): base is dereferenced without a null check
+                }
+                if (auto unionInst = dynamic_cast<const ast::UnionInstType*>(t)) {
+                        return unionInst->base->params; // NOTE(review): base is dereferenced without a null check
+                }
+                return {}; // neither a struct nor a union instance type
+        }
         /// given type T, generate type of default ctor/dtor, i.e. function type void (*) (T *)
         FunctionType * genDefaultType( Type * paramType, bool maybePolymorphic ) {
 …
                 ftype->parameters.push_back( dstParam );
                 return ftype;
+        }
+        /// Builds a new ast::FunctionDecl named fname at loc whose only declared parameter is an object "_dst" of type reference-to-paramType (Cforall linkage) and whose body is a fresh, empty CompoundStmt; when maybePolymorphic is true the type parameters returned by getGenericParams( paramType ) are moved into the declaration, otherwise the type-parameter list is empty.
+        ast::FunctionDecl * genDefaultFunc(const CodeLocation loc, const std::string fname, const ast::Type * paramType, bool maybePolymorphic) {
+                std::vector<ast::ptr<ast::TypeDecl>> typeParams;
+                if (maybePolymorphic) typeParams = getGenericParams(paramType);
+                auto dstParam = new ast::ObjectDecl(loc, "_dst", new ast::ReferenceType(paramType), nullptr, {}, ast::Linkage::Cforall);
+                return new ast::FunctionDecl(loc, fname, std::move(typeParams), {dstParam}, {}, new ast::CompoundStmt(loc)); // the empty {} is presumably the return list — confirm against the FunctionDecl constructor
+        }
 …
         void FuncGenerator::resolve( FunctionDecl * dcl ) {
                 try {
+                        ResolvExpr::resolveDecl( dcl, indexer );
+                        if (!useNewAST) // attempt to delay resolver call
+                                ResolvExpr::resolveDecl( dcl, indexer );
                         if ( functionNesting == 0 ) {
                                 // forward declare if top-level struct, so that
 …
                 } catch ( SemanticErrorException & ) {
                         // okay if decl does not resolve - that means the function should not be generated
+                        delete dcl;
+                        // delete dcl;
+                        delete dcl->statements;
+                        dcl->statements = nullptr;
+                        dcl->isDeleted = true;
+                        definitions.push_back( dcl );
+                        indexer.addId( dcl );
+                }
+        }

src/SymTab/Autogen.h

-              r3c64c668
+              r58fe85a
 #include "AST/Decl.hpp"
+#include "AST/Eval.hpp"
 #include "AST/Expr.hpp"
 #include "AST/Init.hpp"
 …
         /// maybePolymorphic is true if the resulting FunctionType is allowed to be polymorphic
         FunctionType * genDefaultType( Type * paramType, bool maybePolymorphic = true );
+        ast::FunctionDecl * genDefaultFunc(const CodeLocation loc, const std::string fname, const ast::Type * paramType, bool maybePolymorphic = true);
         /// generate the type of a copy constructor for paramType.
 …
                 fExpr->args.emplace_back( dstParam );
                 const ast::Stmt * listInit = srcParam.buildListInit( fExpr );
+                ast::ptr<ast::Stmt> listInit = srcParam.buildListInit( fExpr );
                 // fetch next set of arguments
 …
+                }
+                ast::ptr< ast::Expr > begin, end, cmp, update;
+                ast::ptr< ast::Expr > begin, end;
+                std::string cmp, update;
                 if ( forward ) {
 …
                         begin = ast::ConstantExpr::from_int( loc, 0 );
                         end = array->dimension;
                         cmp = new ast::NameExpr{ loc, "?<?" };
                         update = new ast::NameExpr{ loc, "++?" };
+                        cmp = "?<?";
+                        update = "++?";
                 } else {
                         // generate: for ( int i = N-1; i >= 0; --i )
+                        begin = new ast::UntypedExpr{
+                                loc, new ast::NameExpr{ loc, "?-?" },
+                                { array->dimension, ast::ConstantExpr::from_int( loc, 1 ) } };
+                        begin = ast::call(
+                                loc, "?-?", array->dimension, ast::ConstantExpr::from_int( loc, 1 ) );
                         end = ast::ConstantExpr::from_int( loc, 0 );
                         cmp = new ast::NameExpr{ loc, "?>=?" };
                         update = new ast::NameExpr{ loc, "--?" };
+                        cmp = "?>=?";
+                        update = "--?";
+                }
 …
                         loc, indexName.newName(), new ast::BasicType{ ast::BasicType::SignedInt },
                         new ast::SingleInit{ loc, begin } };
+                ast::ptr< ast::Expr > cond = new ast::UntypedExpr{
+                        loc, cmp, { new ast::VariableExpr{ loc, index }, end } };
+                ast::ptr< ast::Expr > inc = new ast::UntypedExpr{
+                        loc, update, { new ast::VariableExpr{ loc, index } } };
+                ast::ptr< ast::Expr > dstIndex = new ast::UntypedExpr{
+                        loc, new ast::NameExpr{ loc, "?[?]" },
+                        { dstParam, new ast::VariableExpr{ loc, index } } };
+                ast::ptr< ast::Expr > indexVar = new ast::VariableExpr{ loc, index };
+                ast::ptr< ast::Expr > cond = ast::call( loc, cmp, indexVar, end );
+                ast::ptr< ast::Expr > inc = ast::call( loc, update, indexVar );
+                ast::ptr< ast::Expr > dstIndex = ast::call( loc, "?[?]", dstParam, indexVar );
                 // srcParam must keep track of the array indices to build the source parameter and/or
                 // array list initializer
                 srcParam.addArrayIndex( new ast::VariableExpr{ loc, index }, array->dimension );
+                srcParam.addArrayIndex( indexVar, array->dimension );
                 // for stmt's body, eventually containing call
 …
                 if ( isUnnamedBitfield( obj ) ) return {};
                 ast::ptr< ast::Type > addCast = nullptr;
+                ast::ptr< ast::Type > addCast;
                 if ( (fname == "?{}" || fname == "^?{}") && ( ! obj || ( obj && ! obj->bitfieldWidth ) ) ) {
                         assert( dstParam->result );

src/SymTab/FixFunction.cc

-              r3c64c668
+              r58fe85a
                 bool isVoid = false;
                 void premutate( const ast::FunctionDecl * ) { visit_children = false; }
+                void previsit( const ast::FunctionDecl * ) { visit_children = false; }
                 const ast::DeclWithType * postmutate( const ast::FunctionDecl * func ) {
+                const ast::DeclWithType * postvisit( const ast::FunctionDecl * func ) {
                         return new ast::ObjectDecl{
                                 func->location, func->name, new ast::PointerType{ func->type }, nullptr,
 …
+                }
                 void premutate( const ast::ArrayType * ) { visit_children = false; }
+                void previsit( const ast::ArrayType * ) { visit_children = false; }
                 const ast::Type * postmutate( const ast::ArrayType * array ) {
+                const ast::Type * postvisit( const ast::ArrayType * array ) {
                         return new ast::PointerType{
                                 array->base, array->dimension, array->isVarLen, array->isStatic,
 …
+                }
                 void premutate( const ast::VoidType * ) { isVoid = true; }
+                void previsit( const ast::VoidType * ) { isVoid = true; }
                 void premutate( const ast::BasicType * ) { visit_children = false; }
                 void premutate( const ast::PointerType * ) { visit_children = false; }
                 void premutate( const ast::StructInstType * ) { visit_children = false; }
                 void premutate( const ast::UnionInstType * ) { visit_children = false; }
                 void premutate( const ast::EnumInstType * ) { visit_children = false; }
                 void premutate( const ast::TraitInstType * ) { visit_children = false; }
                 void premutate( const ast::TypeInstType * ) { visit_children = false; }
                 void premutate( const ast::TupleType * ) { visit_children = false; }
                 void premutate( const ast::VarArgsType * ) { visit_children = false; }
                 void premutate( const ast::ZeroType * ) { visit_children = false; }
                 void premutate( const ast::OneType * ) { visit_children = false; }
+                void previsit( const ast::BasicType * ) { visit_children = false; }
+                void previsit( const ast::PointerType * ) { visit_children = false; }
+                void previsit( const ast::StructInstType * ) { visit_children = false; }
+                void previsit( const ast::UnionInstType * ) { visit_children = false; }
+                void previsit( const ast::EnumInstType * ) { visit_children = false; }
+                void previsit( const ast::TraitInstType * ) { visit_children = false; }
+                void previsit( const ast::TypeInstType * ) { visit_children = false; }
+                void previsit( const ast::TupleType * ) { visit_children = false; }
+                void previsit( const ast::VarArgsType * ) { visit_children = false; }
+                void previsit( const ast::ZeroType * ) { visit_children = false; }
+                void previsit( const ast::OneType * ) { visit_children = false; }
         };
 } // anonymous namespace
 …
         ast::Pass< FixFunction_new > fixer;
         dwt = dwt->accept( fixer );
         isVoid |= fixer.pass.isVoid;
+        isVoid |= fixer.core.isVoid;
         return dwt;
+}

src/SymTab/Mangler.cc

-              r3c64c668
+              r58fe85a
 // Created On       : Sun May 17 21:40:29 2015
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Sat Feb 15 13:55:12 2020
 // Update Count     : 33
+// Last Modified On : Wed Nov 18 12:01:38 2020
+// Update Count     : 64
 //
 #include "Mangler.h"
 …
                                 void postvisit( const QualifiedType * qualType );
                                 std::string get_mangleName() { return mangleName.str(); }
+                                std::string get_mangleName() { return mangleName; }
                           private:
                                 std::ostringstream mangleName;  ///< Mangled name being constructed
+                                std::string mangleName;         ///< Mangled name being constructed
                                 typedef std::map< std::string, std::pair< int, int > > VarMapType;
                                 VarMapType varNums;             ///< Map of type variables to indices
 …
                                         isTopLevel = false;
                                 } // if
                                 mangleName << Encoding::manglePrefix;
+                                mangleName += Encoding::manglePrefix;
                                 const CodeGen::OperatorInfo * opInfo = CodeGen::operatorLookup( declaration->get_name() );
                                 if ( opInfo ) {
                                         mangleName << opInfo->outputName.size() << opInfo->outputName;
+                                        mangleName += std::to_string( opInfo->outputName.size() ) + opInfo->outputName;
                                 } else {
                                         mangleName << declaration->name.size() << declaration->name;
+                                        mangleName += std::to_string( declaration->name.size() ) + declaration->name;
                                 } // if
                                 maybeAccept( declaration->get_type(), *visitor );
 …
                                         // so they need a different name mangling
                                         if ( declaration->get_linkage() == LinkageSpec::AutoGen ) {
                                                 mangleName << Encoding::autogen;
+                                                mangleName += Encoding::autogen;
                                         } else if ( declaration->get_linkage() == LinkageSpec::Intrinsic ) {
                                                 mangleName << Encoding::intrinsic;
+                                                mangleName += Encoding::intrinsic;
                                         } else {
                                                 // if we add another kind of overridable function, this has to change
 …
                         void Mangler_old::postvisit( const VoidType * voidType ) {
                                 printQualifiers( voidType );
                                 mangleName << Encoding::void_t;
+                                mangleName += Encoding::void_t;
+                        }
 …
                                 printQualifiers( basicType );
                                 assertf( basicType->kind < BasicType::NUMBER_OF_BASIC_TYPES, "Unhandled basic type: %d", basicType->kind );
                                 mangleName << Encoding::basicTypes[ basicType->kind ];
+                                mangleName += Encoding::basicTypes[ basicType->kind ];
+                        }
 …
                                 printQualifiers( pointerType );
                                 // mangle void (*f)() and void f() to the same name to prevent overloading on functions and function pointers
                                 if ( ! dynamic_cast<FunctionType *>( pointerType->base ) ) mangleName << Encoding::pointer;
+                                if ( ! dynamic_cast<FunctionType *>( pointerType->base ) ) mangleName += Encoding::pointer;
                                 maybeAccept( pointerType->base, *visitor );
+                        }
 …
                                 // TODO: encode dimension
                                 printQualifiers( arrayType );
                                 mangleName << Encoding::array << "0";
+                                mangleName += Encoding::array + "0";
                                 maybeAccept( arrayType->base, *visitor );
+                        }
 …
                         void Mangler_old::postvisit( const FunctionType * functionType ) {
                                 printQualifiers( functionType );
                                 mangleName << Encoding::function;
+                                mangleName += Encoding::function;
                                 // turn on inFunctionType so that printQualifiers does not print most qualifiers for function parameters,
                                 // since qualifiers on outermost parameter type do not differentiate function types, e.g.,
 …
                                 inFunctionType = true;
                                 std::list< Type* > returnTypes = getTypes( functionType->returnVals );
                                 if (returnTypes.empty()) mangleName << Encoding::void_t;
+                                if (returnTypes.empty()) mangleName += Encoding::void_t;
                                 else acceptAll( returnTypes, *visitor );
                                 mangleName << "_";
+                                mangleName += "_";
                                 std::list< Type* > paramTypes = getTypes( functionType->parameters );
                                 acceptAll( paramTypes, *visitor );
                                 mangleName << "_";
+                                mangleName += "_";
+                        }
 …
                                 printQualifiers( refType );
                                 mangleName << prefix << refType->name.length() << refType->name;
+                                mangleName += prefix + std::to_string( refType->name.length() ) + refType->name;
                                 if ( mangleGenericParams ) {
                                         const std::list< Expression* > & params = refType->parameters;
                                         if ( ! params.empty() ) {
                                                 mangleName << "_";
+                                                mangleName += "_";
                                                 for ( const Expression * param : params ) {
                                                         const TypeExpr * paramType = dynamic_cast< const TypeExpr * >( param );
 …
                                                         maybeAccept( paramType->type, *visitor );
+                                                }
                                                 mangleName << "_";
+                                                mangleName += "_";
+                                        }
+                                }
 …
                                         // are first found and prefixing with the appropriate encoding for the type class.
                                         assertf( varNum->second.second < TypeDecl::NUMBER_OF_KINDS, "Unhandled type variable kind: %d", varNum->second.second );
                                         mangleName << Encoding::typeVariables[varNum->second.second] << varNum->second.first;
+                                        mangleName += Encoding::typeVariables[varNum->second.second] + std::to_string( varNum->second.first );
                                 } // if
+                        }
 …
                         void Mangler_old::postvisit( const TraitInstType * inst ) {
                                 printQualifiers( inst );
                                 mangleName << inst->name.size() << inst->name;
+                                mangleName += std::to_string( inst->name.size() ) + inst->name;
+                        }
                         void Mangler_old::postvisit( const TupleType * tupleType ) {
                                 printQualifiers( tupleType );
                                 mangleName << Encoding::tuple << tupleType->types.size();
+                                mangleName += Encoding::tuple + std::to_string( tupleType->types.size() );
                                 acceptAll( tupleType->types, *visitor );
+                        }
 …
                                 printQualifiers( varArgsType );
                                 static const std::string vargs = "__builtin_va_list";
                                 mangleName << Encoding::type << vargs.size() << vargs;
+                                mangleName += Encoding::type + std::to_string( vargs.size() ) + vargs;
+                        }
                         void Mangler_old::postvisit( const ZeroType * ) {
                                 mangleName << Encoding::zero;
+                                mangleName += Encoding::zero;
+                        }
                         void Mangler_old::postvisit( const OneType * ) {
                                 mangleName << Encoding::one;
+                                mangleName += Encoding::one;
+                        }
 …
                                         // N marks the start of a qualified type
                                         inQualifiedType = true;
                                         mangleName << Encoding::qualifiedTypeStart;
+                                        mangleName += Encoding::qualifiedTypeStart;
+                                }
                                 maybeAccept( qualType->parent, *visitor );
 …
                                         // E marks the end of a qualified type
                                         inQualifiedType = false;
                                         mangleName << Encoding::qualifiedTypeEnd;
+                                        mangleName += Encoding::qualifiedTypeEnd;
+                                }
+                        }
 …
                                 assertf(false, "Mangler_old should not visit typedecl: %s", toCString(decl));
                                 assertf( decl->kind < TypeDecl::NUMBER_OF_KINDS, "Unhandled type variable kind: %d", decl->kind );
                                 mangleName << Encoding::typeVariables[ decl->kind ] << ( decl->name.length() ) << decl->name;
+                                mangleName += Encoding::typeVariables[ decl->kind ] + std::to_string( decl->name.length() ) + decl->name;
+                        }
 …
                                         std::list< std::string > assertionNames;
                                         int dcount = 0, fcount = 0, vcount = 0, acount = 0;
                                         mangleName << Encoding::forall;
+                                        mangleName += Encoding::forall;
                                         for ( const TypeDecl * i : type->forall ) {
                                                 switch ( i->kind ) {
 …
                                                 } // for
                                         } // for
+                                        mangleName << dcount << "_" << fcount << "_" << vcount << "_" << acount << "_";
+                                        std::copy( assertionNames.begin(), assertionNames.end(), std::ostream_iterator< std::string >( mangleName, "" ) );
+                                        mangleName << "_";
+                                        mangleName += std::to_string( dcount ) + "_" + std::to_string( fcount ) + "_" + std::to_string( vcount ) + "_" + std::to_string( acount ) + "_";
+                                        for(const auto & a : assertionNames) mangleName += a;
+//                                      std::copy( assertionNames.begin(), assertionNames.end(), std::ostream_iterator< std::string >( mangleName, "" ) );
+                                        mangleName += "_";
                                 } // if
                                 if ( ! inFunctionType ) {
                                         // these qualifiers do not distinguish the outermost type of a function parameter
                                         if ( type->get_const() ) {
                                                 mangleName << Encoding::qualifiers.at(Type::Const);
+                                                mangleName += Encoding::qualifiers.at(Type::Const);
                                         } // if
                                         if ( type->get_volatile() ) {
                                                 mangleName << Encoding::qualifiers.at(Type::Volatile);
+                                                mangleName += Encoding::qualifiers.at(Type::Volatile);
                                         } // if
                                         // Removed due to restrict not affecting function compatibility in GCC
                                         // if ( type->get_isRestrict() ) {
                                         //      mangleName << "E";
+                                        //      mangleName += "E";
                                         // } // if
                                         if ( type->get_atomic() ) {
                                                 mangleName << Encoding::qualifiers.at(Type::Atomic);
+                                                mangleName += Encoding::qualifiers.at(Type::Atomic);
                                         } // if
+                                }
                                 if ( type->get_mutex() ) {
                                         mangleName << Encoding::qualifiers.at(Type::Mutex);
+                                        mangleName += Encoding::qualifiers.at(Type::Mutex);
                                 } // if
                                 if ( inFunctionType ) {
 …
+                                }
+                        }
                 }       // namespace
+                } // namespace
         } // namespace Mangler
 } // namespace SymTab
 …
                         void postvisit( const ast::QualifiedType * qualType );
                         std::string get_mangleName() { return mangleName.str(); }
+                        std::string get_mangleName() { return mangleName; }
                   private:
                         std::ostringstream mangleName;  ///< Mangled name being constructed
+                        std::string mangleName;         ///< Mangled name being constructed
                         typedef std::map< std::string, std::pair< int, int > > VarMapType;
                         VarMapType varNums;             ///< Map of type variables to indices
 …
                   private:
                         void mangleDecl( const ast::DeclWithType *declaration );
                         void mangleRef( const ast::ReferenceToType *refType, std::string prefix );
+                        void mangleRef( const ast::BaseInstType *refType, std::string prefix );
                         void printQualifiers( const ast::Type *type );
 …
                 ast::Pass<Mangler_new> mangler( mode );
                 maybeAccept( decl, mangler );
                 return mangler.pass.get_mangleName();
+                return mangler.core.get_mangleName();
+        }
 …
                                 isTopLevel = false;
                         } // if
                         mangleName << Encoding::manglePrefix;
+                        mangleName += Encoding::manglePrefix;
                         const CodeGen::OperatorInfo * opInfo = CodeGen::operatorLookup( decl->name );
                         if ( opInfo ) {
                                 mangleName << opInfo->outputName.size() << opInfo->outputName;
+                                mangleName += std::to_string( opInfo->outputName.size() ) + opInfo->outputName;
                         } else {
                                 mangleName << decl->name.size() << decl->name;
+                                mangleName += std::to_string( decl->name.size() ) + decl->name;
                         } // if
                         maybeAccept( decl->get_type(), *visitor );
 …
                                 // so they need a different name mangling
                                 if ( decl->linkage == ast::Linkage::AutoGen ) {
                                         mangleName << Encoding::autogen;
+                                        mangleName += Encoding::autogen;
                                 } else if ( decl->linkage == ast::Linkage::Intrinsic ) {
                                         mangleName << Encoding::intrinsic;
+                                        mangleName += Encoding::intrinsic;
                                 } else {
                                         // if we add another kind of overridable function, this has to change
 …
                 void Mangler_new::postvisit( const ast::VoidType * voidType ) {
                         printQualifiers( voidType );
                         mangleName << Encoding::void_t;
+                        mangleName += Encoding::void_t;
+                }
 …
                         printQualifiers( basicType );
                         assertf( basicType->kind < ast::BasicType::NUMBER_OF_BASIC_TYPES, "Unhandled basic type: %d", basicType->kind );
                         mangleName << Encoding::basicTypes[ basicType->kind ];
+                        mangleName += Encoding::basicTypes[ basicType->kind ];
+                }
 …
                         printQualifiers( pointerType );
                         // mangle void (*f)() and void f() to the same name to prevent overloading on functions and function pointers
                         if ( ! pointerType->base.as<ast::FunctionType>() ) mangleName << Encoding::pointer;
+                        if ( ! pointerType->base.as<ast::FunctionType>() ) mangleName += Encoding::pointer;
                         maybe_accept( pointerType->base.get(), *visitor );
+                }
 …
                         // TODO: encode dimension
                         printQualifiers( arrayType );
                         mangleName << Encoding::array << "0";
+                        mangleName += Encoding::array + "0";
                         maybeAccept( arrayType->base.get(), *visitor );
+                }
 …
                 void Mangler_new::postvisit( const ast::FunctionType * functionType ) {
                         printQualifiers( functionType );
                         mangleName << Encoding::function;
+                        mangleName += Encoding::function;
                         // turn on inFunctionType so that printQualifiers does not print most qualifiers for function parameters,
                         // since qualifiers on outermost parameter type do not differentiate function types, e.g.,
 …
                         GuardValue( inFunctionType );
                         inFunctionType = true;
+                        std::vector< ast::ptr< ast::Type > > returnTypes = getTypes( functionType->returns );
+                        if (returnTypes.empty()) mangleName << Encoding::void_t;
+                        else accept_each( returnTypes, *visitor );
+                        mangleName << "_";
+                        std::vector< ast::ptr< ast::Type > > paramTypes = getTypes( functionType->params );
+                        accept_each( paramTypes, *visitor );
+                        mangleName << "_";
+                }
+                void Mangler_new::mangleRef( const ast::ReferenceToType * refType, std::string prefix ) {
+                        if (functionType->returns.empty()) mangleName += Encoding::void_t;
+                        else accept_each( functionType->returns, *visitor );
+                        mangleName += "_";
+                        accept_each( functionType->params, *visitor );
+                        mangleName += "_";
+                }
+                void Mangler_new::mangleRef( const ast::BaseInstType * refType, std::string prefix ) {
                         printQualifiers( refType );
                         mangleName << prefix << refType->name.length() << refType->name;
+                        mangleName += prefix + std::to_string( refType->name.length() ) + refType->name;
                         if ( mangleGenericParams ) {
                                 if ( ! refType->params.empty() ) {
                                         mangleName << "_";
+                                        mangleName += "_";
                                         for ( const ast::Expr * param : refType->params ) {
                                                 auto paramType = dynamic_cast< const ast::TypeExpr * >( param );
 …
                                                 maybeAccept( paramType->type.get(), *visitor );
+                                        }
                                         mangleName << "_";
+                                        mangleName += "_";
+                                }
+                        }
 …
                                 // are first found and prefixing with the appropriate encoding for the type class.
                                 assertf( varNum->second.second < TypeDecl::NUMBER_OF_KINDS, "Unhandled type variable kind: %d", varNum->second.second );
                                 mangleName << Encoding::typeVariables[varNum->second.second] << varNum->second.first;
+                                mangleName += Encoding::typeVariables[varNum->second.second] + std::to_string( varNum->second.first );
                         } // if
+                }
 …
                 void Mangler_new::postvisit( const ast::TraitInstType * inst ) {
                         printQualifiers( inst );
                         mangleName << inst->name.size() << inst->name;
+                        mangleName += std::to_string( inst->name.size() ) + inst->name;
+                }
                 void Mangler_new::postvisit( const ast::TupleType * tupleType ) {
                         printQualifiers( tupleType );
                         mangleName << Encoding::tuple << tupleType->types.size();
+                        mangleName += Encoding::tuple + std::to_string( tupleType->types.size() );
                         accept_each( tupleType->types, *visitor );
+                }
 …
                         printQualifiers( varArgsType );
                         static const std::string vargs = "__builtin_va_list";
                         mangleName << Encoding::type << vargs.size() << vargs;
+                        mangleName += Encoding::type + std::to_string( vargs.size() ) + vargs;
+                }
                 void Mangler_new::postvisit( const ast::ZeroType * ) {
                         mangleName << Encoding::zero;
+                        mangleName += Encoding::zero;
+                }
                 void Mangler_new::postvisit( const ast::OneType * ) {
                         mangleName << Encoding::one;
+                        mangleName += Encoding::one;
+                }
 …
                                 // N marks the start of a qualified type
                                 inQualifiedType = true;
                                 mangleName << Encoding::qualifiedTypeStart;
+                                mangleName += Encoding::qualifiedTypeStart;
+                        }
                         maybeAccept( qualType->parent.get(), *visitor );
 …
                                 // E marks the end of a qualified type
                                 inQualifiedType = false;
                                 mangleName << Encoding::qualifiedTypeEnd;
+                                mangleName += Encoding::qualifiedTypeEnd;
+                        }
+                }
 …
                         assertf(false, "Mangler_new should not visit typedecl: %s", toCString(decl));
                         assertf( decl->kind < ast::TypeDecl::Kind::NUMBER_OF_KINDS, "Unhandled type variable kind: %d", decl->kind );
                         mangleName << Encoding::typeVariables[ decl->kind ] << ( decl->name.length() ) << decl->name;
+                        mangleName += Encoding::typeVariables[ decl->kind ] + std::to_string( decl->name.length() ) + decl->name;
+                }
 …
                         // skip if not including qualifiers
                         if ( typeMode ) return;
                         if ( auto ptype = dynamic_cast< const ast::ParameterizedType * >(type) ) {
+                        if ( auto ptype = dynamic_cast< const ast::FunctionType * >(type) ) {
                                 if ( ! ptype->forall.empty() ) {
                                         std::list< std::string > assertionNames;
                                         int dcount = 0, fcount = 0, vcount = 0, acount = 0;
                                         mangleName << Encoding::forall;
                                         for ( const ast::TypeDecl * decl : ptype->forall ) {
+                                        mangleName += Encoding::forall;
+                                        for ( auto & decl : ptype->forall ) {
                                                 switch ( decl->kind ) {
                                                 case ast::TypeDecl::Kind::Dtype:
 …
                                                 } // switch
                                                 varNums[ decl->name ] = std::make_pair( nextVarNum, (int)decl->kind );
-                                                for ( const ast::DeclWithType * assert : decl->assertions ) {
-                                                        ast::Pass<Mangler_new> sub_mangler(
-                                                                mangleOverridable, typeMode, mangleGenericParams, nextVarNum, varNums );
-                                                        assert->accept( sub_mangler );
-                                                        assertionNames.push_back( sub_mangler.pass.get_mangleName() );
-                                                        acount++;
-                                                } // for
                                         } // for
+                                        mangleName << dcount << "_" << fcount << "_" << vcount << "_" << acount << "_";
+                                        std::copy( assertionNames.begin(), assertionNames.end(), std::ostream_iterator< std::string >( mangleName, "" ) );
+                                        mangleName << "_";
+                                        for ( auto & assert : ptype->assertions ) {
+                                                ast::Pass<Mangler_new> sub_mangler(
+                                                        mangleOverridable, typeMode, mangleGenericParams, nextVarNum, varNums );
+                                                assert->var->accept( sub_mangler );
+                                                assertionNames.push_back( sub_mangler.core.get_mangleName() );
+                                                acount++;
+                                        } // for
+                                        mangleName += std::to_string( dcount ) + "_" + std::to_string( fcount ) + "_" + std::to_string( vcount ) + "_" + std::to_string( acount ) + "_";
+                                        for(const auto & a : assertionNames) mangleName += a;
+//                                      std::copy( assertionNames.begin(), assertionNames.end(), std::ostream_iterator< std::string >( mangleName, "" ) );
+                                        mangleName += "_";
                                 } // if
                         } // if
 …
                                 // these qualifiers do not distinguish the outermost type of a function parameter
                                 if ( type->is_const() ) {
                                         mangleName << Encoding::qualifiers.at(Type::Const);
+                                        mangleName += Encoding::qualifiers.at(Type::Const);
                                 } // if
                                 if ( type->is_volatile() ) {
                                         mangleName << Encoding::qualifiers.at(Type::Volatile);
+                                        mangleName += Encoding::qualifiers.at(Type::Volatile);
                                 } // if
                                 // Removed due to restrict not affecting function compatibility in GCC
                                 // if ( type->get_isRestrict() ) {
                                 //      mangleName << "E";
+                                //      mangleName += "E";
                                 // } // if
                                 if ( type->is_atomic() ) {
                                         mangleName << Encoding::qualifiers.at(Type::Atomic);
+                                        mangleName += Encoding::qualifiers.at(Type::Atomic);
                                 } // if
+                        }
                         if ( type->is_mutex() ) {
                                 mangleName << Encoding::qualifiers.at(Type::Mutex);
+                                mangleName += Encoding::qualifiers.at(Type::Mutex);
                         } // if
                         if ( inFunctionType ) {

src/SymTab/Validate.cc

-              r3c64c668
+              r58fe85a
 #include "Common/UniqueName.h"         // for UniqueName
 #include "Common/utility.h"            // for operator+, cloneAll, deleteAll
+#include "CompilationState.h"          // skip some passes in new-ast build
 #include "Concurrency/Keywords.h"      // for applyKeywords
 #include "FixFunction.h"               // for FixFunction
 …
         };
         struct ArrayLength : public WithIndexer {
+        struct InitializerLength {
                 /// for array types without an explicit length, compute the length and store it so that it
                 /// is known to the rest of the phases. For example,
 …
                 void previsit( ObjectDecl * objDecl );
+        };
+        struct ArrayLength : public WithIndexer {
+                static void computeLength( std::list< Declaration * > & translationUnit );
                 void previsit( ArrayType * arrayType );
         };
 …
                                 mutateAll( translationUnit, compoundliteral );
                         });
+                        Stats::Time::TimeBlock("Resolve With Expressions", [&]() {
+                                ResolvExpr::resolveWithExprs( translationUnit ); // must happen before FixObjectType because user-code is resolved and may contain with variables
+                        });
+                        if (!useNewAST) {
+                                Stats::Time::TimeBlock("Resolve With Expressions", [&]() {
+                                        ResolvExpr::resolveWithExprs( translationUnit ); // must happen before FixObjectType because user-code is resolved and may contain with variables
+                                });
+                        }
+                }
+                {
                         Stats::Heap::newPass("validate-F");
                         Stats::Time::BlockGuard guard("validate-F");
+                        Stats::Time::TimeCall("Fix Object Type",
+                                FixObjectType::fix, translationUnit);
+                        Stats::Time::TimeCall("Array Length",
+                                ArrayLength::computeLength, translationUnit);
+                        if (!useNewAST) {
+                                Stats::Time::TimeCall("Fix Object Type",
+                                        FixObjectType::fix, translationUnit);
+                        }
+                        Stats::Time::TimeCall("Initializer Length",
+                                InitializerLength::computeLength, translationUnit);
+                        if (!useNewAST) {
+                                Stats::Time::TimeCall("Array Length",
+                                        ArrayLength::computeLength, translationUnit);
+                        }
                         Stats::Time::TimeCall("Find Special Declarations",
                                 Validate::findSpecialDecls, translationUnit);
                         Stats::Time::TimeCall("Fix Label Address",
                                 mutateAll<LabelAddressFixer>, translationUnit, labelAddrFixer);
+                        Stats::Time::TimeCall("Handle Attributes",
+                                Validate::handleAttributes, translationUnit);
+                        if (!useNewAST) {
+                                Stats::Time::TimeCall("Handle Attributes",
+                                        Validate::handleAttributes, translationUnit);
+                        }
+                }
+        }
 …
+        }
+        static bool isNonParameterAttribute( Attribute * attr ) {
+                static const std::vector<std::string> bad_names = {
+                        "aligned", "__aligned__",
+                };
+                for ( auto name : bad_names ) {
+                        if ( name == attr->name ) {
+                                return true;
+                        }
+                }
+                return false;
+        }
         Type * ReplaceTypedef::postmutate( TypeInstType * typeInst ) {
                 // instances of typedef types will come here. If it is an instance
 …
                         ret->location = typeInst->location;
                         ret->get_qualifiers() |= typeInst->get_qualifiers();
+                        // attributes are not carried over from typedef to function parameters/return values
+                        if ( ! inFunctionType ) {
+                                ret->attributes.splice( ret->attributes.end(), typeInst->attributes );
+                        } else {
+                                deleteAll( ret->attributes );
+                                ret->attributes.clear();
+                        }
+                        // GCC ignores certain attributes if they arrive by typedef, this mimics that.
+                        if ( inFunctionType ) {
+                                ret->attributes.remove_if( isNonParameterAttribute );
+                        }
+                        ret->attributes.splice( ret->attributes.end(), typeInst->attributes );
                         // place instance parameters on the typedef'd type
                         if ( ! typeInst->parameters.empty() ) {
 …
+        }
+        void InitializerLength::computeLength( std::list< Declaration * > & translationUnit ) {
+                PassVisitor<InitializerLength> len;
+                acceptAll( translationUnit, len );
+        }
         void ArrayLength::computeLength( std::list< Declaration * > & translationUnit ) {
                 PassVisitor<ArrayLength> len;
 …
+        }
         void ArrayLength::previsit( ObjectDecl * objDecl ) {
+        void InitializerLength::previsit( ObjectDecl * objDecl ) {
                 if ( ArrayType * at = dynamic_cast< ArrayType * >( objDecl->type ) ) {
                         if ( at->dimension ) return;
 …
         /// Replaces enum types by int, and function/array types in function parameter and return
         /// lists by appropriate pointers
+        /*
         struct EnumAndPointerDecay_new {
                 const ast::EnumDecl * previsit( const ast::EnumDecl * enumDecl ) {
 …
+                }
         };
+        */
         /// expand assertions from a trait instance, performing appropriate type variable substitutions
 …
+        }
+        /*
         /// Associates forward declarations of aggregates with their definitions
         class LinkReferenceToTypes_new final
 …
+                }
                 void checkGenericParameters( const ast::ReferenceToType * inst ) {
+                void checkGenericParameters( const ast::BaseInstType * inst ) {
                         for ( const ast::Expr * param : inst->params ) {
                                 if ( ! dynamic_cast< const ast::TypeExpr * >( param ) ) {
 …
                 static const node_t * forallFixer(
                         const CodeLocation & loc, const node_t * node,
                         ast::ParameterizedType::ForallList parent_t::* forallField
+                        ast::FunctionType::ForallList parent_t::* forallField
                 ) {
                         for ( unsigned i = 0; i < (node->* forallField).size(); ++i ) {
 …
+                }
         };
+        */
 } // anonymous namespace
+/*
 const ast::Type * validateType(
                 const CodeLocation & loc, const ast::Type * type, const ast::SymbolTable & symtab ) {
         ast::Pass< EnumAndPointerDecay_new > epc;
+        // ast::Pass< EnumAndPointerDecay_new > epc;
         ast::Pass< LinkReferenceToTypes_new > lrt{ loc, symtab };
         ast::Pass< ForallPointerDecay_new > fpd{ loc };
         return type->accept( epc )->accept( lrt )->accept( fpd );
+        return type->accept( lrt )->accept( fpd );
+}
+*/
 } // namespace SymTab

src/SymTab/module.mk

-              r3c64c668
+              r58fe85a
 SRC_SYMTAB = \
       SymTab/Autogen.cc \
+      SymTab/Autogen.h \
       SymTab/FixFunction.cc \
+      SymTab/FixFunction.h \
       SymTab/Indexer.cc \
+      SymTab/Indexer.h \
       SymTab/Mangler.cc \
       SymTab/ManglerCommon.cc \
+      SymTab/Validate.cc
+      SymTab/Mangler.h \
+      SymTab/Validate.cc \
+      SymTab/Validate.h
 SRC += $(SRC_SYMTAB)

src/SynTree/AggregateDecl.cc

-              r3c64c668
+              r58fe85a
 #include "Common/utility.h"      // for printAll, cloneAll, deleteAll
 #include "Declaration.h"         // for AggregateDecl, TypeDecl, Declaration
+#include "Expression.h"
 #include "Initializer.h"
 #include "LinkageSpec.h"         // for Spec, linkageName, Cforall
 …
 const char * StructDecl::typeString() const { return aggrString( kind ); }
+StructInstType * StructDecl::makeInst( std::list< Expression * > const & new_parameters ) {
+        std::list< Expression * > copy_parameters;
+        cloneAll( new_parameters, copy_parameters );
+        return makeInst( move( copy( copy_parameters ) ) );
+}
+StructInstType * StructDecl::makeInst( std::list< Expression * > && new_parameters ) {
+        assert( parameters.size() == new_parameters.size() );
+        StructInstType * type = new StructInstType( noQualifiers, this );
+        type->parameters = std::move( new_parameters );
+        return type;
+}
 const char * UnionDecl::typeString() const { return aggrString( Union ); }

src/SynTree/ApplicationExpr.cc

-              r3c64c668
+              r58fe85a
 ParamEntry::ParamEntry( const ParamEntry &other ) :
                 decl( other.decl ), declptr( maybeClone( other.declptr ) ), actualType( maybeClone( other.actualType ) ), formalType( maybeClone( other.formalType ) ), expr( maybeClone( other.expr ) ) {
+                decl( other.decl ), declptr( other.declptr ), actualType( maybeClone( other.actualType ) ), formalType( maybeClone( other.formalType ) ), expr( maybeClone( other.expr ) ) {
+}
 ParamEntry::~ParamEntry() {
         delete declptr;
+        // delete declptr;
         delete actualType;
         delete formalType;

src/SynTree/Declaration.h

-              r3c64c668
+              r58fe85a
   public:
         Type * base;
-        std::list< TypeDecl * > parameters;
         std::list< DeclarationWithType * > assertions;
 …
         Type * get_base() const { return base; }
         void set_base( Type * newValue ) { base = newValue; }
-        std::list< TypeDecl* > & get_parameters() { return parameters; }
         std::list< DeclarationWithType * >& get_assertions() { return assertions; }
 …
         bool is_coroutine() { return kind == Coroutine; }
+        bool is_monitor() { return kind == Monitor; }
+        bool is_thread() { return kind == Thread; }
+        bool is_generator() { return kind == Generator; }
+        bool is_monitor  () { return kind == Monitor  ; }
+        bool is_thread   () { return kind == Thread   ; }
+        // Make a type instance of this declaration.
+        StructInstType * makeInst( std::list< Expression * > const & parameters );
+        StructInstType * makeInst( std::list< Expression * > && parameters );
         virtual StructDecl * clone() const override { return new StructDecl( *this ); }

src/SynTree/Expression.cc

-              r3c64c668
+              r58fe85a
 #include "Type.h"                    // for Type, BasicType, Type::Qualifiers
 #include "TypeSubstitution.h"        // for TypeSubstitution
+#include "CompilationState.h"        // for deterministic_output
 #include "GenPoly/Lvalue.h"
 …
         printInferParams( inferParams, os, indent+1, 0 );
+        if ( result ) {
+                os << std::endl << indent << "with resolved type:" << std::endl;
+                os << (indent+1);
+                result->print( os, indent+1 );
+        }
         if ( env ) {
                 os << std::endl << indent << "... with environment:" << std::endl;
 …
+}
+KeywordCastExpr::KeywordCastExpr( Expression * arg, AggregateDecl::Aggregate target ) : Expression(), arg(arg), target( target ) {
+}
+KeywordCastExpr::KeywordCastExpr( const KeywordCastExpr & other ) : Expression( other ), arg( maybeClone( other.arg ) ), target( other.target ) {
+}
+KeywordCastExpr::KeywordCastExpr( Expression * arg, AggregateDecl::Aggregate target ) : Expression(), arg(arg), target( target ) {}
+KeywordCastExpr::KeywordCastExpr( Expression * arg, AggregateDecl::Aggregate target, const KeywordCastExpr::Concrete & concrete_target ) : Expression(), arg(arg), target( target ), concrete_target(concrete_target) {}
+KeywordCastExpr::KeywordCastExpr( const KeywordCastExpr & other ) : Expression( other ), arg( maybeClone( other.arg ) ), target( other.target ) {}
 KeywordCastExpr::~KeywordCastExpr() {

src/SynTree/Expression.h

-              r3c64c668
+              r58fe85a
 };
+/// VariableExpr represents an expression that simply refers to the value of a named variable.
+/// Does not take ownership of var.
+class VariableExpr : public Expression {
+  public:
+        DeclarationWithType * var;
+        VariableExpr();
+        VariableExpr( DeclarationWithType * var );
+        VariableExpr( const VariableExpr & other );
+        virtual ~VariableExpr();
+        bool get_lvalue() const final;
+        DeclarationWithType * get_var() const { return var; }
+        void set_var( DeclarationWithType * newValue ) { var = newValue; }
+        static VariableExpr * functionPointer( FunctionDecl * decl );
+        virtual VariableExpr * clone() const override { return new VariableExpr( * this ); }
+        virtual void accept( Visitor & v ) override { v.visit( this ); }
+        virtual void accept( Visitor & v ) const override { v.visit( this ); }
+        virtual Expression * acceptMutator( Mutator & m ) override { return m.mutate( this ); }
+        virtual void print( std::ostream & os, Indenter indent = {} ) const override;
+};
 // The following classes are used to represent expression types that cannot be converted into
 // function-call format.
 …
   public:
         Expression * arg;
+        bool isGenerated = true; // cast generated implicitly by code generation or explicit in program
+        // Inidicates cast is introduced by the CFA type system.
+        // true for casts that the resolver introduces to force a return type
+        // false for casts from user code
+        // false for casts from desugaring advanced CFA features into simpler CFA
+        // example
+        //   int * p;     // declaration
+        //   (float *) p; // use, with subject cast
+        // subject cast isGenerated means we are considering an interpretation with a type mismatch
+        // subject cast not isGenerated means someone in charge wants it that way
+        bool isGenerated = true;
         CastExpr( Expression * arg, bool isGenerated = true );
 …
         KeywordCastExpr( Expression * arg, AggregateDecl::Aggregate target );
+        KeywordCastExpr( Expression * arg, AggregateDecl::Aggregate target, const Concrete & concrete_target );
         KeywordCastExpr( const KeywordCastExpr & other );
         virtual ~KeywordCastExpr();
 …
         virtual MemberExpr * clone() const override { return new MemberExpr( * this ); }
-        virtual void accept( Visitor & v ) override { v.visit( this ); }
-        virtual void accept( Visitor & v ) const override { v.visit( this ); }
-        virtual Expression * acceptMutator( Mutator & m ) override { return m.mutate( this ); }
-        virtual void print( std::ostream & os, Indenter indent = {} ) const override;
-};
-/// VariableExpr represents an expression that simply refers to the value of a named variable.
-/// Does not take ownership of var.
-class VariableExpr : public Expression {
-  public:
-        DeclarationWithType * var;
-        VariableExpr();
-        VariableExpr( DeclarationWithType * var );
-        VariableExpr( const VariableExpr & other );
-        virtual ~VariableExpr();
-        bool get_lvalue() const final;
-        DeclarationWithType * get_var() const { return var; }
-        void set_var( DeclarationWithType * newValue ) { var = newValue; }
-        static VariableExpr * functionPointer( FunctionDecl * decl );
-        virtual VariableExpr * clone() const override { return new VariableExpr( * this ); }
         virtual void accept( Visitor & v ) override { v.visit( this ); }
         virtual void accept( Visitor & v ) const override { v.visit( this ); }

src/SynTree/LinkageSpec.cc

-              r3c64c668
+              r58fe85a
 // Author           : Rodolfo G. Esteves
 // Created On       : Sat May 16 13:22:09 2015
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Mon Dec 16 15:02:29 2019
 // Update Count     : 28
+// Last Modified By : Andrew Beach
+// Last Modified On : Mon Mar  2 16:13:00 2020
+// Update Count     : 29
 //
 …
 #include "LinkageSpec.h"
+#include "Common/CodeLocation.h"
 #include "Common/SemanticError.h"

src/SynTree/LinkageSpec.h

-              r3c64c668
+              r58fe85a
 // Author           : Rodolfo G. Esteves
 // Created On       : Sat May 16 13:24:28 2015
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Mon Dec 16 15:03:43 2019
 // Update Count     : 20
+// Last Modified By : Andrew Beach
+// Last Modified On : Mon Mar  2 16:13:00 2020
+// Update Count     : 21
 //
 …
 #include <string>
+#include "Common/CodeLocation.h"
+struct CodeLocation;
 namespace LinkageSpec {

src/SynTree/Mutator.h

r3c64c668	r58fe85a
51	51	virtual Statement * mutate( CatchStmt * catchStmt ) = 0;
52	52	virtual Statement * mutate( FinallyStmt * catchStmt ) = 0;
	53	virtual Statement * mutate( SuspendStmt * suspendStmt ) = 0;
53	54	virtual Statement * mutate( WaitForStmt * waitforStmt ) = 0;
54	55	virtual Declaration * mutate( WithStmt * withStmt ) = 0;

src/SynTree/NamedTypeDecl.cc

-              r3c64c668
+              r58fe85a
 #include "LinkageSpec.h"         // for Spec, Cforall, linkageName
 #include "Type.h"                // for Type, Type::StorageClasses
+#include "CompilationState.h"
 NamedTypeDecl::NamedTypeDecl( const std::string &name, Type::StorageClasses scs, Type *base )
 …
 NamedTypeDecl::NamedTypeDecl( const NamedTypeDecl &other )
         : Parent( other ), base( maybeClone( other.base ) ) {
-        cloneAll( other.parameters, parameters );
         cloneAll( other.assertions, assertions );
+}
 …
 NamedTypeDecl::~NamedTypeDecl() {
         delete base;
-        deleteAll( parameters );
         deleteAll( assertions );
+}
 …
         using namespace std;
+        if ( name != "" ) os << name << ": ";
+        if ( ! name.empty() ) {
+                if( deterministic_output && isUnboundType(name) ) os << "[unbound]:";
+                else os << name << ": ";
+        }
         if ( linkage != LinkageSpec::Cforall ) {
 …
                 os << " for ";
                 base->print( os, indent+1 );
-        } // if
-        if ( ! parameters.empty() ) {
-                os << endl << indent << "... with parameters" << endl;
-                printAll( parameters, os, indent+1 );
         } // if
         if ( ! assertions.empty() ) {
 …
                 base->print( os, indent+1 );
         } // if
-        if ( ! parameters.empty() ) {
-                os << endl << indent << "... with parameters" << endl;
-                printAll( parameters, os, indent+1 );
-        } // if
+}

src/SynTree/ReferenceToType.cc

-              r3c64c668
+              r58fe85a
 #include "Type.h"             // for TypeInstType, StructInstType, UnionInstType
 #include "TypeSubstitution.h" // for TypeSubstitution
+#include "CompilationState.h"
 class Attribute;
 …
         Type::print( os, indent );
+        os << "instance of " << typeString() << " " << get_name() << " (" << ( isFtype ? "" : "not" ) << " function type)";
+        os << "instance of " << typeString() << " ";
+        const auto & name_ = get_name();
+        if( deterministic_output && isUnboundType(name) ) os << "[unbound]";
+        else os << name;
+        os << " (" << ( isFtype ? "" : "not" ) << " function type)";
         if ( ! parameters.empty() ) {
                 os << endl << indent << "... with parameters" << endl;

src/SynTree/Statement.cc

-              r3c64c668
+              r58fe85a
+}
+SuspendStmt::SuspendStmt( const SuspendStmt & other )
+        : Statement( other )
+        , then( maybeClone(other.then) )
+{}
+SuspendStmt::~SuspendStmt() {
+        delete then;
+}
+void SuspendStmt::print( std::ostream & os, Indenter indent ) const {
+        os << "Suspend Statement";
+        switch (type) {
+                case None     : os << " with implicit target"; break;
+                case Generator: os << " for generator"       ; break;
+                case Coroutine: os << " for coroutine"       ; break;
+        }
+        os << endl;
+        indent += 1;
+        if(then) {
+                os << indent << " with post statement :" << endl;
+                then->print( os, indent + 1);
+        }
+}
 WaitForStmt::WaitForStmt() : Statement() {
         timeout.time      = nullptr;

src/SynTree/Statement.h

-              r3c64c668
+              r58fe85a
 };
+class SuspendStmt : public Statement { // statement node for a suspend, with an optional post statement
+  public:
+        CompoundStmt * then = nullptr; // optional post statement; null when absent
+        enum Type { None, Coroutine, Generator } type = None; // kind of suspend; None is printed as "with implicit target"
+        SuspendStmt() = default;
+        SuspendStmt( const SuspendStmt & );
+        virtual ~SuspendStmt();
+        virtual SuspendStmt * clone() const override { return new SuspendStmt( *this ); } // copies through the copy constructor
+        virtual void accept( Visitor & v ) override { v.visit( this ); }
+        virtual void accept( Visitor & v ) const override { v.visit( this ); }
+        virtual Statement * acceptMutator( Mutator & m )  override { return m.mutate( this ); }
+        virtual void print( std::ostream & os, Indenter indent = {} ) const override;
+};
 class WaitForStmt : public Statement {
   public:
 …
 class ImplicitCtorDtorStmt : public Statement {
   public:
         // Non-owned pointer to the constructor/destructor statement
+        // the constructor/destructor call statement; owned here for a while, eventually transferred elsewhere
         Statement * callStmt;

src/SynTree/SynTree.h

r3c64c668	r58fe85a
54	54	class CatchStmt;
55	55	class FinallyStmt;
	56	class SuspendStmt;
56	57	class WaitForStmt;
57	58	class WithStmt;

src/SynTree/Type.cc

-              r3c64c668
+              r58fe85a
 const Type::Qualifiers noQualifiers;
+bool isUnboundType(const Type * type) {
+        if (auto typeInst = dynamic_cast<const TypeInstType *>(type)) {
+                // xxx - look for a type name produced by renameTyVars; the check is delegated to the name-based overload.
+                // TODO: once TypeInstType representation is updated, this should properly check
+                // whether the context id is filled. This is a temporary hack for now.
+                return isUnboundType(typeInst->name);
+        }
+        return false; // anything that is not a TypeInstType is reported as not unbound
+}
+bool isUnboundType(const std::string & tname) {
+        // xxx - look for a type name produced by renameTyVars (heuristic: at least three underscores in the name).
+        // TODO: once TypeInstType representation is updated, this should properly check
+        // whether the context id is filled. This is a temporary hack for now.
+        if (std::count(tname.begin(), tname.end(), '_') >= 3) {
+                return true;
+        }
+        return false;
+}
 // Local Variables: //
 // tab-width: 4 //

src/SynTree/Type.h

-              r3c64c668
+              r58fe85a
 };
+bool isUnboundType(const Type * type);
+bool isUnboundType(const std::string & tname);
 // Local Variables: //
 // tab-width: 4 //

src/SynTree/TypeDecl.cc

-              r3c64c668
+              r58fe85a
 // Created On       : Mon May 18 07:44:20 2015
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Fri Dec 13 15:26:14 2019
 // Update Count     : 21
+// Last Modified On : Thu Oct  8 18:18:55 2020
+// Update Count     : 22
 //
 …
 #include "Type.h"            // for Type, Type::StorageClasses
+TypeDecl::TypeDecl( const std::string & name, Type::StorageClasses scs, Type * type, Kind kind, bool sized, Type * init ) : Parent( name, scs, type ), kind( kind ), sized( kind == Ttype || sized ), init( init ) {
+TypeDecl::TypeDecl( const std::string & name, Type::StorageClasses scs, Type * type, Kind kind, bool sized, Type * init ) :
+        Parent( name, scs, type ), kind( kind ), sized( kind == Ttype || sized ), init( init ) {
+}

src/SynTree/Visitor.h

r3c64c668	r58fe85a
78	78	virtual void visit( FinallyStmt * node ) { visit( const_cast<const FinallyStmt *>(node) ); }
79	79	virtual void visit( const FinallyStmt * finallyStmt ) = 0;
	80	virtual void visit( SuspendStmt * node ) { visit( const_cast<const SuspendStmt *>(node) ); }
	81	virtual void visit( const SuspendStmt * suspendStmt ) = 0;
80	82	virtual void visit( WaitForStmt * node ) { visit( const_cast<const WaitForStmt *>(node) ); }
81	83	virtual void visit( const WaitForStmt * waitforStmt ) = 0;

src/SynTree/module.mk

-              r3c64c668
+              r58fe85a
       SynTree/ApplicationExpr.cc \
       SynTree/ArrayType.cc \
+      SynTree/Attribute.cc \
+      SynTree/Attribute.h \
       SynTree/AttrType.cc \
       SynTree/Attribute.cc \
+      SynTree/BaseSyntaxNode.h \
       SynTree/BasicType.cc \
       SynTree/CommaExpr.cc \
       SynTree/CompoundStmt.cc \
       SynTree/Constant.cc \
+      SynTree/Constant.h \
+      SynTree/Declaration.cc \
+      SynTree/Declaration.h \
+      SynTree/DeclarationWithType.cc \
       SynTree/DeclReplacer.cc \
+      SynTree/DeclReplacer.h \
       SynTree/DeclStmt.cc \
-      SynTree/Declaration.cc \
-      SynTree/DeclarationWithType.cc \
       SynTree/Expression.cc \
+      SynTree/Expression.h \
       SynTree/FunctionDecl.cc \
       SynTree/FunctionType.cc \
       SynTree/Initializer.cc \
+      SynTree/Initializer.h \
+      SynTree/Label.h \
       SynTree/LinkageSpec.cc \
+      SynTree/LinkageSpec.h \
+      SynTree/Mutator.h \
       SynTree/NamedTypeDecl.cc \
       SynTree/ObjectDecl.cc \
 …
       SynTree/ReferenceType.cc \
       SynTree/Statement.cc \
+      SynTree/Statement.h \
+      SynTree/SynTree.h \
       SynTree/TupleExpr.cc \
       SynTree/TupleType.cc \
 …
       SynTree/TypeDecl.cc \
       SynTree/TypeExpr.cc \
+      SynTree/Type.h \
+      SynTree/TypeofType.cc \
       SynTree/TypeSubstitution.cc \
       SynTree/TypeofType.cc \
+      SynTree/TypeSubstitution.h \
       SynTree/VarArgsType.cc \
+      SynTree/Visitor.h \
       SynTree/VoidType.cc \
       SynTree/ZeroOneType.cc

src/Tuples/Explode.cc

-              r3c64c668
+              r58fe85a
                         for ( const ast::Expr * expr : tupleExpr->exprs ) {
                                 exprs.emplace_back( applyCast( expr, false ) );
-                                //exprs.emplace_back( ast::ptr< ast::Expr >( applyCast( expr, false ) ) );
+                        }
                         if ( first ) {
 …
+        }
         const ast::Expr * postmutate( const ast::UniqueExpr * node ) {
+        const ast::Expr * postvisit( const ast::UniqueExpr * node ) {
                 // move cast into unique expr so that the unique expr has type T& rather than
                 // type T. In particular, this transformation helps with generating the
 …
                         castAdded = false;
                         const ast::Type * newType = getReferenceBase( newNode->result );
                         return new ast::CastExpr{ newNode->location, node, newType };
+                        return new ast::CastExpr{ newNode->location, newNode, newType };
+                }
                 return newNode;
+        }
         const ast::Expr * postmutate( const ast::TupleIndexExpr * tupleExpr ) {
+        const ast::Expr * postvisit( const ast::TupleIndexExpr * tupleExpr ) {
                 // tuple index expr needs to be rebuilt to ensure that the type of the
                 // field is consistent with the type of the tuple expr, since the field
 …
         ast::Pass<CastExploderCore> exploder;
         expr = expr->accept( exploder );
         if ( ! exploder.pass.foundUniqueExpr ) {
+        if ( ! exploder.core.foundUniqueExpr ) {
                 expr = new ast::CastExpr{ expr, new ast::ReferenceType{ expr->result } };
+        }

src/Tuples/Explode.h

r3c64c668	r58fe85a
210	210	}
211	211	// Cast a reference away to a value-type to allow further explosion.
212		if ( ~~dynamic_cast< const ast::ReferenceType *>( local->result.get()~~ ) ) {
	212	if ( local->result.as< ast::ReferenceType >() ) {
213	213	local = new ast::CastExpr{ local, tupleType };
214	214	}
…	…
220	220	// delete idx;
221	221	}
222		~~// delete local;~~
223	222	}
224	223	} else {

src/Tuples/TupleAssignment.cc

-              r3c64c668
+              r58fe85a
                                         // resolve ctor/dtor for the new object
                                         ast::ptr< ast::Init > ctorInit = ResolvExpr::resolveCtorInit(
                                                         InitTweak::genCtorInit( location, ret ), spotter.crntFinder.symtab );
+                                                        InitTweak::genCtorInit( location, ret ), spotter.crntFinder.localSyms );
                                         // remove environments from subexpressions of stmtExpr
                                         ast::Pass< EnvRemover > rm{ env };
 …
                                         // resolve the cast expression so that rhsCand return type is bound by the cast
                                         // type as needed, and transfer the resulting environment
                                         ResolvExpr::CandidateFinder finder{ spotter.crntFinder.symtab, env };
+                                        ResolvExpr::CandidateFinder finder{ spotter.crntFinder.localSyms, env };
                                         finder.find( rhsCand->expr, ResolvExpr::ResolvMode::withAdjustment() );
                                         assert( finder.candidates.size() == 1 );
 …
                                         // explode the LHS so that each field of a tuple-valued expr is assigned
                                         ResolvExpr::CandidateList lhs;
                                         explode( *lhsCand, crntFinder.symtab, back_inserter(lhs), true );
+                                        explode( *lhsCand, crntFinder.localSyms, back_inserter(lhs), true );
                                         for ( ResolvExpr::CandidateRef & cand : lhs ) {
                                                 // each LHS value must be a reference - some come in with a cast, if not
 …
                                                         if ( isTuple( rhsCand->expr ) ) {
                                                                 // multiple assignment
                                                                 explode( *rhsCand, crntFinder.symtab, back_inserter(rhs), true );
+                                                                explode( *rhsCand, crntFinder.localSyms, back_inserter(rhs), true );
                                                                 matcher.reset(
                                                                         new MultipleAssignMatcher{ *this, expr->location, lhs, rhs } );
 …
                                                         // multiple assignment
                                                         ResolvExpr::CandidateList rhs;
                                                         explode( rhsCand, crntFinder.symtab, back_inserter(rhs), true );
+                                                        explode( rhsCand, crntFinder.localSyms, back_inserter(rhs), true );
                                                         matcher.reset(
                                                                 new MultipleAssignMatcher{ *this, expr->location, lhs, rhs } );
 …
+                                )
                                 ResolvExpr::CandidateFinder finder{ crntFinder.symtab, matcher->env };
+                                ResolvExpr::CandidateFinder finder{ crntFinder.localSyms, matcher->env };
                                 try {

src/Tuples/TupleExpansion.cc

r3c64c668	r58fe85a
323	323	std::vector<ast::ptr<ast::Type>> types;
324	324	ast::CV::Qualifiers quals{
325		ast::CV::Const \| ast::CV::Volatile \| ast::CV::Restrict \| ~~ast::CV::Lvalue \|~~
	325	ast::CV::Const \| ast::CV::Volatile \| ast::CV::Restrict \|
326	326	ast::CV::Atomic \| ast::CV::Mutex };
327	327

src/Tuples/Tuples.cc

-              r3c64c668
+              r58fe85a
         };
         struct ImpurityDetectorIgnoreUnique : public ImpurityDetector {
+                using ImpurityDetector::previsit;
                 void previsit( ast::UniqueExpr const * ) {
                         visit_children = false;
 …
                 ast::Pass<Detector> detector;
                 expr->accept( detector );
                 return detector.pass.maybeImpure;
+                return detector.core.maybeImpure;
+        }
 } // namespace

src/Tuples/module.mk

-              r3c64c668
+              r58fe85a
 ###############################################################################
+SRC += Tuples/TupleAssignment.cc Tuples/TupleExpansion.cc Tuples/Explode.cc \
+        Tuples/Tuples.cc
+SRCDEMANGLE += Tuples/TupleAssignment.cc Tuples/TupleExpansion.cc Tuples/Explode.cc \
+        Tuples/Tuples.cc
+SRC_TUPLES = \
+        Tuples/Explode.cc \
+        Tuples/Explode.h \
+        Tuples/TupleAssignment.cc \
+        Tuples/TupleExpansion.cc \
+        Tuples/Tuples.cc \
+        Tuples/Tuples.h
+SRC += $(SRC_TUPLES)
+SRCDEMANGLE += $(SRC_TUPLES)

src/Validate/module.mk

r3c64c668	r58fe85a
15	15	###############################################################################
16	16
17		SRC += Validate/HandleAttributes.cc Validate/~~FindSpecialDecls.cc~~
18		SRCDEMANGLE += Validate/HandleAttributes.cc Validate/~~FindSpecialDecls.cc~~
	17	SRC += Validate/HandleAttributes.cc Validate/HandleAttributes.h Validate/FindSpecialDecls.cc Validate/FindSpecialDecls.h
	18	SRCDEMANGLE += Validate/HandleAttributes.cc Validate/HandleAttributes.h Validate/FindSpecialDecls.cc Validate/FindSpecialDecls.h

src/Virtual/ExpandCasts.cc

-              r3c64c668
+              r58fe85a
 // Created On       : Mon Jul 24 13:59:00 2017
 // Last Modified By : Andrew Beach
 // Last Modified On : Tus Aug  2 14:59:00 2017
 // Update Count     : 1
+// Last Modified On : Fri Jul 31 10:29:00 2020
+// Update Count     : 4
 //
 …
 #include <cassert>                 // for assert, assertf
 #include <iterator>                // for back_inserter, inserter
-#include <map>                     // for map, _Rb_tree_iterator, map<>::ite...
 #include <string>                  // for string, allocator, operator==, ope...
-#include <utility>                 // for pair
 #include "Common/PassVisitor.h"    // for PassVisitor
+#include "Common/ScopedMap.h"      // for ScopedMap
 #include "Common/SemanticError.h"  // for SemanticError
+#include "SymTab/Mangler.h"        // for mangleType
 #include "SynTree/Declaration.h"   // for ObjectDecl, StructDecl, FunctionDecl
 #include "SynTree/Expression.h"    // for VirtualCastExpr, CastExpr, Address...
 …
 namespace Virtual {
+        // Indented until the new ast code gets added.
+        /// Maps virtual table types to the instance for that type.
+        class VirtualTableMap final {
+                ScopedMap<std::string, ObjectDecl *> vtable_instances; // keyed by the mangled name of the table's type
+        public:
+                void enterScope() {
+                        vtable_instances.beginScope();
+                }
+                void leaveScope() {
+                        vtable_instances.endScope();
+                }
+                ObjectDecl * insert( ObjectDecl * vtableDecl ) { // returns the conflicting earlier instance, or nullptr when there is no conflict
+                        std::string const & mangledName = SymTab::Mangler::mangleType( vtableDecl->type );
+                        ObjectDecl *& value = vtable_instances[ mangledName ];
+                        if ( value ) {
+                                if ( vtableDecl->storageClasses.is_extern ) {
+                                        return nullptr; // an extern declaration never replaces an existing entry
+                                } else if ( ! value->storageClasses.is_extern ) {
+                                        return value; // two non-extern instances of the same table type: hand back the earlier one
+                                }
+                        }
+                        value = vtableDecl; // first entry, or a non-extern instance replacing an extern one
+                        return nullptr;
+                }
+                ObjectDecl * lookup( const Type * vtableType ) { // returns the recorded instance for this table type, or nullptr if none
+                        std::string const & mangledName = SymTab::Mangler::mangleType( vtableType );
+                        const auto it = vtable_instances.find( mangledName );
+                        return ( vtable_instances.end() == it ) ? nullptr : it->second;
+                }
+        };
         /* Currently virtual depends on the rather brittle name matching between
 …
          */
+        namespace {
         std::string get_vtable_name( std::string const & name ) {
                 return name + "_vtable";
 …
         std::string get_vtable_inst_name_root( std::string const & name ) {
                 return get_vtable_name_root( name.substr(1, name.size() - 10 ) );
+        }
-        bool is_vtable_name( std::string const & name ) {
-                return (name.substr( name.size() - 7 ) == "_vtable" );
+        }
 …
+        }
+        } // namespace
         class VirtualCastCore {
-        std::map<std::string, ObjectDecl *> vtable_instances;
-        FunctionDecl *vcast_decl;
-        StructDecl *pvt_decl;
                 Type * pointer_to_pvt(int level_of_indirection) {
                         Type * type = new StructInstType(
 …
         public:
                 VirtualCastCore() :
                         vtable_instances(), vcast_decl( nullptr ), pvt_decl( nullptr )
+                        indexer(), vcast_decl( nullptr ), pvt_decl( nullptr )
                 {}
 …
                 Expression * postmutate( VirtualCastExpr * castExpr );
+                VirtualTableMap indexer;
+        private:
+                FunctionDecl *vcast_decl;
+                StructDecl *pvt_decl;
         };
 …
         void VirtualCastCore::premutate( ObjectDecl * objectDecl ) {
                 if ( is_vtable_inst_name( objectDecl->get_name() ) ) {
+                        vtable_instances[objectDecl->get_name()] = objectDecl;
+                }
+        }
+                        if ( ObjectDecl * existing = indexer.insert( objectDecl ) ) {
+                                std::string msg = "Repeated instance of virtual table, original found at: ";
+                                msg += existing->location.filename;
+                                msg += ":" + toString( existing->location.first_line );
+                                SemanticError( objectDecl->location, msg );
+                        }
+                }
+        }
+        namespace {
+        /// Better error locations for generated casts: the first set location among the cast, its argument and its result type.
+        CodeLocation castLocation( const VirtualCastExpr * castExpr ) {
+                if ( castExpr->location.isSet() ) {
+                        return castExpr->location;
+                } else if ( castExpr->arg->location.isSet() ) {
+                        return castExpr->arg->location;
+                } else if ( castExpr->result->location.isSet() ) {
+                        return castExpr->result->location;
+                } else {
+                        return CodeLocation(); // none of the three locations is set
+                }
+        }
+        [[noreturn]] void castError( const VirtualCastExpr * castExpr, std::string const & message ) { // reports message at the location chosen by castLocation; declared as never returning
+                SemanticError( castLocation( castExpr ), message );
+        }
+        /// Get the virtual table type used in a virtual cast. The result is always a fresh clone.
+        Type * getVirtualTableType( const VirtualCastExpr * castExpr ) {
+                const Type * objectType; // set in both non-error branches below; castError is [[noreturn]]
+                if ( auto target = dynamic_cast<const PointerType *>( castExpr->result ) ) {
+                        objectType = target->base;
+                } else if ( auto target = dynamic_cast<const ReferenceType *>( castExpr->result ) ) {
+                        objectType = target->base;
+                } else {
+                        castError( castExpr, "Virtual cast type must be a pointer or reference type." );
+                }
+                assert( objectType );
+                const StructInstType * structType = dynamic_cast<const StructInstType *>( objectType );
+                if ( nullptr == structType ) {
+                        castError( castExpr, "Virtual cast type must refer to a structure type." );
+                }
+                const StructDecl * structDecl = structType->baseStruct;
+                assert( structDecl );
+                const ObjectDecl * fieldDecl = nullptr; // stays null unless the first member is an object named virtual_table
+                if ( 0 < structDecl->members.size() ) {
+                        const Declaration * memberDecl = structDecl->members.front();
+                        assert( memberDecl );
+                        fieldDecl = dynamic_cast<const ObjectDecl *>( memberDecl );
+                        if ( fieldDecl && fieldDecl->name != "virtual_table" ) {
+                                fieldDecl = nullptr;
+                        }
+                }
+                if ( nullptr == fieldDecl ) {
+                        castError( castExpr, "Virtual cast type must have a leading virtual_table field." );
+                }
+                const PointerType * fieldType = dynamic_cast<const PointerType *>( fieldDecl->type );
+                if ( nullptr == fieldType ) {
+                        castError( castExpr, "Virtual cast type virtual_table field is not a pointer." );
+                }
+                assert( fieldType->base );
+                auto virtualStructType = dynamic_cast<const StructInstType *>( fieldType->base );
+                assert( virtualStructType );
+                // Here is the type, but if it is polymorphic it will have lost information.
+                // (Always a clone so that the result may always be deleted.)
+                StructInstType * virtualType = virtualStructType->clone();
+                if ( ! structType->parameters.empty() ) { // replace the clone's parameters with copies of the cast target's parameters
+                        deleteAll( virtualType->parameters );
+                        virtualType->parameters.clear();
+                        cloneAll( structType->parameters, virtualType->parameters );
+                }
+                return virtualType;
+        }
+        } // namespace
         Expression * VirtualCastCore::postmutate( VirtualCastExpr * castExpr ) {
                 assertf( castExpr->get_result(), "Virtual Cast target not found before expansion." );
+                assertf( castExpr->result, "Virtual Cast target not found before expansion." );
                 assert( vcast_decl );
                 assert( pvt_decl );
+                // May only cast to a pointer or reference type.
+                // A earlier validation should give a syntax error, this is
+                // just to make sure errors don't creep during translation.
+                // Move to helper with more detailed error messages.
+                PointerType * target_type =
+                        dynamic_cast<PointerType *>( castExpr->get_result() );
+                assert( target_type );
+                StructInstType * target_struct =
+                        dynamic_cast<StructInstType *>( target_type->get_base() );
+                assert( target_struct );
+                StructDecl * target_decl = target_struct->get_baseStruct();
+                std::map<std::string, ObjectDecl *>::iterator found =
+                        vtable_instances.find(
+                                get_vtable_inst_name( target_decl->get_name() ) );
+                if ( vtable_instances.end() == found ) {
+                        assertf( false, "virtual table instance not found." );
+                }
+                ObjectDecl * table = found->second;
+                const Type * vtable_type = getVirtualTableType( castExpr );
+                ObjectDecl * table = indexer.lookup( vtable_type );
+                if ( nullptr == table ) {
+                        SemanticError( castLocation( castExpr ),
+                                "Could not find virtual table instance." );
+                }
                 Expression * result = new CastExpr(
-                        //new ApplicationExpr(
-                                //new AddressExpr( new VariableExpr( vcast_decl ) ),
-                                //new CastExpr( new VariableExpr( vcast_decl ),
-                                //      new PointerType( noQualifiers,
-                                //              vcast_decl->get_type()->clone()
-                                //              )
-                                //      ),
                         new ApplicationExpr( VariableExpr::functionPointer( vcast_decl ), {
                                         new CastExpr(
 …
                 castExpr->set_result( nullptr );
                 delete castExpr;
+                delete vtable_type;
                 return result;
+        }

src/Virtual/module.mk

-              r3c64c668
+              r58fe85a
 ###############################################################################
+SRC += Virtual/ExpandCasts.cc
+SRC += Virtual/ExpandCasts.cc Virtual/ExpandCasts.h \
+        Virtual/Tables.cc Virtual/Tables.h
+SRCDEMANGLE += Virtual/Tables.cc

src/config.h.in

-              r3c64c668
+              r58fe85a
 /* Location of cfa install. */
 #undef CFA_PREFIX
+/* Sets whether or not to use the new-ast; this is a default value and can be
+   overridden by --old-ast and --new-ast */
+#undef CFA_USE_NEW_AST
 /* Major.Minor */

src/main.cc

-              r3c64c668
+              r58fe85a
 // Author           : Peter Buhr and Rob Schluntz
 // Created On       : Fri May 15 23:12:02 2015
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Sat Feb  8 08:33:50 2020
 // Update Count     : 633
+// Last Modified By : Andrew Beach
+// Last Modified On : Mon Dec  7 15:29:00 2020
+// Update Count     : 639
 //
 …
 using namespace std;
+#include "AST/Convert.hpp"
 #include "CompilationState.h"
 #include "../config.h"                      // for CFA_LIBDIR
 …
 #include "CodeTools/ResolvProtoDump.h"      // for dumpAsResolvProto
 #include "CodeTools/TrackLoc.h"             // for fillLocations
+#include "Common/CodeLocationTools.hpp"     // for forceFillCodeLocations
 #include "Common/CompilerError.h"           // for CompilerError
 #include "Common/Stats.h"
 …
                 } // if
+                PASS( "Translate Throws", ControlStruct::translateThrows( translationUnit ) );
                 PASS( "Fix Labels", ControlStruct::fixLabels( translationUnit ) );
                 PASS( "Fix Names", CodeGen::fixNames( translationUnit ) );
 …
                 } // if
+                PASS( "Resolve", ResolvExpr::resolve( translationUnit ) );
+                if ( exprp ) {
+                        dump( translationUnit );
+                        return EXIT_SUCCESS;
+                } // if
+                if( useNewAST ) {
+                        if (Stats::Counters::enabled) {
+                                ast::pass_visitor_stats.avg = Stats::Counters::build<Stats::Counters::AverageCounter<double>>("Average Depth - New");
+                                ast::pass_visitor_stats.max = Stats::Counters::build<Stats::Counters::MaxCounter<double>>("Max depth - New");
+                        }
+                        auto transUnit = convert( move( translationUnit ) );
+                        PASS( "Resolve", ResolvExpr::resolve( transUnit ) );
+                        if ( exprp ) {
+                                translationUnit = convert( move( transUnit ) );
+                                dump( translationUnit );
+                                return EXIT_SUCCESS;
+                        } // if
+                        forceFillCodeLocations( transUnit );
+                        PASS( "Fix Init", InitTweak::fix(transUnit, buildingLibrary()));
+                        translationUnit = convert( move( transUnit ) );
+                } else {
+                        PASS( "Resolve", ResolvExpr::resolve( translationUnit ) );
+                        if ( exprp ) {
+                                dump( translationUnit );
+                                return EXIT_SUCCESS;
+                        }
+                        PASS( "Fix Init", InitTweak::fix( translationUnit, buildingLibrary() ) );
+                }
                 // fix ObjectDecl - replaces ConstructorInit nodes
-                PASS( "Fix Init", InitTweak::fix( translationUnit, buildingLibrary() ) );
                 if ( ctorinitp ) {
                         dump ( translationUnit );
 …
                 PASS( "Expand Unique Expr", Tuples::expandUniqueExpr( translationUnit ) ); // xxx - is this the right place for this? want to expand ASAP so that subsequent passes don't need to worry about double-visiting a unique expr - needs to go after InitTweak::fix so that copy constructed return declarations are reused
                 PASS( "Translate EHM" , ControlStruct::translateEHM( translationUnit ) );
+                PASS( "Translate Tries" , ControlStruct::translateTries( translationUnit ) );
                 PASS( "Gen Waitfor" , Concurrency::generateWaitFor( translationUnit ) );
 …
 static const char optstring[] = ":c:ghlLmNnpP:S:twW:D:";
+static const char optstring[] = ":c:ghlLmNnpdOAP:S:twW:D:";
 enum { PreludeDir = 128 };
 …
         { "no-prelude", no_argument, nullptr, 'n' },
         { "prototypes", no_argument, nullptr, 'p' },
+        { "deterministic-out", no_argument, nullptr, 'd' },
+        { "old-ast", no_argument, nullptr, 'O'},
+        { "new-ast", no_argument, nullptr, 'A'},
         { "print", required_argument, nullptr, 'P' },
         { "prelude-dir", required_argument, nullptr, PreludeDir },
 …
 static const char * description[] = {
         "diagnostic color: never, always, or auto.",          // -c
         "wait for gdb to attach",                             // -g
         "print help message",                                 // -h
         "generate libcfa.c",                                  // -l
         "generate line marks",                                // -L
         "do not replace main",                                // -m
         "do not generate line marks",                         // -N
         "do not read prelude",                                // -n
+        "diagnostic color: never, always, or auto.",            // -c
+        "wait for gdb to attach",                                                       // -g
+        "print help message",                                                           // -h
+        "generate libcfa.c",                                                            // -l
+        "generate line marks",                                                          // -L
+        "do not replace main",                                                          // -m
+        "do not generate line marks",                                           // -N
+        "do not read prelude",                                                          // -n
         "generate prototypes for prelude functions",            // -p
+        "print",                                              // -P
+        "only print deterministic output",                  // -d
+        "Use the old-ast",                                                                      // -O
+        "Use the new-ast",                                                                      // -A
+        "print",                                                                                        // -P
         "<directory> prelude directory for debug/nodebug",      // no flag
         "<option-list> enable profiling information:\n          counters,heap,time,all,none", // -S
         "building cfa standard lib",                          // -t
         "",                                                   // -w
         "",                                                   // -W
         "",                                                   // -D
+        "building cfa standard lib",                                            // -t
+        "",                                                                                                     // -w
+        "",                                                                                                     // -W
+        "",                                                                                                     // -D
 }; // description
 …
                         genproto = true;
                         break;
+                  case 'd':                                     // don't print non-deterministic output
+                        deterministic_output = true;
+                        break;
+                  case 'O':                                     // don't print non-deterministic output
+                        useNewAST = false;
+                        break;
+                  case 'A':                                     // don't print non-deterministic output
+                        useNewAST = true;
+                        break;
                   case 'P':                                                                             // print options
                         for ( int i = 0;; i += 1 ) {

tests/.expect/alloc.txt

-              r3c64c668
+              r58fe85a
 xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede
 CFA array alloc, fill 0xef
 xefefefef 0xefefefef 0xefefefef 0xefefefef 0xefefefef 0xefefefef 0xefefefef 0xefefefef 0xefefefef 0xefefefef 0xefefefef 0xefefefef 0xefefefef 0xefefefef 0xefefefef 0xefefefef 0xefefefef 0xefefefef 0xefefefef 0xefefefef
+xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef
 CFA array alloc, fill from array
 xefefefef 0xefefefef, 0xefefefef 0xefefefef, 0xefefefef 0xefefefef, 0xefefefef 0xefefefef, 0xefefefef 0xefefefef, 0xefefefef 0xefefefef, 0xefefefef 0xefefefef, 0xefefefef 0xefefefef, 0xefefefef 0xefefefef, 0xefefefef 0xefefefef, 0xefefefef 0xefefefef, 0xefefefef 0xefefefef, 0xefefefef 0xefefefef, 0xefefefef 0xefefefef, 0xefefefef 0xefefefef, 0xefefefef 0xefefefef, 0xefefefef 0xefefefef, 0xefefefef 0xefefefef, 0xefefefef 0xefefefef, 0xefefefef 0xefefefef,
+xdeadbeef 0xdeadbeef, 0xdeadbeef 0xdeadbeef, 0xdeadbeef 0xdeadbeef, 0xdeadbeef 0xdeadbeef, 0xdeadbeef 0xdeadbeef, 0xdeadbeef 0xdeadbeef, 0xdeadbeef 0xdeadbeef, 0xdeadbeef 0xdeadbeef, 0xdeadbeef 0xdeadbeef, 0xdeadbeef 0xdeadbeef, 0xdeadbeef 0xdeadbeef, 0xdeadbeef 0xdeadbeef, 0xdeadbeef 0xdeadbeef, 0xdeadbeef 0xdeadbeef, 0xdeadbeef 0xdeadbeef, 0xdeadbeef 0xdeadbeef, 0xdeadbeef 0xdeadbeef, 0xdeadbeef 0xdeadbeef, 0xdeadbeef 0xdeadbeef, 0xdeadbeef 0xdeadbeef,
 C realloc
 xefefefef 0xefefefef 0xefefefef 0xefefefef 0xefefefef 0xefefefef 0xefefefef 0xefefefef 0xefefefef 0xefefefef
+xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef
 CFA realloc
 xefefefef 0xefefefef 0xefefefef 0xefefefef 0xefefefef 0xefefefef 0xefefefef 0xefefefef 0xefefefef 0xefefefef 0x1010101 0x1010101 0x1010101 0x1010101 0x1010101 0x1010101 0x1010101 0x1010101 0x1010101 0x1010101
+xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0x1010101 0x1010101 0x1010101 0x1010101 0x1010101 0x1010101 0x1010101 0x1010101 0x1010101 0x1010101
 CFA resize array alloc
+CFA realloc array alloc
 xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef
 CFA resize array alloc
+CFA realloc array alloc
 xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0x1010101 0x1010101 0x1010101 0x1010101 0x1010101 0x1010101 0x1010101 0x1010101 0x1010101 0x1010101
 CFA resize array alloc
+CFA realloc array alloc
 xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef
 CFA resize array alloc
 xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0x1010101 0x1010101 0x1010101 0x1010101 0x1010101 0x1010101 0x1010101 0x1010101 0x1010101 0x1010101 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede
 CFA resize array alloc
+CFA realloc array alloc, fill
+xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede
+CFA realloc array alloc, fill
 xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef
 CFA resize array alloc, fill
 xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0x1010101 0x1010101 0x1010101 0x1010101 0x1010101 0x1010101 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede
+CFA realloc array alloc, fill
+xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede
 C   memalign 42 42.5

tests/.expect/array.txt

r3c64c668	r58fe85a
	1	array.cfa: In function '_X4mainFi___1':
	2	array.cfa:55:9: note: #pragma message: Compiled

tests/.expect/cast.txt

r3c64c668	r58fe85a
	1	cast.cfa: In function '_X4mainFi_iPPKc__1':
	2	cast.cfa:18:9: note: #pragma message: Compiled

tests/.expect/copyfile.txt

-              r3c64c668
+              r58fe85a
 //
 // Author           : Peter A. Buhr
 // Created On       : Tue Jul 16 16:47:22 2019
+// Created On       : Fri Jun 19 13:44:05 2020
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Wed Jul 17 18:04:44 2019
 // Update Count     : 26
+// Last Modified On : Fri Jun 19 17:58:03 2020
+// Update Count     : 4
 //
 #include <fstream.hfa>
 #include <stdlib.hfa>                                                                   // new/delete
+#include <exception.hfa>
 int main( int argc, char * argv[] ) {
+        ifstream * in  = &stdin;                                                        // default files
+        ofstream * out = &stdout;
+        ifstream in  = stdin;                                                           // copy default files
+        ofstream out = stdout;
         try {
                 choose ( argc ) {
                   case 2, 3:
                           in = new( (const char *)argv[1] );            // open input file first as output creates file
                           if ( argc == 3 ) out = new( (const char *)argv[2] ); // only open output if input opens as output created if nonexistent
                   case 1: ;                                     // use default files
+                        open( in, argv[1] );                                            // open input file first as output creates file
+                        if ( argc == 3 ) open( out, argv[2] );          // do not create output unless input opens
+                  case 1: ;                                                                             // use default files
                   default:
                           exit | "Usage [ input-file (default stdin) [ output-file (default stdout) ] ]";
+                        exit | "Usage" | argv[0] | "[ input-file (default stdin) [ output-file (default stdout) ] ]";
                 } // choose
+        } catch( Open_Failure * ex ; ex->istream == &in ) {
+                exit | "Unable to open input file" | argv[1];
+        } catch( Open_Failure * ex ; ex->ostream == &out ) {
+                close( in );                                                                    // optional
+                exit | "Unable to open output file" | argv[2];
+        } // try
+                char ch;
+                *out | nlOff;                                                                   // turn off auto newline
+                *in  | nlOn;                                                                    // turn on reading newline
+        out | nlOff;                                                                            // turn off auto newline
+        in  | nlOn;                                                                                     // turn on reading newline
+                for () {                                                                                // read all characters
+                        *in | ch;
+                  if ( eof( *in ) ) break;                                              // eof ?
+                        *out | ch;
+                } // for
+        } finally {
+                if ( in  != &stdin  ) delete( in );                             // close file, do not delete stdin!
+                if ( out != &stdout ) delete( out );                    // close file, do not delete stdout!
+        } // try
+        char ch;
+        for () {                                                                                        // read all characters
+                in | ch;
+          if ( eof( in ) ) break;                                                       // eof ?
+                out | ch;
+        } //for
 } // main

tests/.expect/declarationSpecifier.x64.txt

-              r3c64c668
+              r58fe85a
 static inline int invoke_main(int argc, char* argv[], char* envp[]) { (void)argc; (void)argv; (void)envp; return _X4mainFi_iPPKc__1((signed int )argc, (const char **)argv); }
 static inline signed int invoke_main(signed int argc, char **argv, char **envp);
+signed int _X13cfa_args_argci_1;
+char **_X13cfa_args_argvPPc_1;
+char **_X13cfa_args_envpPPc_1;
 signed int main(signed int _X4argci_1, char **_X4argvPPc_1, char **_X4envpPPc_1){
     __attribute__ ((unused)) signed int _X12_retval_maini_1;
+    {
+        ((void)(_X13cfa_args_argci_1=_X4argci_1));
+    }
+    {
+        ((void)(_X13cfa_args_argvPPc_1=_X4argvPPc_1));
+    }
+    {
+        ((void)(_X13cfa_args_envpPPc_1=_X4envpPPc_1));
+    }
+    {
         signed int _tmp_cp_ret4;
         ((void)(_X12_retval_maini_1=(((void)(_tmp_cp_ret4=invoke_main(_X4argci_1, _X4argvPPc_1, _X4envpPPc_1))) , _tmp_cp_ret4)) /* ?{} */);

tests/.expect/declarationSpecifier.x86.txt

-              r3c64c668
+              r58fe85a
 static inline int invoke_main(int argc, char* argv[], char* envp[]) { (void)argc; (void)argv; (void)envp; return _X4mainFi_iPPKc__1((signed int )argc, (const char **)argv); }
 static inline signed int invoke_main(signed int argc, char **argv, char **envp);
+signed int _X13cfa_args_argci_1;
+char **_X13cfa_args_argvPPc_1;
+char **_X13cfa_args_envpPPc_1;
 signed int main(signed int _X4argci_1, char **_X4argvPPc_1, char **_X4envpPPc_1){
     __attribute__ ((unused)) signed int _X12_retval_maini_1;
+    {
+        ((void)(_X13cfa_args_argci_1=_X4argci_1));
+    }
+    {
+        ((void)(_X13cfa_args_argvPPc_1=_X4argvPPc_1));
+    }
+    {
+        ((void)(_X13cfa_args_envpPPc_1=_X4envpPPc_1));
+    }
+    {
         signed int _tmp_cp_ret4;
         ((void)(_X12_retval_maini_1=(((void)(_tmp_cp_ret4=invoke_main(_X4argci_1, _X4argvPPc_1, _X4envpPPc_1))) , _tmp_cp_ret4)) /* ?{} */);

tests/.expect/enum.txt

r3c64c668	r58fe85a
	1	done

tests/.expect/expression.txt

r3c64c668	r58fe85a
	1	expression.cfa: In function '_X4mainFi___1':
	2	expression.cfa:89:9: note: #pragma message: Compiled

tests/.expect/forall.txt

r3c64c668	r58fe85a
	1	forall.cfa: In function '_X4mainFi___1':
	2	forall.cfa:218:9: note: #pragma message: Compiled

tests/.expect/gccExtensions.x64.txt

-              r3c64c668
+              r58fe85a
 static inline int invoke_main(int argc, char* argv[], char* envp[]) { (void)argc; (void)argv; (void)envp; return _X4mainFi_iPPKc__1((signed int )argc, (const char **)argv); }
 static inline signed int invoke_main(signed int argc, char **argv, char **envp);
+signed int _X13cfa_args_argci_1;
+char **_X13cfa_args_argvPPc_1;
+char **_X13cfa_args_envpPPc_1;
 signed int main(signed int _X4argci_1, char **_X4argvPPc_1, char **_X4envpPPc_1){
     __attribute__ ((unused)) signed int _X12_retval_maini_1;
+    {
+        ((void)(_X13cfa_args_argci_1=_X4argci_1));
+    }
+    {
+        ((void)(_X13cfa_args_argvPPc_1=_X4argvPPc_1));
+    }
+    {
+        ((void)(_X13cfa_args_envpPPc_1=_X4envpPPc_1));
+    }
+    {
         signed int _tmp_cp_ret4;
         ((void)(_X12_retval_maini_1=(((void)(_tmp_cp_ret4=invoke_main(_X4argci_1, _X4argvPPc_1, _X4envpPPc_1))) , _tmp_cp_ret4)) /* ?{} */);

tests/.expect/gccExtensions.x86.txt

-              r3c64c668
+              r58fe85a
 static inline int invoke_main(int argc, char* argv[], char* envp[]) { (void)argc; (void)argv; (void)envp; return _X4mainFi_iPPKc__1((signed int )argc, (const char **)argv); }
 static inline signed int invoke_main(signed int argc, char **argv, char **envp);
+signed int _X13cfa_args_argci_1;
+char **_X13cfa_args_argvPPc_1;
+char **_X13cfa_args_envpPPc_1;
 signed int main(signed int _X4argci_1, char **_X4argvPPc_1, char **_X4envpPPc_1){
     __attribute__ ((unused)) signed int _X12_retval_maini_1;
+    {
+        ((void)(_X13cfa_args_argci_1=_X4argci_1));
+    }
+    {
+        ((void)(_X13cfa_args_argvPPc_1=_X4argvPPc_1));
+    }
+    {
+        ((void)(_X13cfa_args_envpPPc_1=_X4envpPPc_1));
+    }
+    {
         signed int _tmp_cp_ret4;
         ((void)(_X12_retval_maini_1=(((void)(_tmp_cp_ret4=invoke_main(_X4argci_1, _X4argvPPc_1, _X4envpPPc_1))) , _tmp_cp_ret4)) /* ?{} */);

tests/.expect/heap.txt

r3c64c668	r58fe85a
	1	done

tests/.expect/identFuncDeclarator.txt

r3c64c668	r58fe85a
	1	identFuncDeclarator.cfa: In function '_X4mainFi___1':
	2	identFuncDeclarator.cfa:116:9: note: #pragma message: Compiled

tests/.expect/identParamDeclarator.txt

r3c64c668	r58fe85a
	1	done

tests/.expect/labelledExit.txt

r3c64c668	r58fe85a
	1	labelledExit.cfa: In function '_X4mainFi_iPPKc__1':
	2	labelledExit.cfa:183:9: note: #pragma message: Compiled

tests/.expect/limits.txt

r3c64c668	r58fe85a
	1	limits.cfa: In function '_X4mainFi_iPPKc__1':
	2	limits.cfa:154:9: note: #pragma message: Compiled

tests/.expect/maybe.txt

r3c64c668	r58fe85a
	1	done

tests/.expect/minmax.txt

-              r3c64c668
+              r58fe85a
 char                    z a     min a
 signed int              4 3     min 3
+signed int              4 -3    min -3
 unsigned int            4 3     min 3
 signed long int         4 3     min 3
+signed long int         4 -3    min -3
 unsigned long int       4 3     min 3
 signed long long int    4 3     min 3
+signed long long int    4 -3    min -3
 unsigned long long int  4 3     min 3
 float                   4. 3.1  min 3.1
 …
 char                    z a     max z
 signed int              4 3     max 4
+signed int              4 -3    max 4
 unsigned int            4 3     max 4
 signed long int         4 3     max 4
+signed long int         4 -3    max 4
 unsigned long int       4 3     max 4
 signed long long int    4 3     max 4
+signed long long int    4 -3    max 4
 unsigned long long int  4 3     max 4
 float                   4. 3.1  max 4.

tests/.expect/nested-types.txt

r3c64c668	r58fe85a
	1	nested-types.cfa: In function '_X4mainFi___1':
	2	nested-types.cfa:102:9: note: #pragma message: Compiled

tests/.expect/numericConstants.txt

r3c64c668	r58fe85a
	1	numericConstants.cfa: In function '_X4mainFi___1':
	2	numericConstants.cfa:68:9: note: #pragma message: Compiled

tests/.expect/operators.txt

r3c64c668	r58fe85a
	1	done

tests/.expect/result.txt

r3c64c668	r58fe85a
	1	done

tests/.expect/stdincludes.txt

r3c64c668	r58fe85a
	1	stdincludes.cfa: In function '_X4mainFi___1':
	2	stdincludes.cfa:52:9: note: #pragma message: Compiled

tests/.expect/switch.txt

r3c64c668	r58fe85a
	1	switch.cfa: In function '_X4mainFi___1':
	2	switch.cfa:105:9: note: #pragma message: Compiled

tests/.expect/time.txt

r3c64c668	r58fe85a
1	1	10800 2 3.375 12 1.00001
	2	0.125 0.0333333333333333 3.375 12000. 1000010.
2	3	0 2 3.375
3	4	7 7 7

tests/.expect/typedefRedef-ERR1.txt

r3c64c668	r58fe85a
1	1	typedefRedef.cfa:4:1 error: Cannot redefine typedef: Foo
2		typedefRedef.cfa:60:1 error: Cannot redefine typedef: ARR
	2	typedefRedef.cfa:59:1 error: Cannot redefine typedef: ARR

tests/.expect/typedefRedef.txt

r3c64c668	r58fe85a
	1	typedefRedef.cfa: In function '_X4mainFi___1':
	2	typedefRedef.cfa:71:9: note: #pragma message: Compiled

tests/.expect/typeof.txt

r3c64c668	r58fe85a
	1	done

tests/.expect/variableDeclarator.txt

r3c64c668	r58fe85a
	1	variableDeclarator.cfa: In function '_X4mainFi_iPPKc__1':
	2	variableDeclarator.cfa:182:9: note: #pragma message: Compiled

tests/.expect/voidPtr.txt

r3c64c668	r58fe85a
	1	done

tests/.in/copyfile.txt

-              r3c64c668
+              r58fe85a
 //
 // Author           : Peter A. Buhr
 // Created On       : Tue Jul 16 16:47:22 2019
+// Created On       : Fri Jun 19 13:44:05 2020
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Wed Jul 17 18:04:44 2019
 // Update Count     : 26
+// Last Modified On : Fri Jun 19 17:58:03 2020
+// Update Count     : 4
 //
 #include <fstream.hfa>
 #include <stdlib.hfa>                                                                   // new/delete
+#include <exception.hfa>
 int main( int argc, char * argv[] ) {
+        ifstream * in  = &stdin;                                                        // default files
+        ofstream * out = &stdout;
+        ifstream in  = stdin;                                                           // copy default files
+        ofstream out = stdout;
         try {
                 choose ( argc ) {
                   case 2, 3:
                           in = new( (const char *)argv[1] );            // open input file first as output creates file
                           if ( argc == 3 ) out = new( (const char *)argv[2] ); // only open output if input opens as output created if nonexistent
                   case 1: ;                                     // use default files
+                        open( in, argv[1] );                                            // open input file first as output creates file
+                        if ( argc == 3 ) open( out, argv[2] );          // do not create output unless input opens
+                  case 1: ;                                                                             // use default files
                   default:
                           exit | "Usage [ input-file (default stdin) [ output-file (default stdout) ] ]";
+                        exit | "Usage" | argv[0] | "[ input-file (default stdin) [ output-file (default stdout) ] ]";
                 } // choose
+        } catch( Open_Failure * ex ; ex->istream == &in ) {
+                exit | "Unable to open input file" | argv[1];
+        } catch( Open_Failure * ex ; ex->ostream == &out ) {
+                close( in );                                                                    // optional
+                exit | "Unable to open output file" | argv[2];
+        } // try
+                char ch;
+                *out | nlOff;                                                                   // turn off auto newline
+                *in  | nlOn;                                                                    // turn on reading newline
+        out | nlOff;                                                                            // turn off auto newline
+        in  | nlOn;                                                                                     // turn on reading newline
+                for () {                                                                                // read all characters
+                        *in | ch;
+                  if ( eof( *in ) ) break;                                              // eof ?
+                        *out | ch;
+                } // for
+        } finally {
+                if ( in  != &stdin  ) delete( in );                             // close file, do not delete stdin!
+                if ( out != &stdout ) delete( out );                    // close file, do not delete stdout!
+        } // try
+        char ch;
+        for () {                                                                                        // read all characters
+                in | ch;
+          if ( eof( in ) ) break;                                                       // eof ?
+                out | ch;
+        } //for
 } // main

tests/.in/manipulatorsInput.txt

-              r3c64c668
+              r58fe85a
 .5 3.5 3.456E+23.456E+2 -0x1.2p-3 3.5 0X1.23p3     3.5
 .5 3.5 3.456E+23.456E+2 -0x1.2p-3 3.5 0X1.23p3     3.5
+-25 42798
+1402432282 1505850196993244515
+394749758663249135511342
+12935154696204706112391834394
+423859149128410414395372834994551
+13889016598639747063234935497057631587
+170141183460469231731687303715884105727
+340282366920938463463374607431768211455
+-340282366920938463463374607431768211455
+340282366920938463463374607431768211455999
+1234567890123456789 -1234567890123456789

tests/Makefile.am

-              r3c64c668
+              r58fe85a
 ## Created On       : Sun May 31 09:08:15 2015
 ## Last Modified By : Peter A. Buhr
 ## Last Modified On : Tue Nov 20 11:18:51 2018
 ## Update Count     : 68
+## Last Modified On : Fri Oct  9 23:13:07 2020
+## Update Count     : 86
 ###############################################################################
 …
 ACLOCAL_AMFLAGS  = -I automake
+include $(top_srcdir)/src/cfa.make
+include $(top_srcdir)/tools/build/cfa.make
+DEFAULT_INCLUDES = -I${abs_srcdir}
 debug=yes
 …
 # since automake doesn't have support for CFA we have to
 AM_CFLAGS = $(if $(test), 2> $(test), ) \
+        -fdebug-prefix-map=$(abspath ${abs_srcdir})= \
+        -fdebug-prefix-map=/tmp= \
+        -fno-diagnostics-show-caret \
         -g \
         -Wall \
 …
         -DIN_DIR="${abs_srcdir}/.in/"
+AM_CFAFLAGS = -XCFA --deterministic-out
 # get the desired cfa to test
 TARGET_CFA = $(if $(filter $(installed),yes), @CFACC_INSTALL@, @CFACC@)
 # adjust CC to current flags
 CC = $(if $(DISTCC_CFA_PATH),distcc $(DISTCC_CFA_PATH) ${ARCH_FLAGS},$(TARGET_CFA) ${DEBUG_FLAGS} ${ARCH_FLAGS})
+CC = LC_ALL=C $(if $(DISTCC_CFA_PATH),distcc $(DISTCC_CFA_PATH) ${ARCH_FLAGS} ${AST_FLAGS},$(TARGET_CFA) ${DEBUG_FLAGS} ${ARCH_FLAGS} ${AST_FLAGS})
 CFACC = $(CC)
 …
 # adjusted CC but without the actual distcc call
+CFACCLOCAL = $(if $(DISTCC_CFA_PATH),$(DISTCC_CFA_PATH) ${ARCH_FLAGS},$(TARGET_CFA) ${DEBUG_FLAGS} ${ARCH_FLAGS})
+CFACCLOCAL = $(if $(DISTCC_CFA_PATH),$(DISTCC_CFA_PATH) ${ARCH_FLAGS} ${AST_FLAGS},$(TARGET_CFA) ${DEBUG_FLAGS} ${ARCH_FLAGS} ${AST_FLAGS})
+CFACCLINK = $(CFACCLOCAL) -quiet $(if $(test), 2> $(test), ) $($(shell echo "${@}_FLAGSLD" | sed 's/-\|\//_/g'))
 PRETTY_PATH=mkdir -p $(dir $(abspath ${@})) && cd ${srcdir} &&
 …
 .INTERMEDIATE: .validate .validate.cfa
 EXTRA_PROGRAMS = avl_test .dummy_hack # build but do not install
+EXTRA_DIST = test.py \
+        pybin/__init__.py \
+        pybin/print-core.gdb \
+        pybin/settings.py \
+        pybin/test_run.py \
+        pybin/tools.py \
+        long_tests.hfa \
+        .in/io.data \
+        avltree/avl.h \
+        avltree/avl-private.h \
+        concurrent/clib.c \
+        exceptions/with-threads.hfa \
+        exceptions/except-io.hfa
+dist-hook:
+        echo "Gathering test files"
+        for file in `${TEST_PY} --list-dist`; do \
+                if test -f ${srcdir}/$${file}; then \
+                        $(MKDIR_P) $$(dirname ${distdir}/$${file}); \
+                        cp -df ${srcdir}/$${file} ${distdir}/$${file}; \
+                fi; \
+        done
 avl_test_SOURCES = avltree/avl_test.cfa avltree/avl0.cfa avltree/avl1.cfa avltree/avl2.cfa avltree/avl3.cfa avltree/avl4.cfa avltree/avl-private.cfa
 # automake doesn't know we still need C/CPP rules so pretend like we have a C program
 _dummy_hack_SOURCES = .dummy_hack.c .dummy_hackxx.cpp
+nodist__dummy_hack_SOURCES = .dummy_hack.c .dummy_hackxx.cpp
 #----------------------------------------------------------------------------------------------------------------
 …
         @+${TEST_PY} --debug=${debug} --install=${installed} --archive-errors=${archiveerrors} ${concurrent} ${timeouts} --all # '@' => do not echo command (SILENT), '+' => allows recursive make from within python program
+clean-local :
+mostlyclean-local :
+        find ${builddir} -not -path './__pycache__/*' -path '*.o' -delete
+        find ${builddir} -not -path './__pycache__/*' -path '*/.err/*.log' -delete
+        find ${builddir} -not -path './__pycache__/*' -path '*/.out/*.log' -delete
         rm -f ${EXTRA_PROGRAMS}
+        rm -rf __pycache__
+distclean-local :
+        find ${builddir} -path '*.Po' -delete
 list :
 …
 % : %.cfa $(CFACCBIN)
         $(CFACOMPILETEST) -c -o $(abspath ${@}).o
+        $(CFACCLOCAL) $($(shell echo "${@}_FLAGSLD" | sed 's/-\|\//_/g')) $(abspath ${@}).o -o $(abspath ${@})
+        $(CFACCLINK) ${@}.o -o $(abspath ${@})
+        rm $(abspath ${@}).o
 # implicit rule for c++ test
 …
         $(CFACOMPILETEST) -CFA -XCFA -p -c -fsyntax-only -o $(abspath ${@})
-# Use for tests where the make command is expected to succeed but the expected.txt should be compared to stderr
-EXPECT_STDERR = builtins/sync warnings/self-assignment
-$(EXPECT_STDERR): % : %.cfa $(CFACCBIN)
-        $(CFACOMPILETEST) -c -fsyntax-only 2> $(abspath ${@})
 #------------------------------------------------------------------------------
 # CUSTOM TARGET
 #------------------------------------------------------------------------------
+# tests that just validate syntax and compiler output should be compared to stderr
+CFACOMPILE_SYNTAX = $(CFACOMPILETEST) -Wno-unused-variable -Wno-unused-label -c -fsyntax-only -o $(abspath ${@})
+SYNTAX_ONLY_CODE = expression typedefRedef variableDeclarator switch numericConstants identFuncDeclarator forall \
+        init1 limits nested-types stdincludes cast labelledExit array builtins/sync warnings/self-assignment
+$(SYNTAX_ONLY_CODE): % : %.cfa $(CFACCBIN)
+        $(CFACOMPILE_SYNTAX)
+        $(if $(test), cp $(test) $(abspath ${@}), )
 # expected failures
 # use custom target since they require a custom define and custom dependencies
+# use custom target since they require a custom define *and* have a name that doesn't match the file
 alloc-ERROR : alloc.cfa $(CFACCBIN)
+        $(CFACOMPILETEST) -DERR1 -c -fsyntax-only -o $(abspath ${@})
+        $(CFACOMPILE_SYNTAX) -DERR1
+        -cp $(test) $(abspath ${@})
+init1-ERROR : init1.cfa $(CFACCBIN)
+        $(CFACOMPILE_SYNTAX) -DERR1
+        -cp $(test) $(abspath ${@})
 typedefRedef-ERR1 : typedefRedef.cfa $(CFACCBIN)
+        $(CFACOMPILETEST) -DERR1 -c -fsyntax-only -o $(abspath ${@})
+        $(CFACOMPILE_SYNTAX) -DERR1
+        -cp $(test) $(abspath ${@})
 nested-types-ERR1 : nested-types.cfa $(CFACCBIN)
+        $(CFACOMPILETEST) -DERR1 -c -fsyntax-only -o $(abspath ${@})
+        $(CFACOMPILE_SYNTAX) -DERR1
+        -cp $(test) $(abspath ${@})
 nested-types-ERR2 : nested-types.cfa $(CFACCBIN)
+        $(CFACOMPILETEST) -DERR2 -c -fsyntax-only -o $(abspath ${@})
+        $(CFACOMPILE_SYNTAX) -DERR2
+        -cp $(test) $(abspath ${@})
 raii/memberCtors-ERR1 : raii/memberCtors.cfa $(CFACCBIN)
+        $(CFACOMPILETEST) -DERR1 -c -fsyntax-only -o $(abspath ${@})
+        $(CFACOMPILE_SYNTAX) -DERR1
+        -cp $(test) $(abspath ${@})
 raii/ctor-autogen-ERR1 : raii/ctor-autogen.cfa $(CFACCBIN)
+        $(CFACOMPILETEST) -DERR1 -c -fsyntax-only -o $(abspath ${@})
+        $(CFACOMPILE_SYNTAX) -DERR1
+        -cp $(test) $(abspath ${@})
 raii/dtor-early-exit-ERR1 : raii/dtor-early-exit.cfa $(CFACCBIN)
+        $(CFACOMPILETEST) -DERR1 -c -fsyntax-only -o $(abspath ${@})
+        $(CFACOMPILE_SYNTAX) -DERR1
+        -cp $(test) $(abspath ${@})
 raii/dtor-early-exit-ERR2 : raii/dtor-early-exit.cfa $(CFACCBIN)
+        $(CFACOMPILETEST) -DERR2 -c -fsyntax-only -o $(abspath ${@})
+        $(CFACOMPILE_SYNTAX) -DERR2
+        -cp $(test) $(abspath ${@})
+# Exception Tests
+# Test with libcfathread; it changes how storage works.
+exceptions/%-threads : exceptions/%.cfa $(CFACCBIN)
+        $(CFACOMPILETEST) -include exceptions/with-threads.hfa -c -o $(abspath ${@}).o
+        $(CFACCLOCAL) $($(shell echo "${@}_FLAGSLD" | sed 's/-\|\//_/g')) $(abspath ${@}).o -o $(abspath ${@})
+# Linking tests
+# Meta tests to make sure we see linking errors (can't compile with -O2 since it may multiply number of calls)
+linking/linkerror : linking/linkerror.cfa $(CFACCBIN)
+        $(CFACOMPILETEST) -O0 -c -o $(abspath ${@}).o
+        $(CFACCLINK)  -O0 ${@}.o -o $(abspath ${@})
+        rm $(abspath ${@}).o
 #------------------------------------------------------------------------------

tests/alloc.cfa

-              r3c64c668
+              r58fe85a
 // Created On       : Wed Feb  3 07:56:22 2016
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Sun Feb 16 09:21:13 2020
 // Update Count     : 405
+// Last Modified On : Thu Nov 12 10:02:18 2020
+// Update Count     : 432
 //
 …
         size_t dim = 10;
         char fill = '\xde';
         int * p, * p1;
+        int * ip, * ip1;
         // allocation, non-array types
         p = (int *)malloc( sizeof(*p) );                                        // C malloc, type unsafe
         *p = 0xdeadbeef;
         printf( "C   malloc %#x\n", *p );
         free( p );
         p = malloc();                                       // CFA malloc, type safe
         *p = 0xdeadbeef;
         printf( "CFA malloc %#x\n", *p );
         free( p );
         p = alloc();                                        // CFA alloc, type safe
         *p = 0xdeadbeef;
         printf( "CFA alloc %#x\n", *p );
         free( p );
         p = alloc_set( fill );                                                          // CFA alloc, fill
         printf( "CFA alloc, fill %08x\n", *p );
         free( p );
         p = alloc_set( 3 );                                                                     // CFA alloc, fill
         printf( "CFA alloc, fill %d\n", *p );
         free( p );
+        ip = (int *)malloc( sizeof(*ip) );                                      // C malloc, type unsafe
+        *ip = 0xdeadbeef;
+        printf( "C   malloc %#x\n", *ip );
+        free( ip );
+        ip = malloc();                                                                          // CFA malloc, type safe
+        *ip = 0xdeadbeef;
+        printf( "CFA malloc %#x\n", *ip );
+        free( ip );
+        ip = alloc();                                                                           // CFA alloc, type safe
+        *ip = 0xdeadbeef;
+        printf( "CFA alloc %#x\n", *ip );
+        free( ip );
+        ip = alloc( fill`fill );                                                                // CFA alloc, fill
+        printf( "CFA alloc, fill %08x\n", *ip );
+        free( ip );
+        ip = alloc( 3`fill );                                                           // CFA alloc, fill
+        printf( "CFA alloc, fill %d\n", *ip );
+        free( ip );
 …
         printf( "\n" );
         p = (int *)calloc( dim, sizeof( *p ) );                         // C array calloc, type unsafe
+        ip = (int *)calloc( dim, sizeof( *ip ) );                       // C array calloc, type unsafe
         printf( "C   array calloc, fill 0\n" );
         for ( i; dim ) { printf( "%#x ", p[i] ); }
         printf( "\n" );
         free( p );
         p = calloc( dim );                                  // CFA array calloc, type safe
+        for ( i; dim ) { printf( "%#x ", ip[i] ); }
+        printf( "\n" );
+        free( ip );
+        ip = calloc( dim );                                                                     // CFA array calloc, type safe
         printf( "CFA array calloc, fill 0\n" );
         for ( i; dim ) { printf( "%#x ", p[i] ); }
         printf( "\n" );
         free( p );
         p = alloc( dim );                                   // CFA array alloc, type safe
         for ( i; dim ) { p[i] = 0xdeadbeef; }
+        for ( i; dim ) { printf( "%#x ", ip[i] ); }
+        printf( "\n" );
+        free( ip );
+        ip = alloc( dim );                                                                      // CFA array alloc, type safe
+        for ( i; dim ) { ip[i] = 0xdeadbeef; }
         printf( "CFA array alloc, no fill\n" );
         for ( i; dim ) { printf( "%#x ", p[i] ); }
         printf( "\n" );
         free( p );
         p = alloc_set( 2 * dim, fill );                                         // CFA array alloc, fill
+        for ( i; dim ) { printf( "%#x ", ip[i] ); }
+        printf( "\n" );
+        free( ip );
+        ip = alloc( 2 * dim, fill`fill );                                       // CFA array alloc, fill
         printf( "CFA array alloc, fill %#hhx\n", fill );
         for ( i; 2 * dim ) { printf( "%#x ", p[i] ); }
         printf( "\n" );
         free( p );
         p = alloc_set( 2 * dim, 0xdeadbeef );                           // CFA array alloc, fill
+        for ( i; 2 * dim ) { printf( "%#x ", ip[i] ); }
+        printf( "\n" );
+        free( ip );
+        ip = alloc( 2 * dim, ((int)0xdeadbeef)`fill );                          // CFA array alloc, fill
         printf( "CFA array alloc, fill %#hhx\n", 0xdeadbeef );
         for ( i; 2 * dim ) { printf( "%#x ", p[i] ); }
         printf( "\n" );
         // do not free
         p1 = alloc_set( 2 * dim, p );                                           // CFA array alloc, fill
+        for ( i; 2 * dim ) { printf( "%#x ", ip[i] ); }
+        printf( "\n" );
+        // do not free
+        ip1 = alloc( 2 * dim, [ip, 2 * dim]`fill );                             // CFA array alloc, fill
         printf( "CFA array alloc, fill from array\n" );
+        for ( i; 2 * dim ) { printf( "%#x %#x, ", p[i], p1[i] ); }
+        free( p1 );
+        printf( "\n" );
+        for ( i; 2 * dim ) { printf( "%#x %#x, ", ip[i], ip1[i] ); }
+        free( ip1 );
+        printf( "\n" );
+        // realloc, non-array types
+        printf( "\n" );
+        ip = (int *)realloc( ip, dim * sizeof(*ip) );           // C realloc
+        printf( "C realloc\n" );
+        for ( i; dim ) { printf( "%#x ", ip[i] ); }
+        printf( "\n" );
+        // do not free
+        ip = realloc( ip, 2 * dim * sizeof(*ip) );                      // CFA realloc
+        for ( i; dim ~ 2 * dim ) { ip[i] = 0x1010101; }
+        printf( "CFA realloc\n" );
+        for ( i; 2 * dim ) { printf( "%#x ", ip[i] ); }
+        printf( "\n" );
+        // do not free
+        // realloc, array types
+        printf( "\n" );
+        ip = alloc( dim, ip`realloc );                                                          // CFA realloc array alloc
+        for ( i; dim ) { ip[i] = 0xdeadbeef; }
+        printf( "CFA realloc array alloc\n" );
+        for ( i; dim ) { printf( "%#x ", ip[i] ); }
+        printf( "\n" );
+        // do not free
+        ip = alloc( 2 * dim, ip`realloc );                                                      // CFA realloc array alloc
+        for ( i; dim ~ 2 * dim ) { ip[i] = 0x1010101; }         // fill upper part
+        printf( "CFA realloc array alloc\n" );
+        for ( i; 2 * dim ) { printf( "%#x ", ip[i] ); }
+        printf( "\n" );
+        // do not free
+        ip = alloc( dim, ip`realloc );                                                          // CFA realloc array alloc
+        printf( "CFA realloc array alloc\n" );
+        for ( i; dim ) { printf( "%#x ", ip[i] ); }
+        printf( "\n" );
+        // do not free
+        ip = alloc( 3 * dim, ip`realloc, fill`fill );                           // CFA realloc array alloc, fill
+        printf( "CFA realloc array alloc, fill\n" );
+        for ( i; 3 * dim ) { printf( "%#x ", ip[i] ); }
+        printf( "\n" );
+        // do not free
+        ip = alloc( dim, ip`realloc, fill`fill );                                       // CFA realloc array alloc, fill
+        printf( "CFA realloc array alloc, fill\n" );
+        for ( i; dim ) { printf( "%#x ", ip[i] ); }
+        printf( "\n" );
+        // do not free
+        ip = alloc( 3 * dim, ip`realloc, fill`fill );                           // CFA realloc array alloc, fill
+        printf( "CFA realloc array alloc, fill\n" );
+        for ( i; 3 * dim ) { printf( "%#x ", ip[i] ); }
+        printf( "\n" );
+        // do not free
+#if 0 // FIX ME
+        ip = alloc( 5 * dim, ip`realloc, 5`fill );                                      // CFA realloc array alloc, 5
+        printf( "CFA realloc array alloc, 5\n" );
+        for ( i; 5 * dim ) { printf( "%#x ", ip[i] ); }
+        printf( "\n" );
+        // do not free
+        ip = alloc( dim, ip`realloc, 5`fill );                                          // CFA realloc array alloc, 5
+        printf( "CFA realloc array alloc, 5\n" );
+        for ( i; dim ) { printf( "%#x ", ip[i] ); }
+        printf( "\n" );
+        // do not free
+        ip = alloc( 5 * dim, ip`realloc, 5`fill );                                      // CFA realloc array alloc, 5
+        printf( "CFA realloc array alloc, 5\n" );
+        for ( i; 5 * dim ) { printf( "%#x ", ip[i] ); }
+        printf( "\n" );
+#endif // 0
+        free( ip );
         // resize, non-array types
+        printf( "\n" );
         p = (int *)realloc( p, dim * sizeof(*p) );                      // C realloc
         printf( "C realloc\n" );
+        for ( i; dim ) { printf( "%#x ", p[i] ); }
         printf( "\n" );
         // do not free
         p = realloc( p, 2 * dim * sizeof(*p) );             // CFA realloc
+        for ( i; dim ~ 2 * dim ) { p[i] = 0x1010101; }
         printf( "CFA realloc\n" );
+        for ( i; 2 * dim ) { printf( "%#x ", p[i] ); }
         printf( "\n" );
+        // do not free
+        struct S {
+                int a[5];
+        };
+    ip = alloc();
+        *ip = 5;
+    double * dp = alloc( ip`resize );
+        *dp = 5.5;
+    S * sp = alloc( dp`resize );
+        *sp = (S){ {0, 1, 2, 3, 4} };
+    ip = alloc( sp`resize );
+        *ip = 3;
+    free( ip );
         // resize, array types
+        printf( "\n" );
+        p = alloc( p, dim );                                // CFA resize array alloc
+        for ( i; dim ) { p[i] = 0xdeadbeef; }
+        printf( "CFA resize array alloc\n" );
+        for ( i; dim ) { printf( "%#x ", p[i] ); }
+        printf( "\n" );
+        // do not free
+        p = alloc( p, 2 * dim );                            // CFA resize array alloc
+        for ( i; dim ~ 2 * dim ) { p[i] = 0x1010101; }          // fill upper part
+        printf( "CFA resize array alloc\n" );
+        for ( i; 2 * dim ) { printf( "%#x ", p[i] ); }
+        printf( "\n" );
+        // do not free
+        p = alloc( p, dim );                                // CFA resize array alloc
+        printf( "CFA resize array alloc\n" );
+        for ( i; dim ) { printf( "%#x ", p[i] ); }
+        printf( "\n" );
+        // do not free
+        p = alloc_set( p, 3 * dim, fill );                                      // CFA resize array alloc, fill
+        printf( "CFA resize array alloc\n" );
+        for ( i; 3 * dim ) { printf( "%#x ", p[i] ); }
+        printf( "\n" );
+        // do not free
+        p = alloc_set( p, dim, fill );                                          // CFA resize array alloc, fill
+        printf( "CFA resize array alloc\n" );
+        for ( i; dim ) { printf( "%#x ", p[i] ); }
+        printf( "\n" );
+        // do not free
+        p = alloc_set( p, 3 * dim, fill );                                      // CFA resize array alloc, fill
+        printf( "CFA resize array alloc, fill\n" );
+        for ( i; 3 * dim ) { printf( "%#x ", p[i] );; }
+        printf( "\n" );
+        free( p );
+    ip = alloc( 5 );
+        for ( i; 5 ) { ip[i] = 5; }
+    dp = alloc( 5, ip`resize );
+        for ( i; 5 ) { dp[i] = 5.5; }
+    sp = alloc( 5, dp`resize );
+        for ( i; 5 ) { sp[i] = (S){ {0, 1, 2, 3, 4} }; }
+    ip = alloc( 3, sp`resize );
+        for ( i; 3 ) { ip[i] = 3; }
+    ip = alloc( 7, ip`realloc );
+        for ( i; 7 ) { ip[i] = 7; }
+    ip = alloc( 7, ip`resize );
+        for ( i; 7 ) { ip[i] = 7; }
+    free( ip );
+        int const_count, dest_count;
         struct Struct { int x; double y; };
+        void  ?{}( Struct & a ) {                                       // construct
+                a.[ x, y ] = [ -1, -1.0 ];
+        }
+        void  ?{}( Struct & a, int x, double y ) {      // initialize
+                a.[ x, y ] = [ x, y ];
+                const_count++;
+        }
+        void ^?{}( Struct & a ) {  dest_count++; }      // destruct
         Struct st, st1, sta[dim], sta1[dim], * stp, * stp1;
 …
         free( stp );
         stp = &(*memalign( Alignment )){ 42, 42.5 };          // CFA memalign
+        stp = &(*memalign( Alignment )){ 42, 42.5 };            // CFA memalign
         assert( (uintptr_t)stp % Alignment == 0 );
         printf( "CFA memalign %d %g\n", stp->x, stp->y );
 …
         free( stp );
         stp = &(*alloc_align( Alignment)){ 42, 42.5 };          // CFA alloc_align
+        stp = &(*alloc( Alignment`align)){ 42, 42.5 };          // CFA alloc_align
         assert( (uintptr_t)stp % Alignment == 0 );
         printf( "CFA alloc_align %d %g\n", stp->x, stp->y );
         free( stp );
         stp = &(*alloc_align( Alignment )){ 42, 42.5 };         // CFA alloc_align
+        stp = &(*alloc( Alignment`align )){ 42, 42.5 };         // CFA alloc_align
         assert( (uintptr_t)stp % Alignment == 0 );
         printf( "CFA alloc_align %d %g\n", stp->x, stp->y );
         free( stp );
         stp = alloc_align_set( Alignment, fill );                       // CFA memalign, fill
+        stp = alloc( Alignment`align, fill`fill );                      // CFA memalign, fill
         assert( (uintptr_t)stp % Alignment == 0 );
         printf( "CFA alloc_align fill %#x %a\n", stp->x, stp->y );
         free( stp );
         stp = alloc_align_set( Alignment, (Struct){ 42, 42.5 } ); // CFA memalign, fill
+        stp = alloc( Alignment`align, (Struct){ 42, 42.5 }`fill ); // CFA memalign, fill
         assert( (uintptr_t)stp % Alignment == 0 );
         printf( "CFA alloc_align fill %d %g\n", stp->x, stp->y );
         // do not free
         stp = &(*alloc_align( stp, 4096 )){ 42, 42.5 };         // CFA realign
+        stp = &(*alloc( stp`realloc, 4096`align )){ 42, 42.5 };         // CFA realign
         assert( (uintptr_t)stp % 4096 == 0 );
         printf( "CFA alloc_align %d %g\n", stp->x, stp->y );
 …
         printf( "\n" );
         stp = alloc_align( Alignment, dim );                // CFA array memalign
+        stp = alloc( dim, Alignment`align );                // CFA array memalign
         assert( (uintptr_t)stp % Alignment == 0 );
         for ( i; dim ) { stp[i] = (Struct){ 42, 42.5 }; }
 …
         free( stp );
         stp = alloc_align_set( Alignment, dim, fill );          // CFA array memalign, fill
+        stp = alloc( dim, Alignment`align, fill`fill );         // CFA array memalign, fill
         assert( (uintptr_t)stp % Alignment == 0 );
         printf( "CFA array alloc_align, fill\n" );
 …
         free( stp );
         stp = alloc_align_set( Alignment, dim, (Struct){ 42, 42.5 } ); // CFA array memalign, fill
+        stp = alloc( dim, Alignment`align, ((Struct){ 42, 42.5 })`fill ); // CFA array memalign, fill
         assert( (uintptr_t)stp % Alignment == 0 );
         printf( "CFA array alloc_align, fill\n" );
 …
         // do not free
         stp1 = alloc_align_set( Alignment, dim, stp );          // CFA array memalign, fill
+        stp1 = alloc( dim, Alignment`align, [stp, dim]`fill );  // CFA array memalign, fill
         assert( (uintptr_t)stp % Alignment == 0 );
         printf( "CFA array alloc_align, fill array\n" );
 …
         free( stp1 );
         stp = alloc_align( stp, 4096, dim );                            // CFA aligned realloc array
+        stp = alloc( dim, stp`realloc, 4096`align );                            // CFA aligned realloc array
         assert( (uintptr_t)stp % 4096 == 0 );
         for ( i; dim ) { stp[i] = (Struct){ 42, 42.5 }; }
 …
         printf( "\n" );
         // new, non-array types
         printf( "\n" );
+        const_count = dest_count = 0;
         stp = new( 42, 42.5 );
+        assert( const_count == 1 && dest_count == 0 );                                          // assertion for testing
         stp1 = new( 42, 42.5 );
+        assert( const_count == 2 && dest_count == 0 );                                          // assertion for testing
         printf( "CFA new initialize\n%d %g %d %g\n", stp->x, stp->y, stp1->x, stp1->y );
         delete( stp, stp1 );
+        assert( const_count == 2 && dest_count == 2 );                                          // assertion for testing
         // new, array types
         stp = anew( dim, 42, 42.5 );
+        assert( const_count == 2 + dim && dest_count == 2 );                            // assertion for testing
         printf( "CFA array new initialize\n" );
         for ( i; dim ) { printf( "%d %g, ", stp[i].x, stp[i].y ); }
         printf( "\n" );
         stp1 = anew( dim, 42, 42.5 );
+        assert( const_count == 2 + 2 * dim && dest_count == 2 );                        // assertion for testing
         for ( i; dim ) { printf( "%d %g, ", stp1[i].x, stp1[i].y ); }
         printf( "\n" );
+        adelete( dim, stp, dim, stp1 );
+        adelete( stp, stp1 );
+        assert( const_count == 2 + 2 * dim && dest_count == 2 + 2 * dim);       // assertion for testing
         // extras
 …
         free( fp - 1 );
+        p = foo( bar( baz( malloc(), 0 ), 0 ), 0 );
+        *p = 0xdeadbeef;
+        printf( "CFA deep malloc %#x\n", *p );
+        free( p );
+        ip = foo( bar( baz( malloc(), 0 ), 0 ), 0 );
+        *ip = 0xdeadbeef;
+        printf( "CFA deep malloc %#x\n", *ip );
+        dp = alloc(5.0`fill); // just for testing multiple free
+        assert(*dp == 5.0);
+        free( ip, dp, 0p );
 #ifdef ERR1
         stp = malloc();
         printf( "\nSHOULD FAIL\n" );
+        p = realloc( stp, dim * sizeof( *stp ) );
+        p = alloc( stp, dim * sizeof( *stp ) );
+        p = memset( stp, 10 );
+        p = memcpy( &st1, &st );
+#endif
+        ip = realloc( stp, dim * sizeof( *stp ) );
+        ip = memset( stp, 10 );
+        ip = memcpy( &st1, &st );
+#endif // ERR1
 } // main

tests/array.cfa

-              r3c64c668
+              r58fe85a
 //                               -*- Mode: C -*-
 //
+//                               -*- Mode: C -*-
+//
 // Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo
 //
 // The contents of this file are covered under the licence agreement in the
 // file "LICENCE" distributed with Cforall.
 //
+//
 // array.cfa -- test array declarations
 //
+//
 // Author           : Peter A. Buhr
 // Created On       : Tue Feb 19 21:18:06 2019
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Tue Feb 19 21:18:46 2019
 // Update Count     : 1
 //
+// Last Modified On : Sun Sep 27 09:05:40 2020
+// Update Count     : 4
+//
 int a1[];
+int a1[0];
 //int a2[*];
 //double a4[3.0];
 int m1[][3];
+int m1[0][3];
 //int m2[*][*];
 int m4[3][3];
 …
+}
+int main() {}
+int main() {
+        #if !defined(NO_COMPILED_PRAGMA)
+                #pragma message( "Compiled" )   // force non-empty .expect file
+        #endif
+}
 // Local Variables: //

tests/avltree/avl1.cfa

r3c64c668	r58fe85a
24	24	tree(K, V) * create(K key, V value) {
25	25	// infinite loop trying to resolve ... t = malloc();
26		tree(K, V) * t = malloc(sizeof(tree(K,V)));
	26	tree(K, V) * t = ( tree(K, V) * ) malloc(sizeof(tree(K,V)));
27	27	(*t){ key, value };
28	28	return t;

tests/builtins/.expect/sync.txt

r3c64c668	r58fe85a
	1	builtins/sync.cfa: In function '_X4mainFi___1':
	2	builtins/sync.cfa:358:9: note: #pragma message: Compiled

tests/builtins/sync.cfa

-              r3c64c668
+              r58fe85a
         #if defined(__SIZEOF_INT128__)
         { __int128 ret; ret = __sync_fetch_and_nand(vplll, vlll); }
-        { __int128 ret; ret = __sync_fetch_and_nand_16(vplll, vlll); }
         #endif
 …
 int main() {
         return 0;
+        #pragma message( "Compiled" )                   // force non-empty .expect file
+}

tests/cast.cfa

-              r3c64c668
+              r58fe85a
 //Dummy main
+int main(int argc, char const *argv[])
+{
+        return 0;
+int main( int argc, char const * argv[] ) {
+        #pragma message( "Compiled" )                   // force non-empty .expect file
+}

tests/castError.cfa

-              r3c64c668
+              r58fe85a
 //
+forall(otype T) struct S { T p; };
 int f;
+S(int) sint;
 void f() {
 …
         short int v;
 , v;           // implicit void cast
+        (S(char)) sint;
+}

tests/complex.cfa

r3c64c668	r58fe85a
14	14	//
15	15
16		~~#include <stdio.h>~~
17	16	#include <complex.h>
18	17	#ifdef __CFA__

tests/concurrent/.expect/monitor.txt

r3c64c668	r58fe85a
1		4000000
	1	3000000

tests/concurrent/coroutineYield.cfa

r3c64c668	r58fe85a
33	33	sout \| "Coroutine 2";
34	34	#endif
35		suspend();
	35	suspend;
36	36	}
37	37	}

tests/concurrent/examples/.expect/datingService.txt

r3c64c668	r58fe85a
	1	done

tests/concurrent/examples/boundedBufferEXT.cfa

-              r3c64c668
+              r58fe85a
 //
 // Cforall Version 1.0.0 Copyright (C) 2018 University of Waterloo
 //
+//
 // The contents of this file are covered under the licence agreement in the
 // file "LICENCE" distributed with Cforall.
 …
+}
+enum { Prods = 4, Cons = 5 };
+Producer * prods[Prods];
+Consumer * cons[Cons];
 int main() {
         Buffer(int) buffer;
-        enum { Prods = 4, Cons = 5 };
-        Producer * prods[Prods];
-        Consumer * cons[Cons];
         int sums[Cons];
         int i;

tests/concurrent/examples/datingService.cfa

-              r3c64c668
+              r58fe85a
 // Created On       : Mon Oct 30 12:56:20 2017
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Fri Jun 21 11:32:34 2019
 // Update Count     : 38
+// Last Modified On : Sun Sep 27 15:42:25 2020
+// Update Count     : 40
 //
 …
                 signal_block( Boys[ccode] );                                    // restart boy to set phone number
         } // if
         //sout | "Girl:" | PhoneNo | "is dating Boy at" | BoyPhoneNo | "with ccode" | ccode;
+        // sout | "Girl:" | PhoneNo | "is dating Boy at" | BoyPhoneNo | "with ccode" | ccode;
         return BoyPhoneNo;
 } // DatingService girl
 …
                 signal_block( Girls[ccode] );                                   // restart girl to set phone number
         } // if
         //sout | " Boy:" | PhoneNo | "is dating Girl" | GirlPhoneNo | "with ccode" | ccode;
+        // sout | " Boy:" | PhoneNo | "is dating Girl" | GirlPhoneNo | "with ccode" | ccode;
         return GirlPhoneNo;
 } // DatingService boy
 …
                 if ( girlck[ boyck[i] ] != boyck[ girlck[i] ] ) abort();
         } // for
+        printf( "done\n" );                                                                     // non-empty .expect file
 } // main

tests/concurrent/monitor.cfa

r3c64c668	r58fe85a
29	29
30	30	void main( MyThread & this ) {
31		for(int i = 0; i < ~~1_00~~0_000; i++) {
	31	for(int i = 0; i < 750_000; i++) {
32	32	increment( global );
33	33	}

tests/concurrent/park/.expect/force_preempt.txt

-              r3c64c668
+              r58fe85a
+Calling unpark 0
+Calling unpark 0
+Calling unpark 0
+Calling unpark 0
+Calling unpark 0
+Parking 0
+Unparked 0
+Calling unpark 1
+Parking 0
+Unparked 0
+Calling unpark 1
+Parking 0
+Unparked 0
+Calling unpark 1
+Parking 0
+Unparked 0
+Calling unpark 1
+Parking 0
+Unparked 0
+Calling unpark 1
+Parking 1
+Unparked 1
+Calling unpark 2
+Parking 1
+Unparked 1
+Calling unpark 2
+Parking 1
+Unparked 1
+Calling unpark 2
+Parking 1
+Unparked 1
+Calling unpark 2
+Parking 1
+Unparked 1
+Calling unpark 2
+Parking 2
+Unparked 2
+Calling unpark 3
+Parking 2
+Unparked 2
+Calling unpark 3
+Parking 2
+Unparked 2
+Calling unpark 3
+Parking 2
+Unparked 2
+Calling unpark 3
+Parking 2
+Unparked 2
+Calling unpark 3
+Parking 3
+Unparked 3
+Calling unpark 4
+Parking 3
+Unparked 3
+Calling unpark 4
+Parking 3
+Unparked 3
+Calling unpark 4
+Parking 3
+Unparked 3
+Calling unpark 4
+Parking 3
+Unparked 3
+Calling unpark 4
+Parking 4
+Unparked 4
+Parking 4
+Unparked 4
+Parking 4
+Unparked 4
+Parking 4
+Unparked 4
+Parking 4
+Unparked 4
+done

tests/concurrent/park/contention.cfa

r3c64c668	r58fe85a
24	24	} else {
25	25	Thread * thrd = __atomic_exchange_n(&blocked[idx], &this, __ATOMIC_SEQ_CST);
26		unpark( *thrd);
	26	unpark( *thrd );
27	27	park();
28	28	}

tests/concurrent/park/force_preempt.cfa

-              r3c64c668
+              r58fe85a
+}
-thread Waiter;
 thread Waiter {};
 volatile int count = 0;
 …
         // Get a unique id
         int id = __atomic_fetch_add(&count, 1, __ATOMIC_SEQ_CST);
+        int id_hash = id | (id << 8) | (id << 16) | (id << 24);
+        int mask = 0xCAFEBABA;
         for(int i = 0; i < 5; i++) {
+                assert(mask == 0xCAFEBABA);
                 // Unpark this thread, don't force a yield
+                sout | id | "Calling unpark" | i;
+                unpark(this);
+                unpark( this );
+                assert(mask == 0xCAFEBABA);
+                // Hash the mask to make sure no one else messes with it
+                mask ^= id_hash;
+                assert(mask == (id_hash ^ 0xCAFEBABA));
                 // Force a preemption before the call to park
 …
                 // Park this thread,
                 sout | id | "Parking" | i;
+                assert(mask == (id_hash ^ 0xCAFEBABA));
                 park();
+                sout | id | "Unparked" | i;
+                assert(mask == (id_hash ^ 0xCAFEBABA));
+                // Reset the hash and recheck it
+                mask ^= id_hash;
+                assert(mask == 0xCAFEBABA);
+        }
+}
 …
                 Waiter waiters[5];
+        }
+        printf( "done\n" );                             // non-empty .expect file
+}

tests/concurrent/signal/block.cfa

r3c64c668	r58fe85a
82	82	if( !is_empty( cond ) ) {
83	83
84		$thread * next = front( cond );
	84	$thread * next = ( $thread * ) front( cond );
85	85
86	86	if( ! signal_block( cond ) ) {

tests/concurrent/signal/disjoint.cfa

-              r3c64c668
+              r58fe85a
 #endif
+// This test checks what happens when someone barges in the middle of the release
+// of a bulk of monitors.
 enum state_t { WAIT, SIGNAL, BARGE };
 monitor global_t {};
-global_t mut;
 monitor global_data_t;
 …
         int counter;
         state_t state;
+} data;
+};
+// Use a global struct because the order needs to match with Signaller thread
+struct {
+        global_t mut;
+        global_data_t data;
+} globals;
 condition cond;
 …
 void ?{}( global_data_t & this ) {
         this.counter == 0;
+        this.counter = 0;
         this.state = BARGE;
+}
 …
 thread Barger {};
+void ?{}( Barger & this ) {
+        ((thread&)this){ "Barger Thread" };
+}
 void main( Barger & this ) {
         while( !all_done ) {
                 barge( data );
+                barge( globals.data );
                 yield();
+        }
 …
 thread Waiter {};
+void ?{}( Waiter & this ) {
+        ((thread&)this){ "Waiter Thread" };
+}
 void main( Waiter & this ) {
         while( wait( mut, data ) ) { KICK_WATCHDOG; yield(); }
+        while( wait( globals.mut, globals.data ) ) { KICK_WATCHDOG; yield(); }
+}
 …
 void logic( global_t & mutex a ) {
         signal( cond, a, data );
+        signal( cond, a, globals.data );
         yield( random( 10 ) );
         //This is technically a mutual exclusion violation but the mutex monitor protects us
         bool running = TEST(data.counter < N) && data.counter > 0;
         if( data.state != SIGNAL && running ) {
                 sout | "ERROR Eager signal" | data.state;
+        bool running = TEST(globals.data.counter < N) && globals.data.counter > 0;
+        if( globals.data.state != SIGNAL && running ) {
+                sout | "ERROR Eager signal" | globals.data.state;
+        }
+}
 thread Signaller {};
+void ?{}( Signaller & this ) {
+        ((thread&)this){ "Signaller Thread" };
+}
 void main( Signaller & this ) {
         while( !all_done ) {
                 logic( mut );
+                logic( globals.mut );
                 yield();
+        }

tests/concurrent/waitfor/when.cfa

-              r3c64c668
+              r58fe85a
 void arbiter( global_t & mutex this ) {
+        // There is a race at start where callers can get in before the arbiter.
+        // It doesn't really matter here so just restart the loop correctly and move on
+        this.last_call = 6;
         for( int i = 0; i < N; i++ ) {
                    when( this.last_call == 6 ) waitfor( call1 : this ) { if( this.last_call != 1) { serr | "Expected last_call to be 1 got" | this.last_call; } }

tests/config.py.in

r3c64c668	r58fe85a
9	9	HOSTARCH = "@host_cpu@"
10	10	DISTRIBUTE = @HAS_DISTCC@
	11	NEWAST = @DEFAULT_NEW_AST@

tests/copyfile.cfa

-              r3c64c668
+              r58fe85a
 //
 // Author           : Peter A. Buhr
 // Created On       : Tue Jul 16 16:47:22 2019
+// Created On       : Fri Jun 19 13:44:05 2020
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Wed Jul 17 18:04:44 2019
 // Update Count     : 26
+// Last Modified On : Sat Aug 15 15:00:48 2020
+// Update Count     : 6
 //
 #include <fstream.hfa>
 #include <stdlib.hfa>                                                                   // new/delete
+#include <exception.hfa>
 int main( int argc, char * argv[] ) {
+        ifstream * in  = &stdin;                                                        // default files
+        ofstream * out = &stdout;
+        ifstream in  = stdin;                                                           // copy default files
+        ofstream out = stdout;
         try {
                 choose ( argc ) {
+                choose ( argc ) {                                                               // terminate if command-line errors
                   case 2, 3:
                           in = new( (const char *)argv[1] );            // open input file first as output creates file
                           if ( argc == 3 ) out = new( (const char *)argv[2] ); // only open output if input opens as output created if nonexistent
                   case 1: ;                                     // use default files
                   default:
                           exit | "Usage [ input-file (default stdin) [ output-file (default stdout) ] ]";
+                        open( in, argv[1] );                                            // open input file first as output creates file
+                        if ( argc == 3 ) open( out, argv[2] );          // do not create output unless input opens
+                  case 1: ;                                                                             // use default files
+                  default:                                                                              // wrong number of options
+                        exit | "Usage" | argv[0] | "[ input-file (default stdin) [ output-file (default stdout) ] ]";
                 } // choose
+        } catch( Open_Failure * ex ; ex->istream == &in ) {
+                exit | "Unable to open input file" | argv[1];
+        } catch( Open_Failure * ex ; ex->ostream == &out ) {
+                close( in );                                                                    // optional
+                exit | "Unable to open output file" | argv[2];
+        } // try
+                char ch;
+                *out | nlOff;                                                                   // turn off auto newline
+                *in  | nlOn;                                                                    // turn on reading newline
+        out | nlOff;                                                                            // turn off auto newline
+        in  | nlOn;                                                                                     // turn on reading newline
+                for () {                                                                                // read all characters
+                        *in | ch;
+                  if ( eof( *in ) ) break;                                              // eof ?
+                        *out | ch;
+                } // for
+        } finally {
+                if ( in  != &stdin  ) delete( in );                             // close file, do not delete stdin!
+                if ( out != &stdout ) delete( out );                    // close file, do not delete stdout!
+        } // try
+        char ch;
+        for () {                                                                                        // read all characters
+                in | ch;
+          if ( eof( in ) ) break;                                                       // eof ?
+                out | ch;
+        } // for
 } // main

tests/coroutine/.expect/fmtLines.txt

-              r3c64c668
+              r58fe85a
 {                                                         // f  or n  ewli
 ne c  hara  cter  s                                     su
 spen  d();                                      if   ( fm
 t.ch   !=   '\n'   ) b  reak
 ;               /  / ig  nore   new  line
                                   } //   for                              sout
  | f  mt.c  h;                                                  //
 prin  t ch  arac  ter                   }
 // f  or                        sou  t |   "  "
 ;                                                               //   prin  t bl
 ock   sepa  rato  r             }   //
 for             sou  t |   nl;
                                   // p  rint   gro  up s
 epar  ator      } /  / fo  r} /
 / ma  invo  id p  rt(   Form
 at &   fmt  , ch  ar c  h )
 {      fmt  .ch   = ch  ;
  res  ume(   fmt   );}   //
 prti  nt m  ain(  ) {     Form
 at f  mt;         char   ch;    for
  ( ;  ; )   {           s  in |   ch;
                                                                                 //   rea  d on
 e ch  arac  ter     if   ( e
 of(   sin   ) )   brea  k;
                                         //   eof   ?            p  rt(
 fmt,   ch   );  }   //   for}
  //   main  // L  ocal   Var
 iabl  es:   ////   tab  -wid
 th:   4 //  // c  ompi  le-c
 omma  nd:   "cfa   fmt  Line
 s.cf  a" /  ///   End:   //
+spen  d;                                        i  f (   fmt.
+ch !  = '\  n' )   bre  ak;
+        //   igno  re n  ewli  ne
+                }   // f  or                            so  ut |
+ fmt  .ch;                                                      /  / pr
+int   char  acte  r                       } //
+ for                    s  out   | "    ";
+                                                        /  / pr  int   bloc
+k se  para  tor         } /  / fo
+r               s  out   | nl  ;
+                //   pri  nt g  roup   sep
+arat  or        }   //   for}   //
+main  void   prt  ( Fo  rmat
+ & f  mt,   char   ch   ) {
+   f  mt.c  h =   ch;      r
+esum  e( f  mt )  ;} /  / pr
+tint   mai  n()   {     Fo  rmat
+ fmt  ; ch  ar c  h;    f  or (
+ ;;   ) {               sin   | c  h;
+                                                                  // r  ead   one
+char  acte  r       if (   eof
+( si  n )   ) br  eak;
+                        /  / eo  f ?            prt  ( fm
+t, c  h );      } /  / fo  r} /
+/ ma  in//   Loc  al V  aria
+bles  : //  // t  ab-w  idth
+: 4   ////   com  pile  -com
+mand  : "c  fa f  mtLi  nes.
+cfa"   ///  / En  d: /  /

tests/coroutine/.in/fmtLines.txt

r3c64c668	r58fe85a
35	35	for ( fmt.b = 0; fmt.b < 4; fmt.b += 1 ) { // blocks of 4 characters
36	36	for ( ;; ) { // for newline characters
37		suspend();
	37	suspend;
38	38	if ( fmt.ch != '\n' ) break; // ignore newline
39	39	} // for

tests/coroutine/cntparens.cfa

-              r3c64c668
+              r58fe85a
 //
+//
 // Cforall Version 1.0.0 Copyright (C) 2017 University of Waterloo
 //
 // The contents of this file are covered under the licence agreement in the
 // file "LICENCE" distributed with Cforall.
 //
+//
 // cntparens.cfa -- match left/right parenthesis
 //
+//
 // Author           : Peter A. Buhr
 // Created On       : Sat Apr 20 11:04:45 2019
 …
 // Last Modified On : Sat Apr 20 11:06:21 2019
 // Update Count     : 1
 //
+//
 #include <fstream.hfa>
 …
 void main( CntParens & cpns ) with( cpns ) {
         for ( ; ch == '('; cnt += 1 ) {                                         // left parenthesis
                 suspend();
+                suspend;
+        }
         for ( ; ch == ')' && cnt > 1; cnt -= 1 ) {                      // right parenthesis
                 suspend();
+                suspend;
+        }
         status = ch == ')' ? Match : Error;
 } // main
 void ?{}( CntParens & cpns ) with( cpns ) { status = Cont; cnt = 0; }

tests/coroutine/devicedriver.cfa

-              r3c64c668
+              r58fe85a
 //
+//
 // Cforall Version 1.0.0 Copyright (C) 2017 University of Waterloo
 //
 // The contents of this file are covered under the licence agreement in the
 // file "LICENCE" distributed with Cforall.
 //
 // devicedriver.cfa --
 //
+//
+// devicedriver.cfa --
+//
 // Author           : Peter A. Buhr
 // Created On       : Sat Mar 16 15:30:34 2019
 …
 // Last Modified On : Sat Apr 20 09:07:19 2019
 // Update Count     : 90
 //
+//
 #include <fstream.hfa>
 …
 void checkCRC( Driver & d, unsigned int sum ) with( d ) {
         suspend();
+        suspend;
         unsigned short int crc = byte << 8;                                     // sign extension over written
         suspend();
+        suspend;
         // prevent sign extension for signed char
         status = (crc | (unsigned char)byte) == sum ? MSG : ECRC;
 …
                 status = CONT;
                 unsigned int lnth = 0, sum = 0;
                 while ( byte != STX ) suspend();
+                while ( byte != STX ) suspend;
           emsg: for () {
                         suspend();
+                        suspend;
                         choose ( byte ) {                                                       // process byte
                           case STX:
                                 status = ESTX; suspend(); continue msg;
+                                status = ESTX; suspend; continue msg;
                           case ETX:
                                 break emsg;
                           case ESC:
                                 suspend();
+                                suspend;
                         } // choose
                         if ( lnth >= MaxMsg ) {                                         // buffer full ?
                                 status = ELNTH; suspend(); continue msg;
+                                status = ELNTH; suspend; continue msg;
                         } // if
                         msg[lnth++] = byte;
 …
                 msg[lnth] = '\0';                                                               // terminate string
                 checkCRC( d, sum );                                                             // refactor CRC check
                 suspend();
+                suspend;
         } // for
 } // main

tests/coroutine/fibonacci.cfa

-              r3c64c668
+              r58fe85a
         int fn1, fn2;                                                                           // retained between resumes
         fn = 0;  fn1 = fn;                                                                      // 1st case
         suspend();                                                                                      // restart last resume
+        suspend;                                                                                        // restart last resume
         fn = 1;  fn2 = fn1;  fn1 = fn;                                          // 2nd case
         suspend();                                                                                      // restart last resume
+        suspend;                                                                                        // restart last resume
         for () {
                 fn = fn1 + fn2;  fn2 = fn1;  fn1 = fn;                  // general case
                 suspend();                                                                              // restart last resume
+                suspend;                                                                                // restart last resume
         } // for
+}

tests/coroutine/fibonacci_1.cfa

-              r3c64c668
+              r58fe85a
 // Last Modified On : Thu Mar 21 08:10:45 2019
 // Update Count     : 25
 //
+//
 #include <fstream.hfa>
 …
         [fn1, fn] = [0, 1];                                                                     // precompute first two states
         for () {
                 suspend();                                                                              // restart last resume
+                suspend;                                                                                // restart last resume
                 [fn1, fn] = [fn, fn1 + fn];                                             // general case
         } // for

tests/coroutine/fmtLines.cfa

r3c64c668	r58fe85a
27	27	for ( b = 0; b < 4; b += 1 ) { // blocks of 4 characters
28	28	for () { // for newline characters
29		suspend();
	29	suspend;
30	30	if ( ch != '\n' ) break; // ignore newline
31	31	} // for

tests/coroutine/raii.cfa

r3c64c668	r58fe85a
39	39	Raii raii = { "Coroutine" };
40	40	sout \| "Before Suspend";
41		suspend();
	41	suspend;
42	42	sout \| "After Suspend";
43	43	}

tests/coroutine/runningTotal.cfa

r3c64c668	r58fe85a
25	25	void update( RunTotal & rntl, int input ) with( rntl ) { // helper
26	26	total += input; // remember between activations
27		suspend(); // inactivate on stack
	27	suspend; // inactivate on stack
28	28	}
29	29

tests/coroutine/suspend_then.cfa

-              r3c64c668
+              r58fe85a
 #include <fstream.hfa>
-#include <coroutine.hfa>
+void then() {
+        sout | "Then!";
+}
+coroutine Fibonacci { int fn; };                                                // used for communication
+generator Fibonacci {
+        int fn;                                                                         // used for communication
+        int fn1, fn2;                                                           // retained between resumes
+};
 void main( Fibonacci & fib ) with( fib ) {                              // called on first resume
-        int fn1, fn2;                                                           // retained between resumes
         fn = 0;  fn1 = fn;                                                      // 1st case
         suspend_then(then);                                                     // restart last resume
+        suspend { sout | "Then!"; }                                             // restart last resume
         fn = 1;  fn2 = fn1;  fn1 = fn;                                  // 2nd case
         suspend_then(then);                                                     // restart last resume
+        suspend { sout | "Then!"; }                                             // restart last resume
         for () {
                 fn = fn1 + fn2;  fn2 = fn1;  fn1 = fn;                  // general case
                 suspend_then(then);                                             // restart last resume
+                suspend { sout | "Then!"; }                                     // restart last resume
         } // for
+}

tests/enum.cfa

r3c64c668	r58fe85a
26	26	//Dummy main
27	27	int main(int argc, char const *argv[]) {
	28	printf( "done\n" ); // non-empty .expect file
28	29	}

tests/expression.cfa

-              r3c64c668
+              r58fe85a
 int main() {
     int a[3] = { 0, 0, 0 };
     S s = { 3 }, * ps = &s;
     [int] t = { 3 };
     * [int] pt = &t;
     int i = 1, j = 2;
+        int a[3] = { 0, 0, 0 };
+        S s = { 3 }, * ps = &s;
+        [int] t = { 3 };
+        * [int] pt = &t;
+        int i = 1, j = 2;
     // operators
+        // operators
     !i;
     ~i;
     +i;
     -i;
     *ps;
     ++ps;
     --ps;
     ps++;
     ps--;
+        !i;
+        ~i;
+        +i;
+        -i;
+        *ps;
+        ++ps;
+        --ps;
+        ps++;
+        ps--;
     i + j;
     i - j;
     i * j;
+        i + j;
+        i - j;
+        i * j;
     i / j;
     i % j;
     i ^ j;
     i & j;
     i | j;
     i < j;
     i > j;
     i = j;
+        i / j;
+        i % j;
+        i ^ j;
+        i & j;
+        i | j;
+        i < j;
+        i > j;
+        i = j;
     i == j;
     i != j;
     i << j;
     i >> j;
     i <= j;
     i >= j;
     i && j;
     i || j;
     ps->i;
+        i == j;
+        i != j;
+        i << j;
+        i >> j;
+        i <= j;
+        i >= j;
+        i && j;
+        i || j;
+        ps->i;
     i *= j;
     i /= j;
     i %= j;
     i += j;
     i -= j;
     i &= j;
     i |= j;
     i ^= j;
     i <<= j;
     i >>= j;
+        i *= j;
+        i /= j;
+        i %= j;
+        i += j;
+        i -= j;
+        i &= j;
+        i |= j;
+        i ^= j;
+        i <<= j;
+        i >>= j;
     i ? i : j;
+        i ? i : j;
     // postfix function call
+        // postfix function call
+    (3 + 4)`mary;
+    ({3 + 4;})`mary;
+    [3, 4]`mary;
+`mary;
+    a[0]`mary;
+    a[0]`mary`mary;
+    s{0}`mary;
+    a[3]`jane++;
+    jack(3)`mary;
+    s.i`mary;
+    t.0`mary;
+    s.[i]`mary;
+    ps->i`mary;
+    pt->0`mary;
+    ps->[i]`mary;
+    i++`mary;
+    i--`mary;
+    (S){2}`mary;
+    (S)@{2}`mary;
+        (3 + 4)`mary;
+        ({3 + 4;})`mary;
+        [3, 4]`mary;
+`mary;
+        a[0]`mary;
+        a[0]`mary`mary;
+        s{0}`mary;
+        a[3]`jane++;
+        jack(3)`mary;
+        s.i`mary;
+        t.0`mary;
+        s.[i]`mary;
+        ps->i`mary;
+        pt->0`mary;
+        ps->[i]`mary;
+        i++`mary;
+        i--`mary;
+        (S){2}`mary;
+        (S)@{2}`mary;
+        #if !defined(NO_COMPILED_PRAGMA)
+                #pragma message( "Compiled" )   // force non-empty .expect file
+        #endif
 } // main

tests/forall.cfa

-              r3c64c668
+              r58fe85a
 // Created On       : Wed May  9 08:48:15 2018
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Tue Mar 19 08:29:38 2019
 // Update Count     : 32
+// Last Modified On : Sun Sep 27 08:43:20 2020
+// Update Count     : 35
 //
 …
+}
 forall( otype T ) inline static {
         int RT9( T ) { T t; }
+        int RT9( T ) { T t; return 3; }
+}
 …
 // w3 g3;
+int main( void ) {}
+int main( void ) {
+        #pragma message( "Compiled" )                   // force non-empty .expect file
+}
 // Local Variables: //

tests/heap.cfa

-              r3c64c668
+              r58fe85a
 // Created On       : Tue Nov  6 17:54:56 2018
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Sun Nov 24 12:34:51 2019
 // Update Count     : 28
+// Last Modified On : Tue Dec 15 12:11:51 2020
+// Update Count     : 79
 //
 …
 // }
+#define __U_DEFAULT_MMAP_START__ (512 * 1024 + 1)
+size_t default_mmap_start() __attribute__(( weak )) {
+        return __U_DEFAULT_MMAP_START__;
+size_t default_heap_expansion() {
+        return 10 * 1024 * 1024;
+} // default_heap_expansion
+size_t default_mmap_start() {
+        return 512 * 1024 + 1;
 } // default_mmap_start
 …
                 size_t s = (i + 1) * 20;
                 char * area = (char *)malloc( s );
-                if ( area == 0p ) abort( "malloc/free out of memory" );
                 area[0] = '\345'; area[s - 1] = '\345';                 // fill first/last
                 area[malloc_usable_size( area ) - 1] = '\345';  // fill ultimate byte
 …
                 size_t s = i + 1;                                                               // +1 to make initialization simpler
                 locns[i] = (char *)malloc( s );
-                if ( locns[i] == 0p ) abort( "malloc/free out of memory" );
                 locns[i][0] = '\345'; locns[i][s - 1] = '\345'; // fill first/last
                 locns[i][malloc_usable_size( locns[i] ) - 1] = '\345'; // fill ultimate byte
 …
                 size_t s = i + default_mmap_start();                    // cross over point
                 char * area = (char *)malloc( s );
-                if ( area == 0p ) abort( "malloc/free out of memory" );
                 area[0] = '\345'; area[s - 1] = '\345';                 // fill first/last
                 area[malloc_usable_size( area ) - 1] = '\345';  // fill ultimate byte
 …
                 size_t s = i + default_mmap_start();                    // cross over point
                 locns[i] = (char *)malloc( s );
-                if ( locns[i] == 0p ) abort( "malloc/free out of memory" );
                 locns[i][0] = '\345'; locns[i][s - 1] = '\345'; // fill first/last
                 locns[i][malloc_usable_size( locns[i] ) - 1] = '\345'; // fill ultimate byte
 …
                 size_t s = (i + 1) * 20;
                 char * area = (char *)calloc( 5, s );
-                if ( area == 0p ) abort( "calloc/free out of memory" );
                 if ( area[0] != '\0' || area[s - 1] != '\0' ||
                          area[malloc_usable_size( area ) - 1] != '\0' ||
+                         area[malloc_size( area ) - 1] != '\0' ||
                          ! malloc_zero_fill( area ) ) abort( "calloc/free corrupt storage1" );
                 area[0] = '\345'; area[s - 1] = '\345';                 // fill first/last
 …
                 size_t s = i + 1;
                 locns[i] = (char *)calloc( 5, s );
-                if ( locns[i] == 0p ) abort( "calloc/free out of memory" );
                 if ( locns[i][0] != '\0' || locns[i][s - 1] != '\0' ||
                          locns[i][malloc_usable_size( locns[i] ) - 1] != '\0' ||
+                         locns[i][malloc_size( locns[i] ) - 1] != '\0' ||
                          ! malloc_zero_fill( locns[i] ) ) abort( "calloc/free corrupt storage2" );
                 locns[i][0] = '\345'; locns[i][s - 1] = '\345'; // fill first/last
 …
                 size_t s = i + default_mmap_start();                    // cross over point
                 char * area = (char *)calloc( 1, s );
-                if ( area == 0p ) abort( "calloc/free out of memory" );
                 if ( area[0] != '\0' || area[s - 1] != '\0' ) abort( "calloc/free corrupt storage4.1" );
                 if ( area[malloc_usable_size( area ) - 1] != '\0' ) abort( "calloc/free corrupt storage4.2" );
+                if ( area[malloc_size( area ) - 1] != '\0' ) abort( "calloc/free corrupt storage4.2" );
                 if ( ! malloc_zero_fill( area ) ) abort( "calloc/free corrupt storage4.3" );
                 area[0] = '\345'; area[s - 1] = '\345';                 // fill first/last
 …
                 size_t s = i + default_mmap_start();                    // cross over point
                 locns[i] = (char *)calloc( 1, s );
-                if ( locns[i] == 0p ) abort( "calloc/free out of memory" );
                 if ( locns[i][0] != '\0' || locns[i][s - 1] != '\0' ||
                          locns[i][malloc_usable_size( locns[i] ) - 1] != '\0' ||
+                         locns[i][malloc_size( locns[i] ) - 1] != '\0' ||
                          ! malloc_zero_fill( locns[i] ) ) abort( "calloc/free corrupt storage5" );
                 locns[i][0] = '\345'; locns[i][s - 1] = '\345'; // fill first/last
 …
                 for ( s; 1 ~ NoOfAllocs ) {                                             // allocation of size 0 can return null
                         char * area = (char *)memalign( a, s );
-                        if ( area == 0p ) abort( "memalign/free out of memory" );
                         //sout | i | area;
                         if ( (size_t)area % a != 0 || malloc_alignment( area ) != a ) { // check for initial alignment
 …
                         size_t s = i + default_mmap_start();            // cross over point
                         char * area = (char *)memalign( a, s );
-                        if ( area == 0p ) abort( "memalign/free out of memory" );
                         //sout | i | area;
                         if ( (size_t)area % a != 0 || malloc_alignment( area ) != a ) { // check for initial alignment
 …
         } // for
+        // check malloc/resize/free (sbrk)
+        for ( i; 2 ~ NoOfAllocs ~ 12 ) {
+                // initial N byte allocation
+                char * area = (char *)malloc( i );
+                area[0] = '\345'; area[i - 1] = '\345';                 // fill first/penultimate byte
+                // Do not start this loop index at 0 because resize of 0 bytes frees the storage.
+                int prev = i;
+                for ( s; i ~ 256 * 1024 ~ 26 ) {                                // start at initial memory request
+                        if ( area[0] != '\345' || area[prev - 1] != '\345' ) abort( "malloc/resize/free corrupt storage" );
+                        area = (char *)resize( area, s );                       // attempt to reuse storage
+                        area[0] = area[s - 1] = '\345';                         // fill last byte
+                        prev = s;
+                } // for
+                free( area );
+        } // for
+        // check malloc/resize/free (mmap)
+        for ( i; 2 ~ NoOfAllocs ~ 12 ) {
+                // initial N byte allocation
+                size_t s = i + default_mmap_start();                    // cross over point
+                char * area = (char *)malloc( s );
+                area[0] = '\345'; area[s - 1] = '\345';                 // fill first/penultimate byte
+                // Do not start this loop index at 0 because resize of 0 bytes frees the storage.
+                int prev = s;
+                for ( r; s ~ 256 * 1024 ~ 26 ) {                                // start at initial memory request
+                        if ( area[0] != '\345' || area[prev - 1] != '\345' ) abort( "malloc/resize/free corrupt storage" );
+                        area = (char *)resize( area, s );                       // attempt to reuse storage
+                        area[0] = area[r - 1] = '\345';                         // fill last byte
+                        prev = r;
+                } // for
+                free( area );
+        } // for
+        // check malloc/realloc/free (sbrk)
+        for ( i; 2 ~ NoOfAllocs ~ 12 ) {
+                // initial N byte allocation
+                char * area = (char *)malloc( i );
+                area[0] = '\345'; area[i - 1] = '\345';                 // fill first/penultimate byte
+                // Do not start this loop index at 0 because realloc of 0 bytes frees the storage.
+                int prev = i;
+                for ( s; i ~ 256 * 1024 ~ 26 ) {                                // start at initial memory request
+                        if ( area[0] != '\345' || area[prev - 1] != '\345' ) abort( "malloc/realloc/free corrupt storage" );
+                        area = (char *)realloc( area, s );                      // attempt to reuse storage
+                        area[s - 1] = '\345';                                           // fill last byte
+                        prev = s;
+                } // for
+                free( area );
+        } // for
+        // check malloc/realloc/free (mmap)
+        for ( i; 2 ~ NoOfAllocs ~ 12 ) {
+                // initial N byte allocation
+                size_t s = i + default_mmap_start();                    // cross over point
+                char * area = (char *)malloc( s );
+                area[0] = '\345'; area[s - 1] = '\345';                 // fill first/penultimate byte
+                // Do not start this loop index at 0 because realloc of 0 bytes frees the storage.
+                int prev = s;
+                for ( r; s ~ 256 * 1024 ~ 26 ) {                                // start at initial memory request
+                        if ( area[0] != '\345' || area[prev - 1] != '\345' ) abort( "malloc/realloc/free corrupt storage" );
+                        area = (char *)realloc( area, s );                      // attempt to reuse storage
+                        area[r - 1] = '\345';                                           // fill last byte
+                        prev = r;
+                } // for
+                free( area );
+        } // for
         // check calloc/realloc/free (sbrk)
 …
                 // initial N byte allocation
                 char * area = (char *)calloc( 5, i );
-                if ( area == 0p ) abort( "calloc/realloc/free out of memory" );
                 if ( area[0] != '\0' || area[i - 1] != '\0' ||
                          area[malloc_usable_size( area ) - 1] != '\0' ||
+                         area[malloc_size( area ) - 1] != '\0' ||
                          ! malloc_zero_fill( area ) ) abort( "calloc/realloc/free corrupt storage1" );
 …
                 for ( s; i ~ 256 * 1024 ~ 26 ) {                                // start at initial memory request
                         area = (char *)realloc( area, s );                      // attempt to reuse storage
-                        if ( area == 0p ) abort( "calloc/realloc/free out of memory" );
                         if ( area[0] != '\0' || area[s - 1] != '\0' ||
                                  area[malloc_usable_size( area ) - 1] != '\0' ||
+                                 area[malloc_size( area ) - 1] != '\0' ||
                                  ! malloc_zero_fill( area ) ) abort( "calloc/realloc/free corrupt storage2" );
                 } // for
 …
                 size_t s = i + default_mmap_start();                    // cross over point
                 char * area = (char *)calloc( 1, s );
-                if ( area == 0p ) abort( "calloc/realloc/free out of memory" );
                 if ( area[0] != '\0' || area[s - 1] != '\0' ||
                          area[malloc_usable_size( area ) - 1] != '\0' ||
                          ! malloc_zero_fill( area ) ) abort( "calloc/realloc/free corrupt storage1" );
+                         area[malloc_size( area ) - 1] != '\0' ||
+                         ! malloc_zero_fill( area ) ) abort( "calloc/realloc/free corrupt storage3" );
                 // Do not start this loop index at 0 because realloc of 0 bytes frees the storage.
                 for ( r; i ~ 256 * 1024 ~ 26 ) {                                // start at initial memory request
                         area = (char *)realloc( area, r );                      // attempt to reuse storage
-                        if ( area == 0p ) abort( "calloc/realloc/free out of memory" );
                         if ( area[0] != '\0' || area[r - 1] != '\0' ||
                                  area[malloc_usable_size( area ) - 1] != '\0' ||
                                  ! malloc_zero_fill( area ) ) abort( "calloc/realloc/free corrupt storage2" );
+                                 area[malloc_size( area ) - 1] != '\0' ||
+                                 ! malloc_zero_fill( area ) ) abort( "calloc/realloc/free corrupt storage4" );
                 } // for
                 free( area );
 …
                 // initial N byte allocation
                 char * area = (char *)memalign( a, amount );    // aligned N-byte allocation
-                if ( area == 0p ) abort( "memalign/realloc/free out of memory" ); // no storage ?
                 //sout | alignments[a] | area;
                 if ( (size_t)area % a != 0 || malloc_alignment( area ) != a ) { // check for initial alignment
 …
                         if ( area[0] != '\345' || area[s - 2] != '\345' ) abort( "memalign/realloc/free corrupt storage" );
                         area = (char *)realloc( area, s );                      // attempt to reuse storage
-                        if ( area == 0p ) abort( "memalign/realloc/free out of memory" ); // no storage ?
                         //sout | i | area;
                         if ( (size_t)area % a != 0 ) {                          // check for initial alignment
 …
                 for ( s; 1 ~ limit ) {                                                  // allocation of size 0 can return null
                         char * area = (char *)cmemalign( a, 1, s );
-                        if ( area == 0p ) abort( "cmemalign/free out of memory" );
                         //sout | i | area;
                         if ( (size_t)area % a != 0 || malloc_alignment( area ) != a ) { // check for initial alignment
 …
                         } // if
                         if ( area[0] != '\0' || area[s - 1] != '\0' ||
                                  area[malloc_usable_size( area ) - 1] != '\0' ||
+                                 area[malloc_size( area ) - 1] != '\0' ||
                                  ! malloc_zero_fill( area ) ) abort( "cmemalign/free corrupt storage" );
                         area[0] = '\345'; area[s - 1] = '\345';         // fill first/last byte
 …
                 // initial N byte allocation
                 char * area = (char *)cmemalign( a, 1, amount ); // aligned N-byte allocation
-                if ( area == 0p ) abort( "cmemalign/realloc/free out of memory" ); // no storage ?
                 //sout | alignments[a] | area;
                 if ( (size_t)area % a != 0 || malloc_alignment( area ) != a ) { // check for initial alignment
 …
                 } // if
                 if ( area[0] != '\0' || area[amount - 1] != '\0' ||
                          area[malloc_usable_size( area ) - 1] != '\0' ||
+                         area[malloc_size( area ) - 1] != '\0' ||
                          ! malloc_zero_fill( area ) ) abort( "cmemalign/realloc/free corrupt storage1" );
                 area[0] = '\345'; area[amount - 2] = '\345';    // fill first/penultimate byte
 …
                         if ( area[0] != '\345' || area[s - 2] != '\345' ) abort( "cmemalign/realloc/free corrupt storage2" );
                         area = (char *)realloc( area, s );                      // attempt to reuse storage
-                        if ( area == 0p ) abort( "cmemalign/realloc/free out of memory" ); // no storage ?
                         //sout | i | area;
                         if ( (size_t)area % a != 0 || malloc_alignment( area ) != a ) { // check for initial alignment
                                 abort( "cmemalign/realloc/free bad alignment %p", area );
                         } // if
                         if ( area[s - 1] != '\0' || area[s - 1] != '\0' ||
                                  area[malloc_usable_size( area ) - 1] != '\0' ||
+                        if ( area[0] != '\345' || area[s - 1] != '\0' ||
+                                 area[malloc_size( area ) - 1] != '\0' ||
                                  ! malloc_zero_fill( area ) ) abort( "cmemalign/realloc/free corrupt storage3" );
                         area[s - 1] = '\345';                                           // fill last byte
 …
         } // for
+        // check memalign/resize with align/free
+        amount = 2;
+        for ( a; libAlign() ~= limit ~ a ) {                            // generate powers of 2
+                // initial N byte allocation
+                char * area = (char *)memalign( a, amount );    // aligned N-byte allocation
+                //sout | alignments[a] | area | endl;
+                if ( (size_t)area % a != 0 || malloc_alignment( area ) != a ) { // check for initial alignment
+                        abort( "memalign/resize with align/free bad alignment : memalign(%d,%d) = %p", (int)a, (int)amount, area );
+                } // if
+                area[0] = '\345'; area[amount - 2] = '\345';    // fill first/penultimate byte
+                // Do not start this loop index at 0 because resize of 0 bytes frees the storage.
+                for ( s; amount ~ 256 * 1024 ) {                                // start at initial memory request
+                        area = (char *)resize( area, a * 2, s );        // attempt to reuse storage
+                        //sout | i | area | endl;
+                        if ( (size_t)area % a * 2 != 0 ) {                      // check for initial alignment
+                                abort( "memalign/resize with align/free bad alignment %p", area );
+                        } // if
+                        area[s - 1] = '\345';                                           // fill last byte
+                } // for
+                free( area );
+        } // for
         // check memalign/realloc with align/free
 …
                 // initial N byte allocation
                 char * area = (char *)memalign( a, amount );    // aligned N-byte allocation
-                if ( area == 0p ) abort( "memalign/realloc with align/free out of memory" ); // no storage ?
                 //sout | alignments[a] | area | endl;
                 if ( (size_t)area % a != 0 || malloc_alignment( area ) != a ) { // check for initial alignment
 …
                         if ( area[0] != '\345' || area[s - 2] != '\345' ) abort( "memalign/realloc/free corrupt storage" );
                         area = (char *)realloc( area, a * 2, s );       // attempt to reuse storage
-                        if ( area == 0p ) abort( "memalign/realloc with align/free out of memory" ); // no storage ?
                         //sout | i | area | endl;
                         if ( (size_t)area % a * 2 != 0 ) {                      // check for initial alignment
 …
         for ( size_t a = libAlign() + libAlign(); a <= limit; a += a ) { // generate powers of 2
                 // initial N byte allocation
+                char *area = (char *)cmemalign( a, 1, amount ); // aligned N-byte allocation
+                if ( area == 0p ) abort( "cmemalign/realloc with align/free out of memory" ); // no storage ?
+                char * area = (char *)cmemalign( a, 1, amount ); // aligned N-byte allocation
                 //sout | alignments[a] | area | endl;
                 if ( (size_t)area % a != 0 || malloc_alignment( area ) != a ) { // check for initial alignment
 …
                 } // if
                 if ( area[0] != '\0' || area[amount - 1] != '\0' ||
                          area[malloc_usable_size( area ) - 1] != '\0' ||
+                         area[malloc_size( area ) - 1] != '\0' ||
                          ! malloc_zero_fill( area ) ) abort( "cmemalign/realloc with align/free corrupt storage1" );
                 area[0] = '\345'; area[amount - 2] = '\345';    // fill first/penultimate byte
 …
                         if ( area[0] != '\345' || area[s - 2] != '\345' ) abort( "cmemalign/realloc with align/free corrupt storage2" );
                         area = (char *)realloc( area, a * 2, s );       // attempt to reuse storage
-                        if ( area == 0p ) abort( "cmemalign/realloc with align/free out of memory" ); // no storage ?
                         //sout | i | area | endl;
                         if ( (size_t)area % a * 2 != 0 || malloc_alignment( area ) != a * 2 ) { // check for initial alignment
                                 abort( "cmemalign/realloc with align/free bad alignment %p %jd %jd", area, malloc_alignment( area ), a * 2 );
+                                abort( "cmemalign/realloc with align/free bad alignment %p %zd %zd", area, malloc_alignment( area ), a * 2 );
                         } // if
                         if ( area[s - 1] != '\0' || area[s - 1] != '\0' ||
                                  area[malloc_usable_size( area ) - 1] != '\0' ||
+                                 area[malloc_size( area ) - 1] != '\0' ||
                                  ! malloc_zero_fill( area ) ) abort( "cmemalign/realloc/free corrupt storage3" );
                         area[s - 1] = '\345';                                           // fill last byte
 …
         // checkFreeOn();
         // malloc_stats();
+        printf( "done\n" );                                                                     // non-empty .expect file
+}

tests/identFuncDeclarator.cfa

-              r3c64c668
+              r58fe85a
 // Created On       : Wed Aug 17 08:36:34 2016
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Tue Nov  6 17:56:33 2018
 // Update Count     : 3
+// Last Modified On : Sun Sep 27 08:20:46 2020
+// Update Count     : 5
 //
 …
         int (* (* const f80)(int))();
         int (* const(* const f81)(int))();
+        #pragma message( "Compiled" )                   // force non-empty .expect file
+}

tests/identParamDeclarator.cfa

-              r3c64c668
+              r58fe85a
 // Created On       : Wed Aug 17 08:37:56 2016
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Tue Nov  6 17:56:44 2018
 // Update Count     : 3
+// Last Modified On : Fri Sep 25 14:31:08 2020
+// Update Count     : 4
 //
 …
 int main( int argc, char const *argv[] ) {                              // dummy main
         return 0;
+        printf( "done\n" );                                                                     // non-empty .expect file
+}

tests/io2.cfa

r3c64c668	r58fe85a
121	121
122	122	[int, int, const char *, double] t3 = { 3, 4, "a", 7.2 };
123		sout \| [ 3, 4, ~~"a", 7.2 ];~~
	123	sout \| [ 3, 4, (const char*)"a", 7.2 ]; // workaround trac#207: the const cast should not be needed
124	124	sout \| t3;
125	125	sepSetTuple( sout, " " );

tests/labelledExit.cfa

-              r3c64c668
+              r58fe85a
 // Created On       : Wed Aug 10 07:29:39 2016
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Wed Feb  5 16:49:48 2020
 // Update Count     : 9
+// Last Modified On : Sun Sep 27 09:01:34 2020
+// Update Count     : 12
 //
 …
 int main( int argc, char const *argv[] ) {
         /* code */
+        #pragma message( "Compiled" )                                           // force non-empty .expect file
+}

tests/limits.cfa

-              r3c64c668
+              r58fe85a
 // Created On       : Tue May 10 20:44:20 2016
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Tue Nov  6 17:57:55 2018
 // Update Count     : 8
+// Last Modified On : Sun Sep 27 08:45:43 2020
+// Update Count     : 10
 //
+// Note: For testing the ability to load the constants defined in libcfa/src/limits.cfa,
+// see discussion in test const-init.
 #include <limits.hfa>
 …
 int main(int argc, char const *argv[]) {
+        //DUMMY
+        return 0;
+        #pragma message( "Compiled" )                                           // force non-empty .expect file
+}

tests/linking/withthreads.cfa

r3c64c668	r58fe85a
5	5	// file "LICENCE" distributed with Cforall.
6	6	//
7		// nothreads.cfa --
	7	// withthreads.cfa --
8	8	//
9	9	// Author : Thierry Delisle

tests/literals.cfa

-              r3c64c668
+              r58fe85a
 // Created On       : Sat Sep  9 16:34:38 2017
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Tue Feb 12 08:07:39 2019
 // Update Count     : 224
+// Last Modified On : Sat Aug 29 10:57:56 2020
+// Update Count     : 226
 //
 …
         -0X0123456789.0123456789P-09;  -0X0123456789.0123456789P-09f;  -0X0123456789.0123456789P-09l;  -0X0123456789.0123456789P-09F;  -0X0123456789.0123456789P-09L;
+#if defined( __i386 ) || defined( __x86_64 )
 #if defined(__GNUC__) && __GNUC_PREREQ(7,0)                             // gcc version >= 7
 // floating with length, gcc f16/f128x unsupported and no prelude code for any _FloatXXx, so they work by conversion to long double
 …
         /* -0x123456789.0123456789P-09F16; */  -0x123456789.0123456789P-09F32;  -0x123456789.0123456789P-09F32x;  -0x123456789.0123456789P-09F64;  -0x123456789.0123456789P-09F64x;  -0x123456789.0123456789P-09W;  -0x123456789.0123456789P-09F128;  -0x123456789.0123456789P-09q;  /* -0x123456789.0123456789P-09q; */
 #endif // __GNUC_PREREQ(7,0)
+#endif // __i386 ) || __x86_64
 #ifdef __CFA__
 …
         -01234567_l8;  -01234567_l16;  -01234567_l32;  -01234567_l64;  -01234567_l8u;  -01234567_ul16;  -01234567_l32u;  -01234567_ul64;
 #ifdef __LP64__ // 64-bit processor
+#if defined( __SIZEOF_INT128__ )
         01234567_l128;   01234567_ul128;
         +01234567_l128;  +01234567_ul128;
         -01234567_l128;  -01234567_ul128;
 #endif // __LP64__
+#endif // __SIZEOF_INT128__
         // decimal
 …
         -1234567890L8;  -1234567890L16;  -1234567890l32;  -1234567890l64;  -1234567890UL8;  -1234567890L16U;  -1234567890Ul32;  -1234567890l64u;
 #ifdef __LP64__ // 64-bit processor
+#if defined( __SIZEOF_INT128__ )
         1234567890l128;   1234567890l128u;
         +1234567890l128;  +1234567890l128u;
         -1234567890l128;  -1234567890l128u;
+#endif // __LP64__
+    1234567890123456789_L128u; 1234567890123456789_L128u;
+        18446708753438544741_l64u; 18446708753438544741_Ul64;
+#endif // __SIZEOF_INT128__
         // hexadecimal

tests/manipulatorsInput.cfa

-              r3c64c668
+              r58fe85a
 // Created On       : Sat Jun  8 17:58:54 2019
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Thu Jun 13 17:41:43 2019
 // Update Count     : 37
+// Last Modified On : Wed Jul 15 15:56:03 2020
+// Update Count     : 47
 //
 …
                 sin | ignore( wdi( 8, ldc ) );                  sout | ldc;
+        }
+#if defined( __SIZEOF_INT128__ )
+        {
+                int128 val;
+                for ( 15 ) {
+                        sin | val;
+                        sout | val;
+                }
+        }
+#endif // __SIZEOF_INT128__
 } // main

tests/manipulatorsOutput1.cfa

-              r3c64c668
+              r58fe85a
 // Created On       : Sat Jun  8 18:04:11 2019
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Mon Jun 10 12:37:28 2019
 // Update Count     : 8
+// Last Modified On : Fri May  1 11:51:44 2020
+// Update Count     : 9
 //
 …
         signed char sc = -12;
         printf( "%hhd %2hhd %5.2hhd %-5.2hhd %hho %#hho %hhx %#hhx %#8hhx %#8.10hhx %#8.3hhX %+-8.3hhd %08hhd\n", sc, sc, sc, sc, sc, sc, sc, sc, sc, sc, sc, sc, sc );
+        sout | sc | wd(2,sc) | wd(5,2,sc) | left(wd(5,2,sc)) | nobase(oct(sc)) | oct(sc) | nobase(hex(sc)) | hex(sc) | wd(8,hex(sc)) | wd(8,10,hex(sc)) | upcase(wd(8,3,hex(sc))) | left(sign(upcase(wd(8,3,sc)))) | pad0(wd(8,sc));
+        sout | sc | wd(2,sc) | wd(5,2,sc) | left(wd(5,2,sc)) | nobase(oct(sc)) | oct(sc) | nonl;
+        sout | nobase(hex(sc)) | hex(sc) | wd(8,hex(sc)) | wd(8,10,hex(sc)) | upcase(wd(8,3,hex(sc))) | nonl;
+        sout | left(sign(upcase(wd(8,3,sc)))) | pad0(wd(8,sc));
         sout | "unsigned char";
         unsigned char usc = 12;
         printf( "%hhu %2hhu %5.2hhu %-5.2hhu %hho %#hho %hhx %#hhx %#8hhx %#8.10hhx %#8.3hhX %-8.3hhu %08hhu\n", usc, usc, usc, usc, usc, usc, usc, usc, usc, usc, usc, usc, usc );
+        sout | usc | wd(2,usc) | wd(5,2,usc) | left(wd(5,2,usc)) | nobase(oct(usc)) | oct(usc) | nobase(hex(usc)) | hex(usc) | wd(8,hex(usc)) | wd(8,10,hex(usc)) | upcase(wd(8,3,hex(usc))) | left(upcase(wd(8,3,usc))) | pad0(wd(8,usc));
+        sout | usc | wd(2,usc) | wd(5,2,usc) | left(wd(5,2,usc)) | nobase(oct(usc)) | oct(usc) | nonl;
+        sout | nobase(hex(usc)) | hex(usc) | wd(8,hex(usc)) | wd(8,10,hex(usc)) | upcase(wd(8,3,hex(usc))) | nonl;
+        sout | left(upcase(wd(8,3,usc))) | pad0(wd(8,usc));
         sout | "signed short int";
         signed short int si = -12;
         printf( "%hd %2hd %5.2hd %-5.2hd %ho %#ho %hx %#hx %#8hx %#8.10hx %#8.3hX %+-8.3hd %08hd\n", si, si, si, si, si, si, si, si, si, si, si, si, si );
+        sout | si | wd(2,si) | wd(5,2,si) | left(wd(5,2,si)) | nobase(oct(si)) | oct(si) | nobase(hex(si)) | hex(si) | wd(8,hex(si)) | wd(8,10,hex(si)) | upcase(wd(8,3,hex(si))) | left(sign(upcase(wd(8,3,si)))) | pad0(wd(8,si));
+        sout | si | wd(2,si) | wd(5,2,si) | left(wd(5,2,si)) | nobase(oct(si)) | oct(si) | nonl;
+        sout | nobase(hex(si)) | hex(si) | wd(8,hex(si)) | wd(8,10,hex(si)) | upcase(wd(8,3,hex(si))) | nonl;
+        sout | left(sign(upcase(wd(8,3,si)))) | pad0(wd(8,si));
         sout | "unsigned short int";
         unsigned short int usi = 12;
         printf( "%hu %2hu %5.2hu %-5.2hu %ho %#ho %hx %#hx %#8hx %#8.10hx %#8.3hX %-8.3hu %08hu\n", usi, usi, usi, usi, usi, usi, usi, usi, usi, usi, usi, usi, usi );
+        sout | usi | wd(2,usi) | wd(5,2,usi) | left(wd(5,2,usi)) | nobase(oct(usi)) | oct(usi) | nobase(hex(usi)) | hex(usi) | wd(8,hex(usi)) | wd(8,10,hex(usi)) | upcase(wd(8,3,hex(usi))) | left(upcase(wd(8,3,usi))) | pad0(wd(8,usi));
+        sout | usi | wd(2,usi) | wd(5,2,usi) | left(wd(5,2,usi)) | nobase(oct(usi)) | oct(usi) | nonl;
+        sout | nobase(hex(usi)) | hex(usi) | wd(8,hex(usi)) | wd(8,10,hex(usi)) | upcase(wd(8,3,hex(usi))) | nonl;
+        sout | left(upcase(wd(8,3,usi))) | pad0(wd(8,usi));
         sout | "signed int";
         signed int i = -12;
         printf( "%d %2d %5.2d %-5.2d %o %#o %x %#x %#8x %#8.10x %#8.3X %+-8.3d %08d\n", i, i, i, i, i, i, i, i, i, i, i, i, i );
+        sout | i | wd(2,i) | wd(5,2,i) | left(wd(5,2,i)) | nobase(oct(i)) | oct(i) | nobase(hex(i)) | hex(i) | wd(8,hex(i)) | wd(8,10,hex(i)) | upcase(wd(8,3,hex(i))) | left(sign(upcase(wd(8,3,i)))) | pad0(wd(8,i));
+        sout | i | wd(2,i) | wd(5,2,i) | left(wd(5,2,i)) | nobase(oct(i)) | oct(i) | nonl;
+        sout | nobase(hex(i)) | hex(i) | wd(8,hex(i)) | wd(8,10,hex(i)) | upcase(wd(8,3,hex(i))) | nonl;
+        sout | left(sign(upcase(wd(8,3,i)))) | pad0(wd(8,i));
         sout | "unsigned int";
         unsigned int ui = 12;
         printf( "%u %2u %5.2u %-5.2u %o %#o %x %#x %#8x %#8.10x %#8.3X %-8.3u %08u\n", ui, ui, ui, ui, ui, ui, ui, ui, ui, ui, ui, ui, ui );
+        sout | ui | wd(2,ui) | wd(5,2,ui) | left(wd(5,2,ui)) | nobase(oct(ui)) | oct(ui) | nobase(hex(ui)) | hex(ui) | wd(8,hex(ui)) | wd(8,10,hex(ui)) | upcase(wd(8,3,hex(ui))) | left(upcase(wd(8,3,ui))) | pad0(wd(8,ui));
+        sout | ui | wd(2,ui) | wd(5,2,ui) | left(wd(5,2,ui)) | nobase(oct(ui)) | oct(ui) | nonl;
+        sout | nobase(hex(ui)) | hex(ui) | wd(8,hex(ui)) | wd(8,10,hex(ui)) | upcase(wd(8,3,hex(ui))) | nonl;
+        sout | left(upcase(wd(8,3,ui))) | pad0(wd(8,ui));
         sout | "signed long long int";
         signed long long int lli = -12;
         printf( "%lld %2lld %5.2lld %-5.2lld %llo %#llo %llx %#llx %#8llx %#8.10llx %#8.3llX %+-8.3lld %08lld\n", lli, lli, lli, lli, lli, lli, lli, lli, lli, lli, lli, lli, lli );
+        sout | lli | wd(2,lli) | wd(5,2,lli) | left(wd(5,2,lli)) | nobase(oct(lli)) | oct(lli) | nobase(hex(lli)) | hex(lli) | wd(8,hex(lli)) | wd(8,10,hex(lli)) | upcase(wd(8,3,hex(lli))) | left(sign(upcase(wd(8,3,lli)))) | pad0(wd(8,lli));
+        sout | lli | wd(2,lli) | wd(5,2,lli) | left(wd(5,2,lli)) | nobase(oct(lli)) | oct(lli) | nonl;
+        sout | nobase(hex(lli)) | hex(lli) | wd(8,hex(lli)) | wd(8,10,hex(lli)) | upcase(wd(8,3,hex(lli))) | nonl;
+        sout | left(sign(upcase(wd(8,3,lli)))) | pad0(wd(8,lli));
         sout | "unsigned long long int";
         unsigned long long int ulli = 12;
         printf( "%llu %2llu %5.2llu %-5.2llu %llo %#llo %llx %#llx %#8llx %#8.10llx %#8.3llX %-8.3llu %08llu\n", ulli, ulli, ulli, ulli, ulli, ulli, ulli, ulli, ulli, ulli, ulli, ulli, ulli );
+        sout | ulli | wd(2,ulli) | wd(5,2,ulli) | left(wd(5,2,ulli)) | nobase(oct(ulli)) | oct(ulli) | nobase(hex(ulli)) | hex(ulli) | wd(8,hex(ulli)) | wd(8,10,hex(ulli)) | upcase(wd(8,3,hex(ulli))) | left(upcase(wd(8,3,ulli))) | pad0(wd(8,ulli));
+        sout | ulli | wd(2,ulli) | wd(5,2,ulli) | left(wd(5,2,ulli)) | nobase(oct(ulli)) | oct(ulli) | nonl;
+        sout | nobase(hex(ulli)) | hex(ulli) | wd(8,hex(ulli)) | wd(8,10,hex(ulli)) | upcase(wd(8,3,hex(ulli))) | nonl;
+        sout | left(upcase(wd(8,3,ulli))) | pad0(wd(8,ulli));
         sout | nl | "binary integral";
+        sout | bin(0) | bin(13) | upcase(bin(13)) | nobase(bin(13)) | left(wd(8,bin(13))) | wd(8,bin(13)) | pad0(left(wd(8,bin(13)))) | pad0(wd(8,bin(13))) | pad0(wd(8,10,bin(13))) | pad0(wd(8,6,bin(13)));
+        sout | bin(0) | bin(13) | upcase(bin(13)) | nobase(bin(13)) | left(wd(8,bin(13))) | wd(8,bin(13)) | nonl;
+        sout | pad0(left(wd(8,bin(13)))) | pad0(wd(8,bin(13))) | pad0(wd(8,10,bin(13))) | pad0(wd(8,6,bin(13)));
 …
         printf( "%g  %8g %#8g %g %8g %8.0g %#8.0g %8.2g %#8.2g %-8.2g %-8.2g %-#8.2g %-+8.2g %-+#8.2g %08.2g %8.2E %8.2a %#8.2A %#8.2e\n",
 .0,3.0F,3.0F, f,  f,    f,     f,    f,     f,  3.0F,      f,      f,      f,       f,     f,    f,    f,     f,     f );
+        sout | 0.0 | wd(8, 3.0F) | nodp(wd(8, 3.0F)) | f | wd(8, f) | ws(8,0, f) | nodp(ws(8,0, f)) | ws(8,2, f) | nodp(ws(8,2, f)) | left(ws(8,2, 3.0F)) | left(ws(8,2, f)) | left(nodp(ws(8,2, f))) | left(sign(ws(8,2, f))) | left(sign(nodp(ws(8,2, f)))) | pad0(ws(8,2, f)) | upcase(wd(8,2, sci(f))) | wd(8,2, hex(f)) | upcase(wd(8,2, hex(f))) | nodp(wd(8,2, sci(f)));
+        sout | 0.0 | wd(8, 3.0F) | nodp(wd(8, 3.0F)) | f | wd(8, f) | ws(8,0, f) | nodp(ws(8,0, f)) | ws(8,2, f) | nodp(ws(8,2, f)) | nonl;
+        sout | left(ws(8,2, 3.0F)) | left(ws(8,2, f)) | left(nodp(ws(8,2, f))) | left(sign(ws(8,2, f))) | left(sign(nodp(ws(8,2, f)))) | nonl;
+        sout | pad0(ws(8,2, f)) | upcase(wd(8,2, sci(f))) | wd(8,2, hex(f)) | upcase(wd(8,2, hex(f))) | nodp(wd(8,2, sci(f)));
         sout | "double";
 …
         printf( "%g  %#8f %g %8f %#8.0f %8.0f %8.2f %-8.2f %-+#8.2f %08.2F %8.2E %8.2a %8.2A %8.2e\n",
 .0,  3.0, d,  d,     d,    d,    d,     d,       d,     d,    d,    d,    d,    d );
+        sout | 0.0 | wd(8, 3.0) | d | wd(8, d) | nodp(wd(8,0, d)) | wd(8,0, d) | wd(8,2, d) | left(wd(8,2, d)) | left(sign(wd(8,2, d))) | pad0(upcase(wd(8,2, d))) | upcase(wd(8,2, sci(d))) | wd(8,2, hex(d)) | upcase(wd(8,2, hex(d))) | wd(8,2, sci(d));
+        sout | 0.0 | wd(8, 3.0) | d | wd(8, d) | nodp(wd(8,0, d)) | wd(8,0, d) | wd(8,2, d) | nonl;
+        sout | left(wd(8,2, d)) | left(sign(wd(8,2, d))) | pad0(upcase(wd(8,2, d))) | upcase(wd(8,2, sci(d))) | wd(8,2, hex(d)) | upcase(wd(8,2, hex(d))) | wd(8,2, sci(d));
         sout | "long double";
 …
         printf( "%Lg  %#8Lf %Lg %8Lf %#8.0Lf %8.0Lf %8.2Lf %-8.2Lf %-+#8.2Lf %08.2LF %8.2LE %8.2La %8.2LA %8.2Le\n",
 .0L,  3.0L, ld,  ld,     ld,    ld,    ld,     ld,       ld,     ld,    ld,    ld,    ld,    ld );
+        sout | 0.0L | wd(8, 3.0L) | ld | wd(8, ld) | nodp(wd(8,0, ld)) | wd(8,0, ld) | wd(8,2, ld) | left(wd(8,2, ld)) | left(sign(wd(8,2, ld))) | pad0(upcase(wd(8,2, ld))) | upcase(wd(8,2, sci(ld))) | wd(8,2, hex(ld)) | upcase(wd(8,2, hex(ld))) | wd(8,2, sci(ld));
+        sout | 0.0L | wd(8, 3.0L) | ld | wd(8, ld) | nodp(wd(8,0, ld)) | wd(8,0, ld) | wd(8,2, ld) | nonl;
+        sout | left(wd(8,2, ld)) | left(sign(wd(8,2, ld))) | pad0(upcase(wd(8,2, ld))) | upcase(wd(8,2, sci(ld))) | wd(8,2, hex(ld)) | upcase(wd(8,2, hex(ld))) | wd(8,2, sci(ld));
 …
         char c = 'a';
         printf( "%c %2c %5c %-5c %hho %#hho %hhx %#hhx %#8hhx %#8hhX %-8c %8c\n", c, c, c, c, c, c, c, c, c, c, c, c );
+        sout | c | ' ' | wd(2,c) | wd(5,c) | left(wd(5,c)) | nobase(oct(c)) | oct(c) | nobase(hex(c)) | hex(c) | wd(8,hex(c)) | upcase(wd(8,hex(c))) | left(wd(8,c)) | wd(8,c);
+        sout | c | ' ' | wd(2,c) | wd(5,c) | left(wd(5,c)) | nobase(oct(c)) | oct(c) | nonl;
+        sout | nobase(hex(c)) | hex(c) | wd(8,hex(c)) | upcase(wd(8,hex(c))) | left(wd(8,c)) | wd(8,c);
         sout | nl | "string";

tests/manipulatorsOutput2.cfa

-              r3c64c668
+              r58fe85a
 // Created On       : Sat Jun  8 18:04:11 2019
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Mon Jun 10 12:37:57 2019
 // Update Count     : 8
+// Last Modified On : Sun Nov 15 08:11:53 2020
+// Update Count     : 9
 //
 …
 // Local Variables: //
 // tab-width: 4 //
 // compile-command: "cfa -Wall -Wextra amanipulatorsOutput2.cfa" //
+// compile-command: "cfa -Wall -Wextra manipulatorsOutput2.cfa" //
 // End: //

tests/math4.cfa

-              r3c64c668
+              r58fe85a
 // Created On       : Thu May 24 20:56:54 2018
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Tue Dec  4 18:15:01 2018
 // Update Count     : 4
+// Last Modified On : Tue Aug 25 17:56:45 2020
+// Update Count     : 7
 //
 …
 int main( void ) {
+        signed char sc, scr1, scr2, scr3;
+        unsigned char uc, ucr1, ucr2, ucr3;
+        short int si, sir1, sir2, sir3;
+        unsigned short int usi, usir1, usir2, usir3;
+        int i, ir1, ir2, ir3;
+        unsigned int ui, uir1, uir2, uir3;
+        long int li, lir1, lir2, lir3;
+        unsigned long int uli, ulir1, ulir2, ulir3;
+        long long int lli, llir1, llir2, llir3;
+        unsigned long long int ulli, ullir1, ullir2, ullir3;
         float f;
         double d;
 …
         //---------------------- Nearest Integer ----------------------
+        //============================================================
+#if 1
+        sout | nl | "floor" | nl | nl;
+        printf( "signed char\n" );
+        for ( sc = 1; sc != 0; sc <<= 1 ) {
+                scr1 = floor( sc, sc ); scr2 = floor( sc + 2hh, sc ); scr3 = floor( -sc - 2hh, sc );
+                printf( "floor(%hhd, %hhd) = %hhd, floor(%hhd, %hhd) = %hhd, floor(%hhd, %hhd) = %hhd\n", sc, sc, scr1, sc + 2hh, sc, scr2, -sc - 2hh, sc, scr3 );
+        } // for
+        printf( "\n" );
+        printf( "unsigned char\n" );
+        for ( uc = 1; uc != 0; uc <<= 1 ) {
+                ucr1 = floor( uc, uc ); ucr2 = floor( uc + 2hh, uc ); ucr3 = floor( -uc - 2hh, uc );
+                printf( "floor(%hhu, %hhu) = %hhu, floor(%hhu, %hhu) = %hhu, floor(%hhu, %hhu) = %hhu\n", uc, uc, ucr1, uc + 2uhh, uc, ucr2, -uc - 2uhh, uc, ucr3 );
+        } // for
+        printf( "\n" );
+        printf( "short int\n" );
+        for ( si = 1; si != 0; si <<= 1 ) {
+                sir1 = floor( si, si ); sir2 = floor( si + 2hh, si ); sir3 = floor( -si - 2hh, si );
+                printf( "floor(%hd, %hd) = %hd, floor(%hd, %hd) = %hd, floor(%hd, %hd) = %hd\n", si, si, sir1, si + 2h, si, sir2, -si - 2h, si, sir3 );
+        } // for
+        printf( "\n" );
+        printf( "unsigned short int\n" );
+        for ( usi = 1; usi != 0; usi <<= 1 ) {
+                usir1 = floor( usi, usi ); usir2 = floor( usi + 2hh, usi ); usir3 = floor( -usi - 2hh, usi );
+                printf( "floor(%hu, %hu) = %hu, floor(%hu, %hu) = %hu, floor(%hu, %hu) = %hu\n", usi, usi, usir1, usi + 2uh, usi, usir2, -usi - 2uh, usi, usir3 );
+        } // for
+        printf( "\n" );
+        printf( "int\n" );
+        for ( i = 1; i != 0; i <<= 1 ) {
+                ir1 = floor( i, i ); ir2 = floor( i + 2hh, i ); ir3 = floor( -i - 2hh, i );
+                printf( "floor(%d, %d) = %d, floor(%d, %d) = %d, floor(%d, %d) = %d\n", i, i, ir1, i + 2h, i, ir2, -i - 2h, i, ir3 );
+        } // for
+        printf( "\n" );
+        printf( "unsigned int\n" );
+        for ( ui = 1; ui != 0; ui <<= 1 ) {
+                uir1 = floor( ui, ui ); uir2 = floor( ui + 2hh, ui ); uir3 = floor( -ui - 2hh, ui );
+                printf( "floor(%u, %u) = %u, floor(%u, %u) = %u, floor(%u, %u) = %u\n", ui, ui, uir1, ui + 2h, ui, uir2, -ui - 2h, ui, uir3 );
+        } // for
+        printf( "\n" );
+        printf( "long int\n" );
+        for ( li = 1; li != 0; li <<= 1 ) {
+                lir1 = floor( li, li ); lir2 = floor( li + 2hh, li ); lir3 = floor( -li - 2hh, li );
+                printf( "floor(%ld, %ld) = %ld, floor(%ld, %ld) = %ld, floor(%ld, %ld) = %ld\n", li, li, lir1, li + 2h, li, lir2, -li - 2h, li, lir3 );
+        } // for
+        printf( "\n" );
+        printf( "unsigned long int\n" );
+        for ( uli = 1; uli != 0; uli <<= 1 ) {
+                ulir1 = floor( uli, uli ); ulir2 = floor( uli + 2hh, uli ); ulir3 = floor( -uli - 2hh, uli );
+                printf( "floor(%lu, %lu) = %lu, floor(%lu, %lu) = %lu, floor(%lu, %lu) = %lu\n", uli, uli, ulir1, uli + 2h, uli, ulir2, -uli - 2h, uli, ulir3 );
+        } // for
+        printf( "\n" );
+        printf( "long long int\n" );
+        for ( lli = 1; lli != 0; lli <<= 1 ) {
+                llir1 = floor( lli, lli ); llir2 = floor( lli + 2hh, lli ); llir3 = floor( -lli - 2hh, lli );
+                printf( "floor(%lld, %lld) = %lld, floor(%lld, %lld) = %lld, floor(%lld, %lld) = %lld\n", lli, lli, llir1, lli + 2h, lli, llir2, -lli - 2h, lli, llir3 );
+        } // for
+        printf( "\n" );
+        printf( "unsigned long long int\n" );
+        for ( ulli = 1; ulli != 0; ulli <<= 1 ) {
+                ullir1 = floor( ulli, ulli ); ullir2 = floor( ulli + 2hh, ulli ); ullir3 = floor( -ulli - 2hh, ulli );
+                printf( "floor(%llu, %llu) = %llu, floor(%llu, %llu) = %llu, floor(%llu, %llu) = %llu\n", ulli, ulli, ullir1, ulli + 2h, ulli, ullir2, -ulli - 2h, ulli, ullir3 );
+        } // for
+        printf( "\n" );
+#endif // 0
+        //============================================================
+#if 1
+        sout | nl | "ceiling_div" | nl | nl;
+        printf( "signed char\n" );
+        for ( sc = 1; sc != 0; sc <<= 1 ) {
+                scr1 = ceiling_div( sc, sc ); scr2 = ceiling_div( sc + 2hh, sc ); scr3 = ceiling_div( -sc - 2hh, sc );
+                printf( "ceiling_div(%hhd, %hhd) = %hhd, ceiling_div(%hhd, %hhd) = %hhd, ceiling_div(%hhd, %hhd) = %hhd\n", sc, sc, scr1, sc + 2hh, sc, scr2, -sc - 2hh, sc, scr3 );
+        } // for
+        printf( "\n" );
+        printf( "unsigned char\n" );
+        for ( uc = 1; uc != 0; uc <<= 1 ) {
+                ucr1 = ceiling_div( uc, uc ); ucr2 = ceiling_div( uc + 2hh, uc ); ucr3 = ceiling_div( -uc - 2hh, uc );
+                printf( "ceiling_div(%hhu, %hhu) = %hhu, ceiling_div(%hhu, %hhu) = %hhu, ceiling_div(%hhu, %hhu) = %hhu\n", uc, uc, ucr1, uc + 2uhh, uc, ucr2, -uc - 2uhh, uc, ucr3 );
+        } // for
+        printf( "\n" );
+        printf( "short int\n" );
+        for ( si = 1; si != 0; si <<= 1 ) {
+                sir1 = ceiling_div( si, si ); sir2 = ceiling_div( si + 2hh, si ); sir3 = ceiling_div( -si - 2hh, si );
+                printf( "ceiling_div(%hd, %hd) = %hd, ceiling_div(%hd, %hd) = %hd, ceiling_div(%hd, %hd) = %hd\n", si, si, sir1, si + 2h, si, sir2, -si - 2h, si, sir3 );
+        } // for
+        printf( "\n" );
+        printf( "unsigned short int\n" );
+        for ( usi = 1; usi != 0; usi <<= 1 ) {
+                usir1 = ceiling_div( usi, usi ); usir2 = ceiling_div( usi + 2hh, usi ); usir3 = ceiling_div( -usi - 2hh, usi );
+                printf( "ceiling_div(%hu, %hu) = %hu, ceiling_div(%hu, %hu) = %hu, ceiling_div(%hu, %hu) = %hu\n", usi, usi, usir1, usi + 2uh, usi, usir2, -usi - 2uh, usi, usir3 );
+        } // for
+        printf( "\n" );
+        printf( "int\n" );
+        for ( i = 1; i != 0; i <<= 1 ) {
+                ir1 = ceiling_div( i, i ); ir2 = ceiling_div( i + 2hh, i ); ir3 = ceiling_div( -i - 2hh, i );
+                printf( "ceiling_div(%d, %d) = %d, ceiling_div(%d, %d) = %d, ceiling_div(%d, %d) = %d\n", i, i, ir1, i + 2h, i, ir2, -i - 2h, i, ir3 );
+        } // for
+        printf( "\n" );
+        printf( "unsigned int\n" );
+        for ( ui = 1; ui != 0; ui <<= 1 ) {
+                uir1 = ceiling_div( ui, ui ); uir2 = ceiling_div( ui + 2hh, ui ); uir3 = ceiling_div( -ui - 2hh, ui );
+                printf( "ceiling_div(%u, %u) = %u, ceiling_div(%u, %u) = %u, ceiling_div(%u, %u) = %u\n", ui, ui, uir1, ui + 2h, ui, uir2, -ui - 2h, ui, uir3 );
+        } // for
+        printf( "\n" );
+        printf( "long int\n" );
+        for ( li = 1; li != 0; li <<= 1 ) {
+                lir1 = ceiling_div( li, li ); lir2 = ceiling_div( li + 2hh, li ); lir3 = ceiling_div( -li - 2hh, li );
+                printf( "ceiling_div(%ld, %ld) = %ld, ceiling_div(%ld, %ld) = %ld, ceiling_div(%ld, %ld) = %ld\n", li, li, lir1, li + 2h, li, lir2, -li - 2h, li, lir3 );
+        } // for
+        printf( "\n" );
+        printf( "unsigned long int\n" );
+        for ( uli = 1; uli != 0; uli <<= 1 ) {
+                ulir1 = ceiling_div( uli, uli ); ulir2 = ceiling_div( uli + 2hh, uli ); ulir3 = ceiling_div( -uli - 2hh, uli );
+                printf( "ceiling_div(%lu, %lu) = %lu, ceiling_div(%lu, %lu) = %lu, ceiling_div(%lu, %lu) = %lu\n", uli, uli, ulir1, uli + 2h, uli, ulir2, -uli - 2h, uli, ulir3 );
+        } // for
+        printf( "\n" );
+        printf( "long long int\n" );
+        for ( lli = 1; lli != 0; lli <<= 1 ) {
+                llir1 = ceiling_div( lli, lli ); llir2 = ceiling_div( lli + 2hh, lli ); llir3 = ceiling_div( -lli - 2hh, lli );
+                printf( "ceiling_div(%lld, %lld) = %lld, ceiling_div(%lld, %lld) = %lld, ceiling_div(%lld, %lld) = %lld\n", lli, lli, llir1, lli + 2h, lli, llir2, -lli - 2h, lli, llir3 );
+        } // for
+        printf( "\n" );
+        printf( "unsigned long long int\n" );
+        for ( ulli = 1; ulli != 0; ulli <<= 1 ) {
+                ullir1 = ceiling_div( ulli, ulli ); ullir2 = ceiling_div( ulli + 2hh, ulli ); ullir3 = ceiling_div( -ulli - 2hh, ulli );
+                printf( "ceiling_div(%llu, %llu) = %llu, ceiling_div(%llu, %llu) = %llu, ceiling_div(%llu, %llu) = %llu\n", ulli, ulli, ullir1, ulli + 2h, ulli, ullir2, -ulli - 2h, ulli, ullir3 );
+        } // for
+        printf( "\n" );
+#endif // 0
+        //============================================================
+#if 1
+        sout | nl | "ceiling" | nl | nl;
+        printf( "signed char\n" );
+        for ( sc = 1; sc != 0; sc <<= 1 ) {
+                scr1 = ceiling( sc, sc ); scr2 = ceiling( sc + 2hh, sc ); scr3 = ceiling( -sc - 2hh, sc );
+                printf( "ceiling(%hhd, %hhd) = %hhd, ceiling(%hhd, %hhd) = %hhd, ceiling(%hhd, %hhd) = %hhd\n", sc, sc, scr1, sc + 2hh, sc, scr2, -sc - 2hh, sc, scr3 );
+        } // for
+        printf( "\n" );
+        printf( "unsigned char\n" );
+        for ( uc = 1; uc != 0; uc <<= 1 ) {
+                ucr1 = ceiling( uc, uc ); ucr2 = ceiling( uc + 2hh, uc ); ucr3 = ceiling( -uc - 2hh, uc );
+                printf( "ceiling(%hhu, %hhu) = %hhu, ceiling(%hhu, %hhu) = %hhu, ceiling(%hhu, %hhu) = %hhu\n", uc, uc, ucr1, uc + 2uhh, uc, ucr2, -uc - 2uhh, uc, ucr3 );
+        } // for
+        printf( "\n" );
+        printf( "short int\n" );
+        for ( si = 1; si != 0; si <<= 1 ) {
+                sir1 = ceiling( si, si ); sir2 = ceiling( si + 2hh, si ); sir3 = ceiling( -si - 2hh, si );
+                printf( "ceiling(%hd, %hd) = %hd, ceiling(%hd, %hd) = %hd, ceiling(%hd, %hd) = %hd\n", si, si, sir1, si + 2h, si, sir2, -si - 2h, si, sir3 );
+        } // for
+        printf( "\n" );
+        printf( "unsigned short int\n" );
+        for ( usi = 1; usi != 0; usi <<= 1 ) {
+                usir1 = ceiling( usi, usi ); usir2 = ceiling( usi + 2hh, usi ); usir3 = ceiling( -usi - 2hh, usi );
+                printf( "ceiling(%hu, %hu) = %hu, ceiling(%hu, %hu) = %hu, ceiling(%hu, %hu) = %hu\n", usi, usi, usir1, usi + 2uh, usi, usir2, -usi - 2uh, usi, usir3 );
+        } // for
+        printf( "\n" );
+        printf( "int\n" );
+        for ( i = 1; i != 0; i <<= 1 ) {
+                ir1 = ceiling( i, i ); ir2 = ceiling( i + 2hh, i ); ir3 = ceiling( -i - 2hh, i );
+                printf( "ceiling(%d, %d) = %d, ceiling(%d, %d) = %d, ceiling(%d, %d) = %d\n", i, i, ir1, i + 2h, i, ir2, -i - 2h, i, ir3 );
+        } // for
+        printf( "\n" );
+        printf( "unsigned int\n" );
+        for ( ui = 1; ui != 0; ui <<= 1 ) {
+                uir1 = ceiling( ui, ui ); uir2 = ceiling( ui + 2hh, ui ); uir3 = ceiling( -ui - 2hh, ui );
+                printf( "ceiling(%u, %u) = %u, ceiling(%u, %u) = %u, ceiling(%u, %u) = %u\n", ui, ui, uir1, ui + 2h, ui, uir2, -ui - 2h, ui, uir3 );
+        } // for
+        printf( "\n" );
+        printf( "long int\n" );
+        for ( li = 1; li != 0; li <<= 1 ) {
+                lir1 = ceiling( li, li ); lir2 = ceiling( li + 2hh, li ); lir3 = ceiling( -li - 2hh, li );
+                printf( "ceiling(%ld, %ld) = %ld, ceiling(%ld, %ld) = %ld, ceiling(%ld, %ld) = %ld\n", li, li, lir1, li + 2h, li, lir2, -li - 2h, li, lir3 );
+        } // for
+        printf( "\n" );
+        printf( "unsigned long int\n" );
+        for ( uli = 1; uli != 0; uli <<= 1 ) {
+                ulir1 = ceiling( uli, uli ); ulir2 = ceiling( uli + 2hh, uli ); ulir3 = ceiling( -uli - 2hh, uli );
+                printf( "ceiling(%lu, %lu) = %lu, ceiling(%lu, %lu) = %lu, ceiling(%lu, %lu) = %lu\n", uli, uli, ulir1, uli + 2h, uli, ulir2, -uli - 2h, uli, ulir3 );
+        } // for
+        printf( "\n" );
+        printf( "long long int\n" );
+        for ( lli = 1; lli != 0; lli <<= 1 ) {
+                llir1 = ceiling( lli, lli ); llir2 = ceiling( lli + 2hh, lli ); llir3 = ceiling( -lli - 2hh, lli );
+                printf( "ceiling(%lld, %lld) = %lld, ceiling(%lld, %lld) = %lld, ceiling(%lld, %lld) = %lld\n", lli, lli, llir1, lli + 2h, lli, llir2, -lli - 2h, lli, llir3 );
+        } // for
+        printf( "\n" );
+        printf( "unsigned long long int\n" );
+        for ( ulli = 1; ulli != 0; ulli <<= 1 ) {
+                ullir1 = ceiling( ulli, ulli ); ullir2 = ceiling( ulli + 2hh, ulli ); ullir3 = ceiling( -ulli - 2hh, ulli );
+                printf( "ceiling(%llu, %llu) = %llu, ceiling(%llu, %llu) = %llu, ceiling(%llu, %llu) = %llu\n", ulli, ulli, ullir1, ulli + 2h, ulli, ullir2, -ulli - 2h, ulli, ullir3 );
+        } // for
+        printf( "\n" );
+#endif // 0
         sout | "floor:" | floor( 1.2F ) | floor( 1.2D ) | floor( 1.2L );
 …
 // Local Variables: //
 // tab-width: 4 //
 // compile-command: "cfa math3.cfa" //
+// compile-command: "cfa math4.cfa" //
 // End: //

tests/maybe.cfa

-              r3c64c668
+              r58fe85a
 // Created On       : Thr May 25 16:02:00 2017
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Thu Jul 20 15:24:07 2017
 // Update Count     : 1
+// Last Modified On : Fri Sep 25 15:13:28 2020
+// Update Count     : 2
 //
 …
         //checkNamedConstructors();
         checkSetters();
+        printf( "done\n" );                             // non-empty .expect file
+}

tests/minmax.cfa

-              r3c64c668
+              r58fe85a
 // Created On       : Wed May 27 17:56:53 2015
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Tue Dec  4 21:45:31 2018
 // Update Count     : 52
+// Last Modified On : Sat Aug 15 08:28:01 2020
+// Update Count     : 54
 //
 …
         sout | "char\t\t\t"                                     | 'z' | ' ' | 'a' | "\tmin " | min( 'z', 'a' );
         sout | "signed int\t\t"                         | 4 | 3 | "\tmin" | min( 4, 3 );
+        sout | "signed int\t\t"                         | 4 | -3 | "\tmin" | min( 4, -3 );
         sout | "unsigned int\t\t"                       | 4u | 3u | "\tmin" | min( 4u, 3u );
         sout | "signed long int\t\t"            | 4l | 3l | "\tmin" | min( 4l, 3l );
+        sout | "signed long int\t\t"            | 4l | -3l | "\tmin" | min( 4l, -3l );
         sout | "unsigned long int\t"            | 4ul | 3ul | "\tmin" | min( 4ul, 3ul );
         sout | "signed long long int\t"         | 4ll | 3ll | "\tmin" | min( 4ll, 3ll );
+        sout | "signed long long int\t"         | 4ll | -3ll | "\tmin" | min( 4ll, -3ll );
         sout | "unsigned long long int\t"       | 4ull | 3ull | "\tmin" | min( 4ull, 3ull );
         sout | "float\t\t\t"                            | 4.0f | 3.1f | "\tmin" | min( 4.0f, 3.1f );
 …
         sout | "char\t\t\t"                                     | 'z' | ' ' | 'a' | "\tmax " | max( 'z', 'a' );
         sout | "signed int\t\t"                         | 4 | 3 | "\tmax" | max( 4, 3 );
+        sout | "signed int\t\t"                         | 4 | -3 | "\tmax" | max( 4, -3 );
         sout | "unsigned int\t\t"                       | 4u | 3u | "\tmax" | max( 4u, 3u );
         sout | "signed long int\t\t"            | 4l | 3l | "\tmax" | max( 4l, 3l );
+        sout | "signed long int\t\t"            | 4l | -3l | "\tmax" | max( 4l, -3l );
         sout | "unsigned long int\t"            | 4ul | 3ul | "\tmax" | max( 4ul, 3ul );
         sout | "signed long long int\t"         | 4ll | 3ll | "\tmax" | max( 4ll, 3ll );
+        sout | "signed long long int\t"         | 4ll | -3ll | "\tmax" | max( 4ll, -3ll );
         sout | "unsigned long long int\t"       | 4ull | 3ull | "\tmax" | max( 4ull, 3ull );
         sout | "float\t\t\t"                            | 4.0f | 3.1f | "\tmax" | max( 4.0f, 3.1f );

tests/nested-types.cfa

-              r3c64c668
+              r58fe85a
 // Created On       : Mon Jul 9 10:20:03 2018
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Wed Feb 12 18:21:15 2020
 // Update Count     : 3
+// Last Modified On : Sun Sep 27 08:48:59 2020
+// Update Count     : 6
 //
 typedef int N;
 struct A {
   forall(otype T)
   struct N {
     T x;
   };
+        forall(otype T)
+        struct N {
+                T x;
+        };
 };
 struct S {
   struct T {
     int i;
     typedef int Bar;
   };
   T x;
+        struct T {
+                int i;
+                typedef int Bar;
+        };
+        T x;
   // struct U;
   typedef T Bar;
   typedef int Baz;
+        // struct U;
+        typedef T Bar;
+        typedef int Baz;
 };
 …
 int main() {
   // access nested struct
   S.T x;
+        // access nested struct
+        S.T x;
+  {
     struct S {
       int i;
       struct Z {
         double d;
       };
     };
+        {
+                struct S {
+                  int i;
+                  struct Z {
+                    double d;
+                  };
+                };
     S.Z z;   // gets local S
     .S.T y;  // lookup at global scope only
+                S.Z z;                                                                                  // gets local S
+                .S.T y;                                                                                 // lookup at global scope only
     const volatile .S.T q;
+                const volatile .S.T q;
 #if ERR1
     T err1;           // error: no T in scope
+                T err1;                                                                                 // error: no T in scope
 #endif
 #if ERR2
     .Z err2;          // error: no Z in global scope
     .S.Baz.Bar err3;  // error: .S.Baz => int, int is not aggregate and should not appear left of the dot
     .S.Z err4;        // error: no Z in global S
+                .Z err2;                                                                                // error: no Z in global scope
+                .S.Baz.Bar err3;                                                                // error: .S.Baz => int, int is not aggregate and should not appear left of the dot
+                .S.Z err4;                                                                              // error: no Z in global S
 #endif
+  }
+        }
   // U.S un;
+        // U.S un;
   S.Bar y;
   S.Baz x;
   S.T.Bar z;
+        S.Bar y;
+        S.Baz x;
+        S.T.Bar z;
+  // A.N(int) x;  // xxx - should not be an error, but currently is.
+        // A.N(int) x;  // xxx - should not be an error, but currently is.
+        #pragma message( "Compiled" )                   // force non-empty .expect file
+}

tests/numericConstants.cfa

-              r3c64c668
+              r58fe85a
 // Created On       : Wed May 24 22:10:36 2017
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Tue Feb  5 08:58:16 2019
 // Update Count     : 5
+// Last Modified On : Sun Sep 27 07:55:22 2020
+// Update Count     : 7
 //
 …
 x_ff.ffp0;                                     // hex real
 x_1.ffff_ffff_p_128_l;
+        #pragma message( "Compiled" )   // force non-empty .expect file
 } // main

tests/operators.cfa

r3c64c668	r58fe85a
31	31	int main(int argc, char const *argv[]) {
32	32	/* code */
33		~~return 0;~~
	33	printf( "done\n" ); // non-empty .expect file
34	34	}
35	35

tests/pybin/settings.py

-              r3c64c668
+              r58fe85a
 class Architecture:
         KnownArchitectures = {
+                'x64'           : 'x64',
+                'x86-64'        : 'x64',
+                'x86_64'        : 'x64',
+                'x86'           : 'x86',
+                'aarch64'       : 'arm',
+                'i386'          : 'x86',
+                'i486'          : 'x86',
+                'i686'          : 'x86',
+                'Intel 80386'   : 'x86',
+                'arm'           : 'arm',
+                'ARM'           : 'arm',
+                'x64'         : 'x64',
+                'x86-64'      : 'x64',
+                'x86_64'      : 'x64',
+                'x86'         : 'x86',
+                'aarch64'     : 'arm64',
+                'arm64'       : 'arm64',
+                'ARM64'       : 'arm64',
+                'i386'        : 'x86',
+                'i486'        : 'x86',
+                'i686'        : 'x86',
+                'Intel 80386' : 'x86',
+                'arm'         : 'arm32',
+                'ARM'         : 'arm32',
+                'arm32'       : 'arm32',
+                'ARM32'       : 'arm32',
+        }
         CrossCompileFlags = {
+                'x64' : 'ARCH_FLAGS=-m64',
+                'x86' : 'ARCH_FLAGS=-m32',
+                'x64'  : 'ARCH_FLAGS=-m64',
+                'x86'  : 'ARCH_FLAGS=-m32',
+                'arm64': 'ARCH_FLAGS=',
+                'arm32': 'ARCH_FLAGS=',
+        }
 …
                         print("updated to %s" % self.target)
         def match(self, arch):
                 return True if not arch else self.target == arch
         @classmethod
         def make_canonical(_, arch):
+        def filter(self, tests):
+                return [test for test in tests if not test.arch or self.target == test.arch]
+        @staticmethod
+        def make_canonical(arch):
                 return Architecture.KnownArchitectures[arch]
 …
                 self.path   = "debug" if value else "nodebug"
+class AST:
+        def __init__(self, ast):
+                if ast == "new":
+                        self.target = ast
+                        self.string = "New AST"
+                        self.flags  = """AST_FLAGS=-XCFA,--new-ast"""
+                elif ast == "old":
+                        self.target = ast
+                        self.string = "Old AST"
+                        self.flags  = """AST_FLAGS=-XCFA,--old-ast"""
+                elif ast == None:
+                        self.target = "new" if config.NEWAST else "old"
+                        self.string = "Default AST (%s)" % self.target
+                        self.flags  = """AST_FLAGS="""
+                else:
+                        print("""ERROR: Invalid ast configuration, must be "old", "new" or left unspecified, was %s""" % (value), file=sys.stderr)
+                        sys.exit(1)
+        def filter(self, tests):
+                return [test for test in tests if not test.astv or self.target == test.astv]
 class Install:
         def __init__(self, value):
 …
                 self.total  = Timeouts.check(tg)
         @classmethod
         def check(_, value):
+        @staticmethod
+        def check(value):
                 if value < 1:
                         print("Timeouts must be at least 1 second", file=sys.stderr)
 …
 def init( options ):
+        global all_ast
+        global all_arch
+        global all_debug
+        global all_install
+        global ast
         global arch
+        global debug
         global archive
+        global debug
+        global distcc
+        global install
+        global continue_
         global dry_run
         global generating
-        global install
         global make
         global output_width
         global timeout
+        arch         = Architecture(options.arch)
+        global timeout2gdb
+        all_ast      = [AST(o)          for o in list(dict.fromkeys(options.ast    ))] if options.ast  else [AST(None)]
+        all_arch     = [Architecture(o) for o in list(dict.fromkeys(options.arch   ))] if options.arch else [Architecture(None)]
+        all_debug    = [Debug(o)        for o in list(dict.fromkeys(options.debug  ))]
+        all_install  = [Install(o)      for o in list(dict.fromkeys(options.install))]
         archive      = os.path.abspath(os.path.join(original_path, options.archive_errors)) if options.archive_errors else None
         debug        = Debug(options.debug)
+        continue_    = options.continue_
         dry_run      = options.dry_run # must be called before tools.config_hash()
-        distcc       = "DISTCC_CFA_PATH=~/.cfadistcc/%s/cfa" % tools.config_hash()
         generating   = options.regenerate_expected
-        install      = Install(options.install)
         make         = ['make']
         output_width = 24
         timeout      = Timeouts(options.timeout, options.global_timeout)
+        timeout2gdb  = options.timeout_with_gdb
         # if we distribute, distcc errors will fail tests, use log file for distcc
 …
 def validate():
+        """Validate the current configuration and update globals"""
+        global distcc
+        distcc       = "DISTCC_CFA_PATH=~/.cfadistcc/%s/cfa" % tools.config_hash()
         errf = os.path.join(BUILDDIR, ".validate.err")
         make_ret, out = tools.make( ".validate", error_file = errf, output_file=subprocess.DEVNULL, error=subprocess.DEVNULL )

tests/pybin/test_run.py

-              r3c64c668
+              r58fe85a
                 self.path = ''
                 self.arch = ''
+                self.astv = ''
         def toString(self):
                 return "{:25s} ({:5s} {:s})".format( self.name, self.arch if self.arch else "Any", self.target() )
+                return "{:25s} ({:5s} arch, {:s} ast: {:s})".format( self.name, self.arch if self.arch else "Any", self.astv if self.astv else "Any", self.target() )
         def prepare(self):
 …
         def expect(self):
+                return os.path.normpath( os.path.join(settings.SRCDIR  , self.path, ".expect", "%s%s.txt" % (self.name,'' if not self.arch else ".%s" % self.arch)) )
+                arch = '' if not self.arch else ".%s" % self.arch
+                astv = '' if not self.astv else ".nast" if self.astv == "new" else ".oast"
+                return os.path.normpath( os.path.join(settings.SRCDIR  , self.path, ".expect", "%s%s%s.txt" % (self.name,astv,arch)) )
         def error_log(self):
 …
                 return os.path.normpath( os.path.join(settings.BUILDDIR, self.path, self.name) )
         @classmethod
         def valid_name(_, name):
+        @staticmethod
+        def valid_name(name):
                 return not name.endswith( ('.c', '.cc', '.cpp', '.cfa') )
         @classmethod
         def from_target(_, target):
+        @staticmethod
+        def new_target(target, arch, astv):
                 test = Test()
                 test.name = os.path.basename(target)
                 test.path = os.path.relpath (os.path.dirname(target), settings.SRCDIR)
+                test.arch = settings.arch.target if settings.arch.cross_compile else ''
+                test.arch = arch.target if arch else ''
+                test.astv = astv.target if astv else ''
                 return test
 …
                         else :                                          text = "FAILED with code %d" % retcode
                 text += "    C%s - R%s" % (cls.fmtDur(duration[0]), cls.fmtDur(duration[1]))
+                text += "    C%s - R%s" % (fmtDur(duration[0]), fmtDur(duration[1]))
                 return text
-        @classmethod
-        def fmtDur( cls, duration ):
-                if duration :
-                        hours, rem = divmod(duration, 3600)
-                        minutes, rem = divmod(rem, 60)
-                        seconds, millis = divmod(rem, 1)
-                        return "%2d:%02d.%03d" % (minutes, seconds, millis * 1000)
-                return " n/a"

tests/pybin/tools.py

-              r3c64c668
+              r58fe85a
+                                        )
                                         return proc.returncode, out.decode("utf-8") if out else None
+                                        return proc.returncode, out.decode("latin-1") if out else None
                                 except subprocess.TimeoutExpired:
+                                        proc.send_signal(signal.SIGABRT)
+                                        proc.communicate()
+                                        return 124, str(None)
+                                        if settings.timeout2gdb:
+                                                print("Process {} timeout".format(proc.pid))
+                                                proc.communicate()
+                                                return 124, str(None)
+                                        else:
+                                                proc.send_signal(signal.SIGABRT)
+                                                proc.communicate()
+                                                return 124, str(None)
         except Exception as ex:
 …
                 raise
+def is_empty(fname):
+        if not os.path.isfile(fname):
+                return True
+        if os.stat(fname).st_size == 0:
+                return True
+        return False
 def is_ascii(fname):
         if settings.dry_run:
                 print("is_ascii: %s" % fname)
                 return True
+                return (True, "")
         if not os.path.isfile(fname):
                 return False
         code, out = sh("file %s" % fname, output_file=subprocess.PIPE)
+                return (False, "No file")
+        code, out = sh("file", fname, output_file=subprocess.PIPE)
         if code != 0:
                 return False
+                return (False, "'file EXPECT' failed with code {}".format(code))
         match = re.search(".*: (.*)", out)
         if not match:
+                return False
+        return match.group(1).startswith("ASCII text")
+                return (False, "Unreadable file type: '{}'".format(out))
+        if "ASCII text" in match.group(1):
+                return (True, "")
+        return (False, "File type should be 'ASCII text', was '{}'".format(match.group(1)))
 def is_exe(fname):
 …
                 return None
         file = open(file, mode)
+        file = open(file, mode, encoding="latin-1") # use latin-1 so all chars mean something.
         exitstack.push(file)
         return file
 …
                 '-s' if silent else None,
                 test_param,
+                settings.ast.flags,
                 settings.arch.flags,
                 settings.debug.flags,
 …
         cmd = [s for s in cmd if s]
         return sh(*cmd, output_file=output_file, error=error)
+def make_recon(target):
+        cmd = [
+                *settings.make,
+                '-W',
+                os.path.abspath(os.path.join(settings.BUILDDIR, '../driver/cfa')),
+                '--recon',
+                target
+        ]
+        cmd = [s for s in cmd if s]
+        return sh(*cmd, output_file=subprocess.PIPE)
 def which(program):
 …
 # helper function to check if a files contains only a specific string
 def file_contains_only(file, text) :
         with open(file) as f:
+        with open(file, encoding="latin-1") as f: # use latin-1 so all chars mean something.
                 ff = f.read().strip()
                 result = ff == text.strip()
 …
 # transform path to canonical form
 def canonical_path(path):
         abspath = os.path.abspath(__main__.__file__)
+        abspath = os.path.abspath(os.path.realpath(__main__.__file__))
         dname = os.path.dirname(abspath)
         return os.path.join(dname, os.path.normpath(path) )
 …
         raise argparse.ArgumentTypeError(msg)
+# Convert a function that converts a string to one that converts comma separated string.
+def comma_separated(elements):
+    return lambda string: [elements(part) for part in string.split(',')]
 def fancy_print(text):
         column = which('column')
 …
                 while True:
                         yield i.next(max(expire - time.time(), 0))
+def fmtDur( duration ):
+        if duration :
+                hours, rem = divmod(duration, 3600)
+                minutes, rem = divmod(rem, 60)
+                seconds, millis = divmod(rem, 1)
+                return "%2d:%02d.%03d" % (minutes, seconds, millis * 1000)
+        return " n/a"

tests/raii/.expect/ctor-autogen.txt

r3c64c668	r58fe85a
	1	done

tests/raii/.expect/init_once.txt

r3c64c668	r58fe85a
	1	done

tests/raii/ctor-autogen.cfa

r3c64c668	r58fe85a
151	151	identity(gcs);
152	152	identity(gcu);
	153	printf( "done\n" ); // non-empty .expect file
153	154	}

tests/raii/init_once.cfa

-              r3c64c668
+              r58fe85a
 // Created On       : Tue Jun 14 15:43:35 2016
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Fri Mar 22 13:41:26 2019
 // Update Count     : 4
+// Last Modified On : Fri Sep 25 15:36:39 2020
+// Update Count     : 5
 //
 …
                 static_variable();
+        }
+        printf( "done\n" );                                                                     // non-empty .expect file
+}

tests/references.cfa

-              r3c64c668
+              r58fe85a
                 int *p = &a;
                 asm (
+                        "incl %[p]\n\t"
+                        : [p] "+m" (*p)
+                        #if defined( __i386 ) || defined( __x86_64 )
+                                "incl %[p]\n\t"
+                                : [p] "+m" (*p)
+                        #elif defined( __aarch64__ )
+                                "ldr     w1, %[p]\n\t"
+                                "add     w1, w1, 1\n\t"
+                                "str     w1, %[p]\n\t"
+                                : [p] "+m" (*p) ::"w1"
+                        #endif
                 );
                 printf("%d\n", a);

tests/result.cfa

-              r3c64c668
+              r58fe85a
 // Created On       : Thr May 25 16:50:00 2017
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Thu Jul 20 15:24:12 2017
 // Update Count     : 1
+// Last Modified On : Fri Sep 25 15:22:59 2020
+// Update Count     : 2
 //
 …
         checkGetters();
         checkSetters();
+        printf( "done\n" );                             // non-empty .expect file
+}

tests/searchsort.cfa

-              r3c64c668
+              r58fe85a
         } // for
         sout | nl;
         for ( i; 0 ~ size ) {           // C version
+        for ( i; 0 ~ size ) {           // C version, returns void*
                 int key = size - i;
                 int * v = bsearch( &key, iarr, size, sizeof( iarr[0] ), comp );
+                int * v = ( int * ) bsearch( &key, iarr, size, sizeof( iarr[0] ), comp );
                 sout | key | ':' | *v | ", ";
         } // for

tests/stdincludes.cfa

-              r3c64c668
+              r58fe85a
 // Created On       : Tue Aug 29 08:26:14 2017
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Tue Nov  6 18:00:53 2018
 // Update Count     : 6
+// Last Modified On : Sun Sep 27 08:51:38 2020
+// Update Count     : 8
 //
 …
 #include <wctype.h>
+int main() {}
+int main() {
+        #pragma message( "Compiled" )                   // force non-empty .expect file
+}
 // Local Variables: //

tests/switch.cfa

-              r3c64c668
+              r58fe85a
 // Created On       : Tue Jul 12 06:50:22 2016
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Tue Nov  6 18:01:34 2018
 // Update Count     : 37
+// Last Modified On : Sun Sep 27 08:35:02 2020
+// Update Count     : 43
 //
 …
                 j = 5;
         } // choose
+        #pragma message( "Compiled" )                                           // force non-empty .expect file
 } // main

tests/test.py

-              r3c64c668
+              r58fe85a
 import argparse
+import itertools
 import re
 import sys
 …
         def match_test(path):
                 match = re.search("^%s\/([\w\/\-_]*).expect\/([\w\-_]+)(\.[\w\-_]+)?\.txt$" % settings.SRCDIR, path)
+                match = re.search("^%s\/([\w\/\-_]*).expect\/([\w\-_]+)(\.nast|\.oast)?(\.[\w\-_]+)?\.txt$" % settings.SRCDIR, path)
                 if match :
                         test = Test()
                         test.name = match.group(2)
                         test.path = match.group(1)
+                        test.arch = match.group(3)[1:] if match.group(3) else None
+                        if settings.arch.match(test.arch):
+                                expected.append(test)
+                        test.arch = match.group(4)[1:] if match.group(4) else None
+                        astv = match.group(3)[1:] if match.group(3) else None
+                        if astv == 'oast':
+                                test.astv = 'old'
+                        elif astv == 'nast':
+                                test.astv = 'new'
+                        elif astv:
+                                print('ERROR: "%s", expect file has astv but it is not "nast" or "oast"' % testname, file=sys.stderr)
+                                sys.exit(1)
+                        expected.append(test)
         path_walk( match_test )
 …
+                ]
+        # sort the test alphabetically for convenience
+        test_list.sort(key=lambda t: ('~' if t.arch else '') + t.target() + (t.arch if t.arch else ''))
         return test_list
 …
         if options.regenerate_expected :
                 for testname in options.tests :
+                        testname = canonical_path( testname )
+                        testname = os.path.normpath( os.path.join(settings.SRCDIR, testname) )
+                        # first check if this is a valid name to regenerate
                         if Test.valid_name(testname):
+                                # this is a valid name, let's check if it already exists
                                 found = [test for test in all_tests if canonical_path( test.target() ) == testname]
+                                tests.append( found[0] if len(found) == 1 else Test.from_target(testname) )
+                                setup = itertools.product(settings.all_arch if options.arch else [None], settings.all_ast if options.ast else [None])
+                                if not found:
+                                        # it's a new name, create it according to the name and specified architecture/ast version
+                                        tests.extend( [Test.new_target(testname, arch, ast) for arch, ast in setup] )
+                                elif len(found) == 1 and not found[0].arch:
+                                        # we found a single test, the user better be wanting to create a cross platform test
+                                        if options.arch:
+                                                print('ERROR: "%s", test has no specified architecture but --arch was specified, ignoring it' % testname, file=sys.stderr)
+                                        elif options.ast:
+                                                print('ERROR: "%s", test has no specified ast version but --ast was specified, ignoring it' % testname, file=sys.stderr)
+                                        else:
+                                                tests.append( found[0] )
+                                else:
+                                        # this test is already cross platform, just add a test for each platform the user asked
+                                        tests.extend( [Test.new_target(testname, arch, ast) for arch, ast in setup] )
+                                        # print a warning if it users didn't ask for a specific architecture
+                                        found_arch = [f.arch for f in found if f.arch]
+                                        if found_arch and not options.arch:
+                                                print('WARNING: "%s", test has architecture specific expected files but --arch was not specified, regenerating only for current host' % testname, file=sys.stderr)
+                                        # print a warning if it users didn't ask for a specific ast version
+                                        found_astv = [f.astv for f in found if f.astv]
+                                        if found_astv and not options.ast:
+                                                print('WARNING: "%s", test has ast version specific expected files but --ast was not specified, regenerating only for current ast' % testname, file=sys.stderr)
                         else :
                                 print('ERROR: "%s", tests are not allowed to end with a C/C++/CFA extension, ignoring it' % testname, file=sys.stderr)
 …
                         if test :
                                 tests.append( test[0] )
+                                tests.extend( test )
                         else :
                                 print('ERROR: No expected file for test %s, ignoring it' % testname, file=sys.stderr)
 …
         # create a parser with the arguments for the tests script
         parser = argparse.ArgumentParser(description='Script which runs cforall tests')
+        parser.add_argument('--debug', help='Run all tests in debug or release', type=yes_no, default='yes')
+        parser.add_argument('--install', help='Run all tests based on installed binaries or tree binaries', type=yes_no, default='no')
+        parser.add_argument('--arch', help='Test for specific architecture', type=str, default='')
+        parser.add_argument('--timeout', help='Maximum duration in seconds after a single test is considered to have timed out', type=int, default=60)
+        parser.add_argument('--ast', help='Test for specific ast', type=comma_separated(str), default=None)
+        parser.add_argument('--arch', help='Test for specific architecture', type=comma_separated(str), default=None)
+        parser.add_argument('--debug', help='Run all tests in debug or release', type=comma_separated(yes_no), default='yes')
+        parser.add_argument('--install', help='Run all tests based on installed binaries or tree binaries', type=comma_separated(yes_no), default='no')
+        parser.add_argument('--continue', help='When multiple specifications are passed (debug/install/arch), sets whether or not to continue if the last specification failed', type=yes_no, default='yes', dest='continue_')
+        parser.add_argument('--timeout', help='Maximum duration in seconds after a single test is considered to have timed out', type=int, default=120)
         parser.add_argument('--global-timeout', help='Maximum cumulative duration in seconds after the ALL tests are considered to have timed out', type=int, default=7200)
+        parser.add_argument('--timeout-with-gdb', help='Instead of killing the command when it times out, orphan it and print process id to allow gdb to attach', type=yes_no, default="no")
         parser.add_argument('--dry-run', help='Don\'t run the tests, only output the commands', action='store_true')
         parser.add_argument('--list', help='List all test available', action='store_true')
 …
         parser.add_argument('-j', '--jobs', help='Number of tests to run simultaneously', type=int)
         parser.add_argument('--list-comp', help='List all valide arguments', action='store_true')
+        parser.add_argument('--list-dist', help='List all tests for distribution', action='store_true')
         parser.add_argument('-I','--include', help='Directory of test to include, can be used multiple time, All  if omitted', action='append')
         parser.add_argument('-E','--exclude', help='Directory of test to exclude, can be used multiple time, None if omitted', action='append')
 …
         # script must have at least some tests to run or be listing
         listing    = options.list or options.list_comp
+        listing    = options.list or options.list_comp or options.list_dist
         all_tests  = options.all
         some_tests = len(options.tests) > 0
 …
         test.prepare()
+        # ----------
+        # MAKE
+        # ----------
         # build, skipping to next test on error
         with Timed() as comp_dur:
                 make_ret, _ = make( test.target(), output_file=subprocess.DEVNULL, error=out_file, error_file = err_file )
+        # ----------
+        # RUN
+        # ----------
+        # run everything in a temp directory to make sure core file are handled properly
         run_dur = None
-        # run everything in a temp directory to make sure core file are handled properly
         with tempdir():
                 # if the make command succeeds continue otherwise skip to diff
 …
                 if success(retcode):
                         if settings.generating :
                                 # if we are ounly generating the output we still need to check that the test actually exists
+                                # if we are only generating the output we still need to check that the test actually exists
                                 if no_rule(out_file, test.target()) :
                                         retcode = 1
 …
                 else:
+                        with open (out_file, "r") as myfile:
+                                error = myfile.read()
+                        if os.stat(out_file).st_size < 1048576:
+                                with open (out_file, "r", encoding='latin-1') as myfile:  # use latin-1 so all chars mean something.
+                                        error = myfile.read()
+                        else:
+                                error = "Output log can't be read, file is bigger than 1MB, see {} for actual error\n".format(out_file)
                         ret, info = core_info(exe_file)
 …
         except KeyboardInterrupt:
                 return False, ""
         except Exception as ex:
                 print("Unexpected error in worker thread: %s" % ex, file=sys.stderr)
                 sys.stderr.flush()
                 return False, ""
+        # except Exception as ex:
+        #       print("Unexpected error in worker thread running {}: {}".format(t.target(), ex), file=sys.stderr)
+        #       sys.stderr.flush()
+        #       return False, ""
 …
         make('clean', output_file=subprocess.DEVNULL, error=subprocess.DEVNULL)
+        # since python prints stacks by default on a interrupt, redo the interrupt handling to be silent
+        def worker_init():
+                def sig_int(signal_num, frame):
+                        pass
+                signal.signal(signal.SIGINT, sig_int)
+        # create the executor for our jobs and handle the signal properly
+        pool = multiprocessing.Pool(jobs, worker_init)
+        # create the executor for our jobs
+        pool = multiprocessing.Pool(jobs)
         failed = False
-        def stop(x, y):
-                print("Tests interrupted by user", file=sys.stderr)
-                sys.exit(1)
-        signal.signal(signal.SIGINT, stop)
         # for each test to run
 …
         make('clean', output_file=subprocess.DEVNULL, error=subprocess.DEVNULL)
         return 1 if failed else 0
+        return failed
 …
         settings.init( options )
+        # --------------------------------------------------
+        # list all the tests for auto-completion programs
+        # not pretty, single line, with the command line options
+        if options.list_comp :
+                # fetch the list of all valid tests
+                tests = list_tests( None, None )
+                # print the possible options
+                print("-h --help --debug --dry-run --list --ast=new --ast=old --arch --all --regenerate-expected --archive-errors --install --timeout --global-timeout --timeout-with-gdb -j --jobs -I --include -E --exclude --continue ", end='')
+                print(" ".join(map(lambda t: "%s" % (t.target()), tests)))
+                # done
+                sys.exit(0)
+        # --------------------------------------------------
+        # list the files each test depends on (expect file, input file, and the .cfa files reported by make_recon)
+        if options.list_dist :
+                # fetch the list of all valid tests
+                tests = list_tests( None, None )
+                for t in tests:
+                        print(os.path.relpath(t.expect(), settings.SRCDIR), end=' ')
+                        print(os.path.relpath(t.input() , settings.SRCDIR), end=' ')
+                        code, out = make_recon(t.target())
+                        if code != 0:
+                                print('ERROR: recond failed for test {}'.format(t.target()), file=sys.stderr)
+                                sys.exit(1)
+                        print(' '.join(re.findall('([^\s]+\.cfa)', out)), end=' ')
+                print('')
+                # done
+                sys.exit(0)
+        # --------------------------------------------------
+        # list all the tests for users, in a pretty format
+        if options.list :
+                # fetch the list of all valid tests
+                tests = list_tests( options.include, options.exclude )
+                # print the available tests
+                fancy_print("\n".join(map(lambda t: t.toString(), tests)))
+                # done
+                sys.exit(0)
         # fetch the list of all valid tests
         all_tests = list_tests( options.include, options.exclude )
         # if user wants all tests then no other treatment of the test list is required
         if options.all or options.list or options.list_comp or options.include :
+        if options.all or options.include :
                 tests = all_tests
 …
                 sys.exit(1)
+        # sort the tests alphabetically for convenience
+        tests.sort(key=lambda t: (t.arch if t.arch else '') + t.target())
+        # users may want to simply list the tests
+        if options.list_comp :
+                print("-h --help --debug --dry-run --list --arch --all --regenerate-expected --archive-errors --install --timeout --global-timeout -j --jobs ", end='')
+                print(" ".join(map(lambda t: "%s" % (t.target()), tests)))
+        elif options.list :
+                print("Listing for %s:%s"% (settings.arch.string, settings.debug.string))
+                fancy_print("\n".join(map(lambda t: t.toString(), tests)))
+        else :
+                # check the build configuration works
+                settings.prep_output(tests)
+                settings.validate()
+                options.jobs, forceJobs = job_count( options, tests )
+                settings.update_make_cmd(forceJobs, options.jobs)
+                print('%s %i tests on %i cores (%s:%s)' % (
+                        'Regenerating' if settings.generating else 'Running',
+                        len(tests),
+                        options.jobs,
+                        settings.arch.string,
+                        settings.debug.string
+                ))
+                # otherwise run all tests and make sure to return the correct error code
+                sys.exit( run_tests(tests, options.jobs) )
+        # prep invariants
+        settings.prep_output(tests)
+        failed = 0
+        # check if the expected files aren't empty
+        if not options.regenerate_expected:
+                for t in tests:
+                        if is_empty(t.expect()):
+                                print('WARNING: test "{}" has empty .expect file'.format(t.target()), file=sys.stderr)
+        # for each build configuration, run the tests
+        with Timed() as total_dur:
+                for ast, arch, debug, install in itertools.product(settings.all_ast, settings.all_arch, settings.all_debug, settings.all_install):
+                        settings.ast     = ast
+                        settings.arch    = arch
+                        settings.debug   = debug
+                        settings.install = install
+                        # filter out the tests for a different architecture
+                        # tests are the same across debug/install
+                        local_tests = settings.ast.filter( tests )
+                        local_tests = settings.arch.filter( local_tests )
+                        options.jobs, forceJobs = job_count( options, local_tests )
+                        settings.update_make_cmd(forceJobs, options.jobs)
+                        # check the build configuration works
+                        settings.validate()
+                        # print configuration
+                        print('%s %i tests on %i cores (%s:%s - %s)' % (
+                                'Regenerating' if settings.generating else 'Running',
+                                len(local_tests),
+                                options.jobs,
+                                settings.ast.string,
+                                settings.arch.string,
+                                settings.debug.string
+                        ))
+                        if not local_tests :
+                                print('WARNING: No tests for this configuration')
+                                continue
+                        # otherwise run all tests and make sure to return the correct error code
+                        failed = run_tests(local_tests, options.jobs)
+                        if failed:
+                                result = 1
+                                if not settings.continue_:
+                                        break
+        print('Tests took %s' % fmtDur( total_dur.duration ))
+        sys.exit( failed )

tests/time.cfa

-              r3c64c668
+              r58fe85a
 // Created On       : Tue Mar 27 17:24:56 2018
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Sun Jan  5 18:27:37 2020
 // Update Count     : 34
+// Last Modified On : Thu Jun 18 18:14:49 2020
+// Update Count     : 37
 //
 …
         Duration d1 = 3`h, d2 = 2`s, d3 = 3.375`s, d4 = 12`s, d5 = 1`s + 10_000`ns;
         sout | d1 | d2 | d3 | d4 | d5;
+        sout | d1`dd | d2`dm | d3`ds | d4`dms | d5`dus;
         d1 = 0;
         sout | d1 | d2 | d3;

tests/tuple/tupleAssign.cfa

-              r3c64c668
+              r58fe85a
                 double d = 0.0;
                 int i = 0;
                 char c = '\0';
+                signed char c = '\0';
                 struct X {
                         int z;
 …
                 [t, x, d, i, c, x] = (double)94.12;
                 printf( "d=%lg i=%d c=%c t=[%d, %lg, %d]\n", d, i, (int)c, t );
                 sout | "d=" | d | "i=" | i | "c=" | c | ' ' | "t=[" | t | "]";
+                sout | "d=" | d | "i=" | i | "c=" | (char)c | ' ' | "t=[" | t | "]";
                 [x, c, i, d, x, t] = (double)-94.12;
                 printf( "d=%lg i=%d c=%c t=[%d, %lg, %d]\n", d, i, c, t );
                 sout | "d=" | d | "i=" | i | "c=" | c | ' ' | "t=[" | t | "]";
+                sout | "d=" | d | "i=" | i | "c=" | (char)c | ' ' | "t=[" | t | "]";
+        }
+}

tests/typedefRedef.cfa

-              r3c64c668
+              r58fe85a
 typedef int ARR[];
 typedef int ARR[];
+// #ifdef ERR1
+// if a typedef has an array dimension,
+// it can only be redefined to the same dimension
+#ifdef ERR1
+// if a typedef has an array dimension, it can only be redefined to the same dimension
 typedef int ARR[2];
 // #endif
+#endif
 typedef int X;
 …
 int main() {
   typedef int ARR[sz];
+        typedef int ARR[sz];
   // can't redefine typedef which is VLA
+        // can't redefine typedef which is VLA
 #if ERR1
   typedef int ARR[sz];
+        typedef int ARR[sz];
 #endif
   Foo *x;
+        Foo * x;
   typedef struct Bar Foo;
   Foo *y;
+        typedef struct Bar Foo;
+        Foo * y;
+  typedef int *** pt;
+        typedef int *** pt;
+        #pragma message( "Compiled" )                   // force non-empty .expect file
+}

tests/typeof.cfa

-              r3c64c668
+              r58fe85a
 int main() {
+    int *v1;
+    typeof(v1) v2;
+    typeof(*v1) v3[4];
+    char *v4[4];
+    typeof(typeof(char *)[4]) v5;
+    typeof (int *) v6;
+    typeof( int ( int, int p ) ) *v7;
+    typeof( [int] ( int, int p ) ) *v8;
+    (typeof(v1)) v2; // cast with typeof
+        int *v1;
+        typeof(v1) v2;
+        typeof(*v1) v3[4];
+        char *v4[4];
+        typeof(typeof(char *)[4]) v5;
+        typeof (int *) v6;
+        typeof( int ( int, int p ) ) *v7;
+        typeof( [int] ( int, int p ) ) *v8;
+        (typeof(v1)) v2; // cast with typeof
+        printf( "done\n" );                             // non-empty .expect file
+}

tests/variableDeclarator.cfa

-              r3c64c668
+              r58fe85a
 // Created On       : Wed Aug 17 08:41:42 2016
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Tue Nov  6 18:02:16 2018
 // Update Count     : 2
+// Last Modified On : Sun Sep 27 07:46:17 2020
+// Update Count     : 13
 //
 …
 int (f2);
 int *f3;
 int **f4;
 int * const *f5;
+int * f3;
+int ** f4;
+int * const * f5;
 int * const * const f6;
 int *(f7);
 int **(f8);
 int * const *(f9);
+int * (f7);
+int ** (f8);
+int * const * (f9);
 int * const * const (f10);
 int (*f11);
 int (**f12);
 int (* const *f13);
+int (* f11);
+int (** f12);
+int (* const * f13);
 int (* const * const f14);
 int f15[];
+int f15[0];
 int f16[10];
 int (f17[]);
+int (f17[0]);
 int (f18[10]);
 int *f19[];
 int *f20[10];
 int **f21[];
 int **f22[10];
 int * const *f23[];
 int * const *f24[10];
 int * const * const f25[];
+int * f19[0];
+int * f20[10];
+int ** f21[0];
+int ** f22[10];
+int * const * f23[0];
+int * const * f24[10];
+int * const * const f25[0];
 int * const * const f26[10];
 int *(f27[]);
+int *(f27[0]);
 int *(f28[10]);
 int **(f29[]);
+int **(f29[0]);
 int **(f30[10]);
 int * const *(f31[]);
+int * const *(f31[0]);
 int * const *(f32[10]);
 int * const * const (f33[]);
+int * const * const (f33[0]);
 int * const * const (f34[10]);
 int (*f35)[];
 int (*f36)[10];
 int (**f37)[];
 int (**f38)[10];
 int (* const *f39)[];
 int (* const *f40)[10];
+int (* f35)[];
+int (* f36)[10];
+int (** f37)[];
+int (** f38)[10];
+int (* const * f39)[];
+int (* const * f40)[10];
 int (* const * const f41)[];
 int (* const * const f42)[10];
 int f43[][3];
+int f43[0][3];
 int f44[3][3];
 int (f45[])[3];
+int (f45[0])[3];
 int (f46[3])[3];
 int ((f47[]))[3];
+int ((f47[0]))[3];
 int ((f48[3]))[3];
 int *f49[][3];
 int *f50[3][3];
 int **f51[][3];
 int **f52[3][3];
 int * const *f53[][3];
 int * const *f54[3][3];
 int * const * const f55[][3];
+int * f49[0][3];
+int * f50[3][3];
+int ** f51[0][3];
+int ** f52[3][3];
+int * const * f53[0][3];
+int * const * f54[3][3];
+int * const * const f55[0][3];
 int * const * const f56[3][3];
 int (*f57[][3]);
 int (*f58[3][3]);
 int (**f59[][3]);
 int (**f60[3][3]);
 int (* const *f61[][3]);
 int (* const *f62[3][3]);
 int (* const * const f63[][3]);
+int (* f57[0][3]);
+int (* f58[3][3]);
+int (** f59[0][3]);
+int (** f60[3][3]);
+int (* const * f61[0][3]);
+int (* const * f62[3][3]);
+int (* const * const f63[0][3]);
 int (* const * const f64[3][3]);
 …
 int (f66)(int);
 int *f67(int);
 int **f68(int);
 int * const *f69(int);
+int * f67(int);
+int ** f68(int);
+int * const * f69(int);
 int * const * const f70(int);
 …
 int * const * const (f74)(int);
 int (*f75)(int);
 int (**f76)(int);
 int (* const *f77)(int);
+int (* f75)(int);
+int (** f76)(int);
+int (* const * f77)(int);
 int (* const * const f78)(int);
 int (*(*f79)(int))();
+int (*(* f79)(int))();
 int (*(* const f80)(int))();
 int (* const(* const f81)(int))();
 …
 //int fe2()[];                          // returning an array
 //int fe3()();                          // returning a function
 //int (*fe4)()();                               // returning a function
 //int ((*fe5())())[];                   // returning an array
+//int (* fe4)()();                              // returning a function
+//int ((* fe5())())[];                  // returning an array
+#ifdef __CFA__
 // Cforall extensions
 …
 const * const * int cf6;
 [] int cf15;
+[0] int cf15;
 [10] int cf16;
 [] * int cf19;
+[0] * int cf19;
 [10] * int cf20;
 int **cf21[];
+int ** cf21[0];
 [10] * * int cf22;
 [] * const * int cf23;
+[0] * const * int cf23;
 [10] * const * int cf24;
 [] const * const * int cf25;
+[0] const * const * int cf25;
 [10] const * const * int cf26;
 …
 const * const * [10] int cf42;
 [][3] int cf43;
+[0][3] int cf43;
 [3][3] int cf44;
 [][3] * int cf49;
+[0][3] * int cf49;
 [3][3] * int cf50;
 [][3] * * int cf51;
+[0][3] * * int cf51;
 [3][3] * * int cf52;
 [][3] const * int cf53;
+[0][3] const * int cf53;
 [3][3] * const * int cf54;
 [][3] const * const * int cf55;
+[0][3] const * const * int cf55;
 [3][3] const * const * int cf56;
 …
 *[]*[]* [ *[]*[] int ]( *[]*[] int, *[]*[] int ) v3;
+#endif // __CFA__
 //Dummy main
+int main(int argc, char const *argv[])
+{
+        return 0;
+int main( int argc, char const * argv[] ) {
+        #pragma message( "Compiled" )                                           // force non-empty .expect file
+}

tests/vector.cfa

-              r3c64c668
+              r58fe85a
 //
+#include <vector.hfa>
 #include <fstream.hfa>
-#include <vector.hfa>
 #undef assert
 …
 int main() {
         vector( int ) iv;
+        assert( ((uintptr_t)&iv.storage.storage ) == (((uintptr_t)&iv)) );
+        assert( ((uintptr_t)&iv.storage.capacity) == (((uintptr_t)&iv) + sizeof(void *)) );
+        assert( ((uintptr_t)&iv.size            ) == (((uintptr_t)&iv) + sizeof(void *) + sizeof(size_t)) );
         assert( empty( &iv ) );

tests/voidPtr.cfa

-              r3c64c668
+              r58fe85a
         if ( ! a ) {
                 abort();
+        }
+        }
+        printf( "done\n" );                             // non-empty .expect file
+}

tests/warnings/self-assignment.cfa

-              r3c64c668
+              r58fe85a
 // Created On       : Thu Mar 1 13:53:57 2018
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Wed Feb 20 07:56:17 2019
 // Update Count     : 3
+// Last Modified On : Sun Sep 27 09:24:34 2020
+// Update Count     : 6
 //
 …
         s.i = s.i;
         t.s.i = t.s.i;
+        #pragma message( "Compiled" )                   // force non-empty .expect file
+}
 // Local Variables: //
 // tab-width: 4 //
 // compile-command: "cfa dtor-early-exit" //
+// compile-command: "cfa self-assignment.cfa" //
 // End: //

tests/zombies/structMember.cfa

-              r3c64c668
+              r58fe85a
 // C useless declarations
+#ifdef ERROR
         int;
         TD;
 …
         W(int);
         W(int).X;
+#endif // ERROR
 };

tools/Makefile.am

-              r3c64c668
+              r58fe85a
 ACLOCAL_AMFLAGS  = -I automake
+noinst_PROGRAMS = busy catchsig repeat watchdog
 AM_CFLAGS = -Wall -Wextra -O2 -g
+busy_LDFLAGS     = -pthread
+noinst_PROGRAMS = busy catchsig repeat watchdog
+busy_SOURCES     = busy.c
+busy_LDFLAGS     = -pthread
+catchsig_SOURCES = catchsig.c
+repeat_SOURCES   = repeat.c
+watchdog_SOURCES = watchdog.c
+nodist_busy_SOURCES     = busy.c
+nodist_catchsig_SOURCES = catchsig.c
+nodist_repeat_SOURCES   = repeat.c
+nodist_watchdog_SOURCES = watchdog.c

tools/build/push2dist.sh

-              r3c64c668
+              r58fe85a
 hash="$1"
+bwlim="$2"
 valid=$(distcc -j 2> /dev/null)
 # if test "${valid}" != 0
 …
 # echo "Copying to machines : ${hosts} (hash=${hash})"
 files="../../../driver/cfa ../../../driver/cfa-cpp ../../../driver/cc1 ../../../driver/as $(find . -name '*.c*' | tr '\n' ' ')"
+files="../../../driver/cfa ../../../driver/cfa-cpp ../../../driver/cc1 ../../../driver/as defines.hfa $(find . -name '*.c*' | tr '\n' ' ')"
 # echo "Files ${files}"
 function push() {
         ssh ${host} "mkdir -p ~/.cfadistcc/${hash}/"
         rsync -a ${dV} ${files} ${host}:~/.cfadistcc/${hash}/.
+        rsync --bwlimit=${bwlim} -a ${dV} ${files} ${host}:~/.cfadistcc/${hash}/.
+}

tools/cfa.nanorc

-              r3c64c668
+              r58fe85a
 # Declarations
 color brightgreen "\<(struct|union|typedef|trait|coroutine|monitor|thread)\>"
 color brightgreen "\<(with)\>"
+color brightgreen "\<(struct|union|typedef|trait|coroutine|generator)\>"
+color brightgreen "\<(monitor|thread|with)\>"
 # Control Flow Structures
 color brightyellow "\<(if|else|while|do|for|switch|choose|case|default)\>"
 color brightyellow "\<(disable|enable|waitfor|when|timeout)\>"
+color brightyellow "\<(disable|enable|waitfor|when|timeout|suspend)\>"
 color brightyellow "\<(try|catch(Resume)?|finally)\>"
 …
 # Escaped Keywords, now Identifiers.
 color white "`\w+`"
+color white "``\w+"
 # Operator Names
 …
 ## Update/Redistribute
 # GCC builtins
 color cyan "__attribute__[[:space:]]*\(\([^()]*(\([^()]*\)[^()]*)*\)\)"
+color cyan "__attribute__[[:space:]]*\(\(([^)]|[^)]\))*\)\)"
 ##color cyan "__(aligned|asm|builtin|hidden|inline|packed|restrict|section|typeof|weak)__"

tools/prettyprinter/Makefile.am

r3c64c668	r58fe85a
30	30	tools_prettyprinter_PROGRAMS = pretty
31	31	tools_prettyprinterdir = ../
32		pretty_SOURCES = ${SRC}
	32	nodist_pretty_SOURCES = ${SRC}
33	33	pretty_LDADD = ${LEXLIB} -ldl # yywrap
34	34	pretty_CXXFLAGS = -Wno-deprecated -Wall -DYY_NO_INPUT -O2 -g -std=c++14

tools/stat.py

-              r3c64c668
+              r58fe85a
 #!/usr/bin/python
+#!/usr/bin/python3
 import sys
 …
                 avg = numpy.mean  (content)
                 std = numpy.std   (content)
                 print "median {0:.1f} avg {1:.1f} stddev {2:.1f}".format( med, avg, std )
+                print("median {0:.1f} avg {1:.1f} stddev {2:.1f}".format( med, avg, std ))

tools/vscode/uwaterloo.cforall-0.1.0/package.json

-              r3c64c668
+              r58fe85a
         "name": "cforall",
         "version": "0.1.0",
         "displayName": "Cforall Language Support",
+        "displayName": "C∀ (C-for-all) Language Support",
         "description": "Cforall - colorizer, grammar and snippets.",
         "publisher": "uwaterloo",
 …
                 "vscode": "^1.5.0"
         },
         "icon": "images/icon.svg",
+        "icon": "images/icon.png",
         "categories": [
                 "Languages",
+                "Programming Languages",
                 "Linters",
                 "Other"
         ],
+        "activationEvents": [
+                "onLanguage:cforall"
+        ],
+        "main": "./client/main.js",
         "contributes": {
                 "languages": [
 …
                                 "aliases": [
                                         "C∀",
+                                        "CForAll",
                                         "Cforall",
-                                        "CForAll",
                                         "cforall"
                                 ],
                                 "extensions": [
+                                        ".cf"
+                                        ".cfa",
+                                        ".hfa",
+                                        ".ifa"
                                 ],
                                 "configuration": "./cforall.configuration.json"
 …
+                        {
                                 "language": "cforall",
                                 "scopeName": "source.cf",
                                 "path": "./syntaxes/cfa.tmLanguage"
+                                "scopeName": "source.cfa",
+                                "path": "./syntaxes/cfa.tmLanguage.json"
+                        }
+                ]
+                ],
+                "configuration": {
+                        "type": "object",
+                        "title": "Example configuration",
+                        "properties": {
+                                "cforall.maxNumberOfProblems": {
+                                        "scope": "resource",
+                                        "type": "number",
+                                        "default": 100,
+                                        "description": "Controls the maximum number of problems produced by the server."
+                                },
+                                "cforall.trace.server": {
+                                        "scope": "window",
+                                        "type": "string",
+                                        "enum": [
+                                                "off",
+                                                "messages",
+                                                "verbose"
+                                        ],
+                                        "default": "off",
+                                        "description": "Traces the communication between VS Code and the language server."
+                                }
+                        }
+                }
+        },
+        "dependencies": {
+                "vscode-languageclient": "^4.1.4"
+        },
+        "devDependencies": {
+                "vscode-languageclient": "^4.1.4"
+        }
+}

Context Navigation

Changes in / [3c64c668:58fe85a]

Legend: