Index: doc/papers/llheap/Makefile
===================================================================
--- doc/papers/llheap/Makefile	(revision 9262fe909e250ac8e6f45eb30157e75ff29969c6)
+++ doc/papers/llheap/Makefile	(revision b93c54440a0fc663f1dc3879d4111a307914de1b)
@@ -17,5 +17,4 @@
 
 FIGURES = ${addsuffix .tex, \
-AddressSpace \
 AllocatorComponents \
 AllocatedObject \
@@ -57,4 +56,5 @@
 
 PICTURES = ${addsuffix .pstex, \
+AddressSpace \
 MultipleHeapsOwnershipStorage \
 PrivatePublicHeaps \
Index: doc/papers/llheap/Paper.tex
===================================================================
--- doc/papers/llheap/Paper.tex	(revision 9262fe909e250ac8e6f45eb30157e75ff29969c6)
+++ doc/papers/llheap/Paper.tex	(revision b93c54440a0fc663f1dc3879d4111a307914de1b)
@@ -82,5 +82,5 @@
 xleftmargin=\parindentlnth,				% indent code to paragraph indentation
 escapechar=\$,							% LaTeX escape in CFA code
-%mathescape=true,						% LaTeX math escape in CFA code $...$
+mathescape=false,						% disable LaTeX math escape in CFA code $...$
 keepspaces=true,						%
 showstringspaces=false,					% do not show spaces with cup
@@ -90,4 +90,15 @@
 numberstyle=\footnotesize\sf,			% numbering style
 moredelim=**[is][\color{red}]{@}{@},
+% replace/adjust listing characters that look bad in sanserif
+literate=
+%  {-}{\makebox[1ex][c]{\raisebox{0.4ex}{\rule{0.75ex}{0.1ex}}}}1
+  {-}{\raisebox{-1pt}{\ttfamily-}}1
+  {^}{\raisebox{0.6ex}{\(\scriptstyle\land\,\)}}1
+  {~}{\raisebox{0.3ex}{\(\scriptstyle\sim\,\)}}1
+  {'}{\ttfamily'\hspace*{-0.4ex}}1
+  {`}{\ttfamily\upshape\hspace*{-0.3ex}`}1
+  {<-}{$\leftarrow$}2
+  {=>}{$\Rightarrow$}2
+%  {->}{\raisebox{-1pt}{\texttt{-}}\kern-0.1ex\textgreater}2,
 }% lstset
 
@@ -95,26 +106,23 @@
 \lstdefinelanguage{CFA}[ANSI]{C}{
 	morekeywords={
-		_Alignas, _Alignof, __alignof, __alignof__, asm, __asm, __asm__, __attribute, __attribute__,
-		auto, _Bool, catch, catchResume, choose, _Complex, __complex, __complex__, __const, __const__,
-		coroutine, disable, dtype, enable, exception, __extension__, fallthrough, fallthru, finally,
-		__float80, float80, __float128, float128, forall, ftype, generator, _Generic, _Imaginary, __imag, __imag__,
-		inline, __inline, __inline__, __int128, int128, __label__, monitor, mutex, _Noreturn, one_t, or,
-		otype, restrict, resume, __restrict, __restrict__, __signed, __signed__, _Static_assert, suspend, thread,
-		_Thread_local, throw, throwResume, timeout, trait, try, ttype, typeof, __typeof, __typeof__,
-		virtual, __volatile, __volatile__, waitfor, when, with, zero_t},
+		_Alignas, _Alignof, __alignof, __alignof__, and, asm, __asm, __asm__, _Atomic, __attribute, __attribute__,
+		__auto_type, basetypeof, _Bool, catch, catchResume, choose, coerce, _Complex, __complex, __complex__, __const, __const__,
+		coroutine, _Decimal32, _Decimal64, _Decimal128, disable, enable, exception, __extension__, fallthrough, fallthru, finally, fixup,
+		__float80, float80, __float128, float128, _Float16, _Float32, _Float32x, _Float64, _Float64x, _Float128, _Float128x,
+		forall, fortran, generator, _Generic, _Imaginary, __imag, __imag__, inline, __inline, __inline__, int128, __int128, __int128_t,
+		__label__, monitor, mutex, _Noreturn, __builtin_offsetof, one_t, or, recover, report, restrict, __restrict, __restrict__,
+		__signed, __signed__, _Static_assert, suspend, thread, __thread, _Thread_local, throw, throwResume, timeout, trait, try,
+		typeof, __typeof, __typeof__, typeid, __uint128_t, __builtin_va_arg, __builtin_va_list, virtual, __volatile, __volatile__,
+		vtable, waitfor, waituntil, when, with, zero_t,
+    },
 	moredirectives={defined,include_next},
-	% replace/adjust listing characters that look bad in sanserif
-	literate={-}{\makebox[1ex][c]{\raisebox{0.5ex}{\rule{0.8ex}{0.1ex}}}}1 {^}{\raisebox{0.6ex}{$\scriptstyle\land\,$}}1
-		{~}{\raisebox{0.3ex}{$\scriptstyle\sim\,$}}1 % {`}{\ttfamily\upshape\hspace*{-0.1ex}`}1
-		{<}{\textrm{\textless}}1 {>}{\textrm{\textgreater}}1
-		{<-}{$\leftarrow$}2 {=>}{$\Rightarrow$}2 {->}{\makebox[1ex][c]{\raisebox{0.5ex}{\rule{0.8ex}{0.075ex}}}\kern-0.2ex{\textrm{\textgreater}}}2,
 }
 
 % uC++ programming language, based on ANSI C++
-\lstdefinelanguage{uC++}[ANSI]{C++}{
+\lstdefinelanguage{uC++}[GNU]{C++}{
 	morekeywords={
-		_Accept, _AcceptReturn, _AcceptWait, _Actor, _At, _CatchResume, _Cormonitor, _Coroutine, _Disable,
-		_Else, _Enable, _Event, _Finally, _Monitor, _Mutex, _Nomutex, _PeriodicTask, _RealTimeTask,
-		_Resume, _Select, _SporadicTask, _Task, _Timeout, _When, _With, _Throw},
+		_Accept, _AcceptReturn, _AcceptWait, _Actor, _At, _Catch, _CatchResume, _CorActor, _Cormonitor, _Coroutine,
+		_Disable, _Else, _Enable, _Event, _Exception, _Finally, _Monitor, _Mutex, _Nomutex, _PeriodicTask, _RealTimeTask,
+		_Resume, _ResumeTop, _Select, _SporadicTask, _Task, _Timeout, _When, _With, _Throw},
 }
 
@@ -133,29 +141,12 @@
 	morestring=[b]",
 	morestring=[s]{`}{`},
-	% replace/adjust listing characters that look bad in sanserif
-	literate={-}{\makebox[1ex][c]{\raisebox{0.4ex}{\rule{0.8ex}{0.1ex}}}}1 {^}{\raisebox{0.6ex}{$\scriptstyle\land\,$}}1
-		{~}{\raisebox{0.3ex}{$\scriptstyle\sim\,$}}1 % {`}{\ttfamily\upshape\hspace*{-0.1ex}`}1
-		{<}{\textrm{\textless}}1 {>}{\textrm{\textgreater}}1
-		{<-}{\makebox[2ex][c]{\textrm{\textless}\raisebox{0.5ex}{\rule{0.8ex}{0.075ex}}}}2,
 }
 
-\lstnewenvironment{cfa}[1][]
-{\lstset{language=CFA,moredelim=**[is][\protect\color{red}]{@}{@}}\lstset{#1}}
-{}
-\lstnewenvironment{C++}[1][]                            % use C++ style
-{\lstset{language=C++,moredelim=**[is][\protect\color{red}]{@}{@}}\lstset{#1}}
-{}
-\lstnewenvironment{uC++}[1][]
-{\lstset{language=uC++,moredelim=**[is][\protect\color{red}]{@}{@}}\lstset{#1}}
-{}
-\lstnewenvironment{Go}[1][]
-{\lstset{language=Golang,moredelim=**[is][\protect\color{red}]{@}{@}}\lstset{#1}}
-{}
-\lstnewenvironment{python}[1][]
-{\lstset{language=python,moredelim=**[is][\protect\color{red}]{@}{@}}\lstset{#1}}
-{}
-\lstnewenvironment{java}[1][]
-{\lstset{language=java,moredelim=**[is][\protect\color{red}]{@}{@}}\lstset{#1}}
-{}
+\lstnewenvironment{cfa}[1][]{\lstset{language=CFA,moredelim=**[is][\protect\color{red}]{@}{@}}\lstset{#1}}{}
+\lstnewenvironment{C++}[1][]{\lstset{language=C++,moredelim=**[is][\protect\color{red}]{@}{@}}\lstset{#1}}{}
+\lstnewenvironment{uC++}[1][]{\lstset{language=uC++,moredelim=**[is][\protect\color{red}]{@}{@}}\lstset{#1}}{}
+\lstnewenvironment{Go}[1][]{\lstset{language=Golang,moredelim=**[is][\protect\color{red}]{@}{@}}\lstset{#1}}{}
+\lstnewenvironment{python}[1][]{\lstset{language=python,moredelim=**[is][\protect\color{red}]{@}{@}}\lstset{#1}}{}
+\lstnewenvironment{java}[1][]{\lstset{language=java,moredelim=**[is][\protect\color{red}]{@}{@}}\lstset{#1}}{}
 
 % inline code @...@
@@ -193,10 +184,11 @@
 
 \author[1]{Mubeen Zulfiqar}
+\author[1]{Ayelet Wasik}
 \author[1]{Peter A. Buhr*}
-\author[1]{Thierry Delisle}
-\author[1]{Ayelet Wasik}
+\author[2]{Bryan Chan}
 \authormark{ZULFIQAR \textsc{et al.}}
 
 \address[1]{\orgdiv{Cheriton School of Computer Science}, \orgname{University of Waterloo}, \orgaddress{\state{Waterloo, ON}, \country{Canada}}}
+\address[2]{\orgdiv{Huawei Compiler Lab}, \orgname{Huawei}, \orgaddress{\state{Markham, ON}, \country{Canada}}}
 
 \corres{*Peter A. Buhr, Cheriton School of Computer Science, University of Waterloo, 200 University Avenue West, Waterloo, ON N2L 3G1, Canada. \email{pabuhr{\char`\@}uwaterloo.ca}}
@@ -204,27 +196,24 @@
 % \fundingInfo{Natural Sciences and Engineering Research Council of Canada}
 
-\abstract[Summary]{
-A new C-based concurrent memory-allocator is presented, called llheap.
+\abstract[Summary]{%
+A new C-based concurrent memory-allocator is presented, called llheap (low latency).
 It can be used standalone in C/\CC applications with multiple kernel threads, or embedded into high-performance user-threading programming languages.
 llheap extends the feature set of existing C allocation by remembering zero-filled (\lstinline{calloc}) and aligned properties (\lstinline{memalign}) in an allocation.
 These properties can be queried, allowing programmers to write safer programs by preserving these properties in future allocations.
-As well, \lstinline{realloc} preserves these properties when enlarging storage requests, again increasing future allocation safety.
-llheap also extends the C allocation API with \lstinline{resize}, extended \lstinline{realloc}, \lstinline{aalloc}, \lstinline{amemalign}, and \lstinline{cmemalign} providing orthongoal ac, so programmers do not make mistakes writing theses useful allocation operations.
-It is competitive with the best current memory allocators,
-The ability to use \CFA's advanced type-system (and possibly \CC's too) to combine advanced memory operations into one allocation routine using named arguments shows how far the allocation API can be pushed, which increases safety and greatly simplifies programmer's use of dynamic allocation.
- low-latency
- without a performance loss
-The llheap allocator also provides comprehensive statistics for all allocation operations, which are invaluable in understanding and debugging a program's dynamic behaviour.
-As well, llheap provides a debugging mode where allocations are checked with internal pre/post conditions and invariants. It is extremely useful, especially for students.
-% No other memory allocator examined in the work provides such comprehensive statistics gathering.
+As well, \lstinline{realloc} preserves these properties when adjusting storage size, again increasing future allocation safety.
+llheap also extends the C allocation API with \lstinline{aalloc}, \lstinline{amemalign}, \lstinline{cmemalign}, \lstinline{resize}, and extended \lstinline{realloc}, providing orthogonal access to allocation features;
+hence, programmers do not have to code missing combinations.
+The llheap allocator also provides a contention-free statistics gathering mode, and a debugging mode for dynamically checking allocation pre/post conditions and invariants.
+These modes are invaluable for understanding and debugging a program's dynamic allocation behaviour, with low enough cost to be used in production code.
+The llheap API is further extended with the \CFA advanced type-system, providing a single type-safe allocation routine using named arguments, increasing safety and simplifying usage.
+Finally, performance results across a number of benchmarks show llheap is competitive with the best memory allocators.
+}% abstract
+
 % While not as powerful as the \lstinline{valgrind} interpreter, a large number of allocations mistakes are detected.
-% Finally, contention-free statistics gathering and debugging have a low enough cost to be used in production code.
-% 
 % A micro-benchmark test-suite is started for comparing allocators, rather than relying on a suite of arbitrary programs. It has been an interesting challenge.
 % These micro-benchmarks have adjustment knobs to simulate allocation patterns hard-coded into arbitrary test programs.
 % Existing memory allocators, glibc, dlmalloc, hoard, jemalloc, ptmalloc3, rpmalloc, tbmalloc, and the new allocator llheap are all compared using the new micro-benchmark test-suite.
-}% aabstract
-
-\keywords{C \CFA (Cforall) coroutine concurrency generator monitor parallelism runtime thread}
+
+\keywords{memory allocation, (user-level) concurrency, type-safety, statistics, debugging, high performance}
 
 
@@ -237,9 +226,9 @@
 \section{Introduction}
 
-Memory management takes a sequence of program generated allocation/deallocation requests and attempts to satisfy them within a fixed-sized block of memory while minimizing the total amount of memory used.
-A general-purpose dynamic-allocation algorithm cannot anticipate future allocation requests so its output is rarely optimal.
-However, memory allocators do take advantage of regularities in allocation patterns for typical programs to produce excellent results, both in time and space (similar to LRU paging).
-In general, allocators use a number of similar techniques, each optimizing specific allocation patterns.
-Nevertheless, memory allocators are a series of compromises, occasionally with some static or dynamic tuning parameters to optimize specific program-request patterns.
+Memory management services a series of program allocation/deallocation requests and attempts to satisfy them from a variable-sized block of memory, while minimizing total memory usage.
+A general-purpose dynamic-allocation algorithm cannot anticipate allocation requests so its time and space performance is rarely optimal.
+However, allocators take advantage of regular allocation patterns in typical programs to produce excellent results, both in time and space (similar to LRU paging).
+Allocators use a number of similar techniques, but each optimizes specific allocation patterns.
+Nevertheless, allocators are a series of compromises, occasionally with some static or dynamic tuning parameters to optimize specific program-request patterns.
 
 
@@ -247,17 +236,17 @@
 \label{s:MemoryStructure}
 
-Figure~\ref{f:ProgramAddressSpace} shows the typical layout of a program's address space divided into the following zones (right to left): static code/data, dynamic allocation, dynamic code/data, and stack, with free memory surrounding the dynamic code/data~\cite{memlayout}.
+Figure~\ref{f:ProgramAddressSpace} shows the typical layout of a program's address space (high to low) divided into a number of zones, with free memory surrounding the dynamic code/data~\cite{memlayout}.
 Static code and data are placed into memory at load time from the executable and are fixed-sized at runtime.
-Dynamic-allocation memory starts empty and grows/shrinks as the program dynamically creates/deletes variables with independent lifetime.
-The programming-language's runtime manages this area, where management complexity is a function of the mechanism for deleting variables.
 Dynamic code/data memory is managed by the dynamic loader for libraries loaded at runtime, which is complex especially in a multi-threaded program~\cite{Huang06}.
 However, changes to the dynamic code/data space are typically infrequent, many occurring at program startup, and are largely outside of a program's control.
 Stack memory is managed by the program call/return-mechanism using a LIFO technique, which works well for sequential programs.
 For stackful coroutines and user threads, a new stack is commonly created in the dynamic-allocation memory.
+The dynamic-allocation memory is often a contiguous area (can be memory mapped as multiple areas), which starts empty and grows/shrinks as the program creates/deletes variables with independent lifetime.
+The programming-language's runtime manages this area, where management complexity is a function of the mechanism for deleting variables.
 This work focuses solely on management of the dynamic-allocation memory.
 
 \begin{figure}
 \centering
-\input{AddressSpace}
+\input{AddressSpace.pstex_t}
 \vspace{-5pt}
 \caption{Program Address Space Divided into Zones}
@@ -269,20 +258,20 @@
 \label{s:DynamicMemoryManagement}
 
-Modern programming languages manage dynamic-allocation memory in different ways.
+Modern programming languages manage dynamic memory in different ways.
 Some languages, such as Lisp~\cite{CommonLisp}, Java~\cite{Java}, Haskell~\cite{Haskell}, Go~\cite{Go}, provide explicit allocation but \emph{implicit} deallocation of data through garbage collection~\cite{Wilson92}.
 In general, garbage collection supports memory compaction, where dynamic (live) data is moved during runtime to better utilize space.
-However, moving data requires finding pointers to it and updating them to reflect new data locations.
+However, moving data requires finding and updating pointers to it to reflect the new data locations.
 Programming languages such as C~\cite{C}, \CC~\cite{C++}, and Rust~\cite{Rust} provide the programmer with explicit allocation \emph{and} deallocation of data.
 These languages cannot find and subsequently move live data because pointers can be created to any storage zone, including internal components of allocated objects, and may contain temporary invalid values generated by pointer arithmetic.
 Attempts have been made to perform quasi garbage collection in C/\CC~\cite{Boehm88}, but it is a compromise.
-This work only examines dynamic memory-management with \emph{explicit} deallocation.
+This work only examines dynamic management with \emph{explicit} deallocation.
 While garbage collection and compaction are not part this work, many of the results are applicable to the allocation phase in any memory-management approach.
 
-Most programs use a general-purpose allocator, often the one provided implicitly by the programming-language's runtime.
-When this allocator proves inadequate, programmers often write specialize allocators for specific needs.
-C and \CC allow easy replacement of the default memory allocator with an alternative specialized or general-purpose memory-allocator.
+Most programs use a general-purpose allocator, usually the one provided by the programming-language's runtime.
+In certain languages, programmers can write specialized allocators for specific needs.
+C and \CC allow easy replacement of the default memory allocator through a standard API.
 Jikes RVM MMTk~\cite{MMTk} provides a similar generalization for the Java virtual machine.
-However, high-performance memory-allocators for kernel and user multi-threaded programs are still being designed and improved.
-For this reason, several alternative general-purpose allocators have been written for C/\CC with the goal of scaling in a multi-threaded program~\cite{Berger00,mtmalloc,streamflow,tcmalloc}.
+As well, new languages support concurrency (kernel and/or user threading), which must be safely handled by the allocator.
+Hence, several alternative allocators exist for C/\CC with the goal of scaling in a multi-threaded program~\cite{Berger00,mtmalloc,streamflow,tcmalloc}.
 This work examines the design of high-performance allocators for use by kernel and user multi-threaded applications written in C/\CC.
 
@@ -294,17 +283,23 @@
 \begin{enumerate}[leftmargin=*,itemsep=0pt]
 \item
-Implementation of a new stand-alone concurrent low-latency memory-allocator ($\approx$1,200 lines of code) for C/\CC programs using kernel threads (1:1 threading), and specialized versions of the allocator for the programming languages \uC~\cite{uC++} and \CFA~\cite{Moss18,Delisle21} using user-level threads running on multiple kernel threads (M:N threading).
-
-\item
-Extend the standard C heap functionality by preserving with each allocation: its request size plus the amount allocated, whether an allocation is zero fill and/or allocation alignment.
+Implementation of a new stand-alone concurrent low-latency memory-allocator ($\approx$1,200 lines of code) for C/\CC programs using kernel threads (1:1 threading), and specialized versions for the concurrent languages \uC~\cite{uC++} and \CFA~\cite{Moss18,Delisle21} using user-level threads running on multiple kernel threads (M:N threading).
+
+\item
+Extend the standard C heap functionality by preserving with each allocation its request size, the amount allocated, whether it is zero fill, and its alignment.
 
 \item
 Use the preserved zero fill and alignment as \emph{sticky} properties for @realloc@ to zero-fill and align when storage is extended or copied.
-Without this extension, it is unsafe to @realloc@ storage initially allocated with zero-fill/alignment as these properties are not preserved when copying.
-This silent generation of a problem is unintuitive to programmers and difficult to locate because it is transient.
-
-\item
-Provide additional heap operations to complete programmer expectation with respect to accessing different allocation properties.
-\begin{itemize}[topsep=3pt,itemsep=2pt,parsep=0pt]
+Without this extension, it is unsafe to @realloc@ storage allocated with zero fill or alignment because these properties are not preserved when copying.
+This silent problem is unintuitive to programmers and difficult to locate because it is transient.
+
+\item
+Provide additional heap operations to make allocation properties orthogonally accessible.
+\begin{itemize}[topsep=2pt,itemsep=2pt,parsep=0pt]
+\item
+@aalloc( dim, elemSize )@ same as @calloc@ except memory is \emph{not} zero filled.
+\item
+@amemalign( alignment, dim, elemSize )@ same as @aalloc@ with memory alignment.
+\item
+@cmemalign( alignment, dim, elemSize )@ same as @calloc@ with memory alignment.
 \item
 @resize( oaddr, size )@ re-purpose an old allocation for a new type \emph{without} preserving fill or alignment.
@@ -313,14 +308,5 @@
 \item
 @realloc( oaddr, alignment, size )@ same as @realloc@ but adding or changing alignment.
-\item
-@aalloc( dim, elemSize )@ same as @calloc@ except memory is \emph{not} zero filled.
-\item
-@amemalign( alignment, dim, elemSize )@ same as @aalloc@ with memory alignment.
-\item
-@cmemalign( alignment, dim, elemSize )@ same as @calloc@ with memory alignment.
 \end{itemize}
-
-\item
-Provide additional heap wrapper functions in \CFA creating a more usable set of allocation operations and properties.
 
 \item
@@ -328,23 +314,23 @@
 \begin{itemize}[topsep=3pt,itemsep=2pt,parsep=0pt]
 \item
-@malloc_alignment( addr )@ returns the alignment of the allocation pointed-to by @addr@.
+@malloc_alignment( addr )@ returns the alignment of the allocation.
 If the allocation is not aligned or @addr@ is @NULL@, the minimal alignment is returned.
 \item
-@malloc_zero_fill( addr )@ returns a boolean result indicating if the memory pointed-to by @addr@ is allocated with zero fill, e.g., by @calloc@/@cmemalign@.
-\item
-@malloc_size( addr )@ returns the size of the memory allocation pointed-to by @addr@.
-\item
-@malloc_usable_size( addr )@ returns the usable (total) size of the memory pointed-to by @addr@, i.e., the bin size containing the allocation, where @malloc_size( addr )@ $\le$ @malloc_usable_size( addr )@.
+@malloc_zero_fill( addr )@ returns a boolean result indicating if the memory is allocated with zero fill, e.g., by @calloc@/@cmemalign@.
+\item
+@malloc_size( addr )@ returns the size of the memory allocation.
+\item
+@malloc_usable_size( addr )@ returns the usable (total) size of the memory, i.e., the bin size containing the allocation, where @malloc_size( addr )@ $\le$ @malloc_usable_size( addr )@.
 \end{itemize}
 
 \item
-Provide complete, fast, and contention-free allocation statistics to help understand allocation behaviour:
+Provide optional extensive, fast, and contention-free allocation statistics to understand allocation behaviour, accessed by:
 \begin{itemize}[topsep=3pt,itemsep=2pt,parsep=0pt]
 \item
-@malloc_stats()@ print memory-allocation statistics on the file-descriptor set by @malloc_stats_fd@.
-\item
-@malloc_info( options, stream )@ print memory-allocation statistics as an XML string on the specified file-descriptor set by @malloc_stats_fd@.
-\item
-@malloc_stats_fd( fd )@ set file-descriptor number for printing memory-allocation statistics (default @STDERR_FILENO@).
+@malloc_stats()@ print memory-allocation statistics on the file-descriptor set by @malloc_stats_fd@ (default @stderr@).
+\item
+@malloc_info( options, stream )@ print memory-allocation statistics as an XML string on the specified file-descriptor set by @malloc_stats_fd@ (default @stderr@).
+\item
+@malloc_stats_fd( fd )@ set file-descriptor number for printing memory-allocation statistics (default @stderr@).
 This file descriptor is used implicitly by @malloc_stats@ and @malloc_info@.
 \end{itemize}
@@ -355,5 +341,8 @@
 \item
 Build 8 different versions of the allocator: static or dynamic linking, with or without statistics or debugging.
-A program may link to any of these 8 versions of the allocator often without recompilation.
+A program may link to any of these 8 versions of the allocator often without recompilation (@LD_PRELOAD@).
+
+\item
+Provide additional heap wrapper functions in \CFA creating a more usable set of allocation operations and properties.
 
 \item
@@ -365,11 +354,9 @@
 \section{Background}
 
-The following discussion is a quick overview of the moving-pieces that affect the design of a memory allocator and its performance.
-Dynamic acquires and releases obtain storage for a program variable, called an \newterm{object}, through calls such as @malloc@ and @free@ in C, and @new@ and @delete@ in \CC.
-Space for each allocated object comes from the dynamic-allocation zone.
-
+The following is a quick overview of allocator design options that affect memory usage and performance (see~\cite{Zulfiqar22} for more details).
+Dynamic acquires and releases obtain storage for a program variable, called an \newterm{object}, through calls such as @malloc@/@new@ and @free@/@delete@ in C/\CC.
 A \newterm{memory allocator} contains a complex data-structure and code that manages the layout of objects in the dynamic-allocation zone.
 The management goals are to make allocation/deallocation operations as fast as possible while densely packing objects to make efficient use of memory.
-Objects in C/\CC cannot be moved to aid the packing process, only adjacent free storage can be \newterm{coalesced} into larger free areas.
+Since objects in C/\CC cannot be moved to aid the packing process, only adjacent free storage can be \newterm{coalesced} into larger free areas.
 The allocator grows or shrinks the dynamic-allocation zone to obtain storage for objects and reduce memory usage via operating-system calls, such as @mmap@ or @sbrk@ in UNIX.
 
@@ -383,6 +370,6 @@
 The \newterm{storage data} is composed of allocated and freed objects, and \newterm{reserved memory}.
 Allocated objects (light grey) are variable sized, and are allocated and maintained by the program;
-\ie only the program knows the location of allocated storage not the memory allocator.
-Freed objects (white) represent memory deallocated by the program, which are linked into one or more lists facilitating easy location of new allocations.
+\ie only the program knows the location of allocated storage.
+Freed objects (white) represent memory deallocated by the program, which are linked into one or more lists facilitating location of new allocations.
 Reserved memory (dark grey) is one or more blocks of memory obtained from the \newterm{operating system} (OS) but not yet allocated to the program;
 if there are multiple reserved blocks, they are also chained together.
@@ -401,5 +388,4 @@
 An object may be preceded by padding to ensure proper alignment.
 Some algorithms quantize allocation requests, resulting in additional space after an object less than the quantized value.
-% The buckets are often organized as an array of ascending bucket sizes for fast searching, \eg binary search, and the array is stored in the heap management-area, where each bucket is a top point to the freed objects of that size.
 When padding and spacing are necessary, neither can be used to satisfy a future allocation request while the current allocation exists.
 
@@ -407,4 +393,5 @@
 Often the free list is chained internally so it does not consume additional storage, \ie the link fields are placed at known locations in the unused memory blocks.
 For internal chaining, the amount of management data for a free node defines the minimum allocation size, \eg if 16 bytes are needed for a free-list node, allocation requests less than 16 bytes are rounded up.
+Often the minimum storage alignment and free-node size are the same.
 The information in an allocated or freed object is overwritten when it transitions from allocated to freed and vice-versa by new program data and/or management information.
 
@@ -420,15 +407,13 @@
 \label{s:SingleThreadedMemoryAllocator}
 
-A single-threaded memory-allocator does not run any threads itself, but is used by a single-threaded program.
-Because the memory allocator is only executed by a single thread, concurrency issues do not exist.
-The primary issues in designing a single-threaded memory-allocator are fragmentation and locality.
-
+In a sequential (single threaded) program, the program thread performs all allocation operations and concurrency issues do not exist.
+However, interrupts logically introduce concurrency, if the signal handler performs allocation/deallocation (serially reusable problem~\cite{SeriallyReusable}).
+In general, the primary issues in a single-threaded allocator are fragmentation and locality.
 
 \subsubsection{Fragmentation}
 \label{s:Fragmentation}
 
-Fragmentation is memory requested from the OS but not used by the program;
-hence, allocated objects are not fragmentation.
-Figure~\ref{f:InternalExternalFragmentation} shows fragmentation is divided into two forms: internal or external.
+Fragmentation is memory requested from the OS but not used by allocated objects in the program.
+Figure~\ref{f:InternalExternalFragmentation} shows fragmentation is divided into two forms: \emph{internal} or \emph{external}.
 
 \begin{figure}
@@ -439,14 +424,14 @@
 \end{figure}
 
-\newterm{Internal fragmentation} is memory space that is allocated to the program, but is not intended to be accessed by the program, such as headers, trailers, padding, and spacing around an allocated object.
-Internal fragmentation is problematic when management space is a significant proportion of an allocated object, \eg for small objects ($<$16 bytes), memory usage is doubled.
-An allocator should strive to keep internal management information to a minimum.
-
-\newterm{External fragmentation} is all memory space reserved from the OS but not allocated to the program~\cite{Wilson95,Lim98,Siebert00}, which includes all external management data, freed objects, and reserved memory.
+\newterm{Internal fragmentation} is inaccessible allocated memory, such as headers, trailers, padding, and spacing around an allocated object.
+Internal fragmentation is problematic when management space becomes a significant proportion of an allocated object, \eg for objects $<$16 bytes, memory usage doubles.
+An allocator strives to keep internal management information to a minimum.
+
+\newterm{External fragmentation} is memory not allocated in the program~\cite{Wilson95,Lim98,Siebert00}, which includes all external management data, freed objects, and reserved memory.
 This memory is problematic in two ways: heap blowup and highly fragmented memory.
 \newterm{Heap blowup} occurs when freed memory cannot be reused for future allocations leading to potentially unbounded external fragmentation growth~\cite{Berger00}.
-Memory can become \newterm{highly fragmented} after multiple allocations and deallocations of objects, resulting in a checkerboard of adjacent allocated and free areas, where the free blocks have become to small to service requests.
+Memory can become \newterm{highly fragmented} after multiple allocations and deallocations of objects, resulting in a checkerboard of adjacent allocated and free areas, where the free blocks are too small to service requests.
 % Figure~\ref{f:MemoryFragmentation} shows an example of how a small block of memory fragments as objects are allocated and deallocated over time.
-Heap blowup can occur due to allocator policies that are too restrictive in reusing freed memory (the allocated size cannot use a larger free block) and/or no coalescing of free storage.
+Heap blowup occurs with allocator policies that are too restrictive in reusing freed memory, \eg the allocated size cannot use a larger free block and/or no coalescing of free storage.
 % Blocks of free memory become smaller and non-contiguous making them less useful in serving allocation requests.
 % Memory is highly fragmented when most free blocks are unusable because of their sizes.
@@ -479,19 +464,18 @@
 
 The second approach is a \newterm{segregated} or \newterm{binning algorithm} with a set of lists for different sized freed objects.
-When an object is allocated, the requested size is rounded up to the nearest bin-size, often leading to spacing after the object.
+When an object is allocated, the requested size is rounded up to the nearest bin-size, often leading to space after the object.
 A binning algorithm is fast at finding free memory of the appropriate size and allocating it, since the first free object on the free list is used.
-The fewer bin sizes, the fewer lists need to be searched and maintained;
-however, unusable space after object increases, leading to more internal fragmentation.
-The more bin sizes, the longer the search and the less likely a matching free objects is found, leading to more external fragmentation and potentially heap blowup.
-A variation of the binning algorithm allows objects to be allocated from larger bin sizes when the matching bins is empty, and the freed object can be returned to the matching or larger bin (some advantages to either scheme).
+Fewer bin sizes means a faster search to find a matching bin, but larger differences between allocation and bin size, which increases unusable space after objects (internal fragmentation).
+More bin sizes means a slower search but smaller differences between allocation and bin size, resulting in less internal fragmentation but more external fragmentation if larger bins cannot service smaller requests.
+Allowing larger bins to service smaller allocations when the matching bin is empty means the freed object can be returned to the matching or larger bin (some advantages to either scheme).
 % For example, with bin sizes of 8 and 16 bytes, a request for 12 bytes allocates only 12 bytes, but when the object is freed, it is placed on the 8-byte bin-list.
 % For subsequent requests, the bin free-lists contain objects of different sizes, ranging from one bin-size to the next (8-16 in this example), and a sequential-fit algorithm may be used to find an object large enough for the requested size on the associated bin list.
 
-The third approach is \newterm{splitting} and \newterm{coalescing algorithms}.
-When an object is allocated, if there are no free objects of the requested size, a larger free object is split into two smaller objects to satisfy the allocation request rather than obtaining more memory from the OS.
-For example, in the \newterm{buddy system}, a block of free memory is split into equal chunks, one of those chunks is again split, and so on until a minimal block is created that fits the requested object.
-When an object is deallocated, it is coalesced with the objects immediately before and after it in memory, if they are free, turning them into one larger block.
+The third approach is a \newterm{splitting} and \newterm{coalescing} algorithm.
+When an object is allocated, if there is no matching free storage, a larger free object is split into two smaller objects, one matching the allocation size.
+For example, in the \newterm{buddy system}, a block of free memory is split into equal chunks, and splitting continues until a minimal block is created that fits the allocation.
+When an object is deallocated, it is coalesced with the objects immediately before/after it in memory, if they are free, turning them into a larger block.
 Coalescing can be done eagerly at each deallocation or lazily when an allocation cannot be fulfilled.
-In all cases, coalescing increases allocation latency, hence some allocations can cause unbounded delays.
+However, coalescing increases latency (unbounded delays) for both allocation and deallocation.
 While coalescing does not reduce external fragmentation, the coalesced blocks improve fragmentation quality so future allocations are less likely to cause heap blowup.
 % Splitting and coalescing can be used with other algorithms to avoid highly fragmented memory.
@@ -504,20 +488,19 @@
 % Temporal clustering implies a group of objects are accessed repeatedly within a short time period, while spatial clustering implies a group of objects physically close together (nearby addresses) are accessed repeatedly within a short time period.
 % Temporal locality commonly occurs during an iterative computation with a fixed set of disjoint variables, while spatial locality commonly occurs when traversing an array.
-Hardware takes advantage of the working set through multiple levels of caching, \ie memory hierarchy.
+Hardware takes advantage of the working set through multiple levels of caching and paging, \ie memory hierarchy.
 % When an object is accessed, the memory physically located around the object is also cached with the expectation that the current and nearby objects will be referenced within a short period of time.
 For example, entire cache lines are transferred between cache and memory, and entire virtual-memory pages are transferred between memory and disk.
 % A program exhibiting good locality has better performance due to fewer cache misses and page faults\footnote{With the advent of large RAM memory, paging is becoming less of an issue in modern programming.}.
 
-Temporal locality is largely controlled by how a program accesses its variables~\cite{Feng05}.
-Nevertheless, a memory allocator can have some indirect influence on temporal locality and largely dictates spatial locality.
-For temporal locality, an allocator can return storage for new allocations that was just freed as these memory locations are still \emph{warm} in the memory hierarchy.
-For spatial locality, an allocator can place objects used together close together in memory, so the working set of the program fits into the fewest possible cache lines and pages.
+Temporal locality is largely controlled by program accesses to its variables~\cite{Feng05}.
+An allocator has only indirect influence on temporal locality but largely dictates spatial locality.
+For temporal locality, an allocator tries to return recently freed storage for new allocations, as this memory is still \emph{warm} in the memory hierarchy.
+For spatial locality, an allocator places objects used together close together in memory, so the working set of the program fits into the fewest possible cache lines and pages.
 % However, usage patterns are different for every program as is the underlying hardware memory architecture;
 % hence, no general-purpose memory-allocator can provide ideal locality for every program on every computer.
 
-There are a number of ways a memory allocator can degrade locality by increasing the working set.
-For example, a memory allocator may access multiple free objects before finding one to satisfy an allocation request, \eg sequential-fit algorithm, which can perturb the program's memory hierarchy causing multiple cache or page misses~\cite{Grunwald93}.
-Another way locality can be degraded is by spatially separating related data.
-For example, in a binning allocator, objects of different sizes are allocated from different bins that may be located in different pages of memory.
+An allocator can easily degrade locality by increasing the working set.
+An allocator can access an unbounded number of free objects when matching an allocation or coalescing, causing multiple cache or page misses~\cite{Grunwald93}.
+An allocator can spatially separate related data by binning free storage anywhere in memory, so the related objects are highly separated.
 
 
@@ -525,6 +508,6 @@
 \label{s:MultiThreadedMemoryAllocator}
 
-A multi-threaded memory-allocator does not run any threads itself, but is used by a multi-threaded program.
-In addition to single-threaded design issues of fragmentation and locality, a multi-threaded allocator is simultaneously accessed by multiple threads, and hence, must deal with concurrency issues such as mutual exclusion, false sharing, and additional forms of heap blowup.
+In a concurrent (multi-threaded) program, multiple program threads perform allocation operations and all concurrency issues arise.
+Along with fragmentation and locality issues, a multi-threaded allocator must deal with mutual exclusion, false sharing, and additional forms of heap blowup.
 
 
@@ -534,11 +517,11 @@
 \newterm{Mutual exclusion} provides sequential access to the shared-management data of the heap.
 There are two performance issues for mutual exclusion.
-First is the overhead necessary to perform (at least) a hardware atomic operation every time a shared resource is accessed.
-Second is when multiple threads contend for a shared resource simultaneously, and hence, some threads must wait until the resource is released.
+First is the cost of performing at least one hardware atomic operation every time a shared resource is accessed.
+Second is \emph{contention} on simultaneous access, so some threads must wait until the resource is released.
 Contention can be reduced in a number of ways:
-1) Using multiple fine-grained locks versus a single lock to spread the contention across a number of locks.
-2) Using trylock and generating new storage if the lock is busy, yielding a classic space versus time tradeoff.
+1) Using multiple fine-grained locks versus a single lock to spread the contention across the locks.
+2) Using trylock and generating new storage if the lock is busy (classic space versus time tradeoff).
 3) Using one of the many lock-free approaches for reducing contention on basic data-structure operations~\cite{Oyama99}.
-However, all of these approaches have degenerate cases where program contention is high, which occurs outside of the allocator.
+However, all approaches have degenerate cases where program contention to the heap is high, which is beyond the allocator's control.
 
 
@@ -546,39 +529,12 @@
 \label{s:FalseSharing}
 
-False sharing is a dynamic phenomenon leading to cache thrashing.
-When two or more threads on separate CPUs simultaneously change different objects sharing a cache line, the change invalidates the other thread's associated cache, even though these threads may be uninterested in the other modified object.
-False sharing can occur in three different ways: program induced, allocator-induced active, and allocator-induced passive;
-a memory allocator can only affect the latter two.
-
-Specifically, assume two objects, O$_1$ and O$_2$, share a cache line, with threads, T$_1$ and T$_2$.
-\newterm{Program-induced false-sharing} occurs when T$_1$ passes a reference to O$_2$ to T$_2$, and then T$_1$ modifies O$_1$ while T$_2$ modifies O$_2$.
-% Figure~\ref{f:ProgramInducedFalseSharing} shows when Thread$_1$ passes Object$_2$ to Thread$_2$, a false-sharing situation forms when Thread$_1$ modifies Object$_1$ and Thread$_2$ modifies Object$_2$.
-% Changes to Object$_1$ invalidate CPU$_2$'s cache line, and changes to Object$_2$ invalidate CPU$_1$'s cache line.
-% \begin{figure}
-% \centering
-% \subfloat[Program-Induced False-Sharing]{
-% 	\input{ProgramFalseSharing}
-% 	\label{f:ProgramInducedFalseSharing}
-% } \\
-% \vspace{5pt}
-% \subfloat[Allocator-Induced Active False-Sharing]{
-% 	\input{AllocInducedActiveFalseSharing}
-% 	\label{f:AllocatorInducedActiveFalseSharing}
-% } \\
-% \vspace{5pt}
-% \subfloat[Allocator-Induced Passive False-Sharing]{
-% 	\input{AllocInducedPassiveFalseSharing}
-% 	\label{f:AllocatorInducedPassiveFalseSharing}
-% } subfloat
-% \caption{False Sharing}
-% \label{f:FalseSharing}
-% \end{figure}
-\newterm{Allocator-induced active false-sharing}\label{s:AllocatorInducedActiveFalseSharing} occurs when O$_1$ and O$_2$ are heap allocated and their references are passed to T$_1$ and T$_2$, which modify the objects.
-% For example, in Figure~\ref{f:AllocatorInducedActiveFalseSharing}, each thread allocates an object and loads a cache-line of memory into its associated cache.
-% Again, changes to Object$_1$ invalidate CPU$_2$'s cache line, and changes to Object$_2$ invalidate CPU$_1$'s cache line.
-\newterm{Allocator-induced passive false-sharing}\label{s:AllocatorInducedPassiveFalseSharing} occurs
-% is another form of allocator-induced false-sharing caused by program-induced false-sharing.
-% When an object in a program-induced false-sharing situation is deallocated, a future allocation of that object may cause passive false-sharing.
-when T$_1$ passes O$_2$ to T$_2$, and T$_2$ subsequently deallocates O$_2$, and then O$_2$ is reallocated to T$_2$ while T$_1$ is still using O$_1$.
+False sharing occurs when two or more threads simultaneously modify different objects sharing a cache line.
+Changes now invalidate each thread's cache, even though the threads may be uninterested in the other modified object.
+False sharing can occur three ways:
+1) Thread T$_1$ allocates objects O$_1$ and O$_2$ on the same cache line and passes O$_2$'s reference to thread T$_2$;
+both threads now simultaneously modify the objects on the same cache line.
+2) Objects O$_1$ and O$_2$ are allocated on the same cache line by thread T$_3$ and their references are passed to T$_1$ and T$_2$, which simultaneously modify the objects.
+3) T$_2$ deallocates O$_2$, T$_1$ allocates O$_1$ on the same cache line as O$_2$, and T$_2$ reallocates O$_2$ while T$_1$ is using O$_1$.
+In all three cases, the allocator performs a hidden and possibly transient (non-deterministic) operation, making it extremely difficult to find and fix the issue.
 
 
@@ -586,23 +542,17 @@
 \label{s:HeapBlowup}
 
-In a multi-threaded program, heap blowup can occur when memory freed by one thread is inaccessible to other threads due to the allocation strategy.
+In a multi-threaded program, heap blowup occurs when memory freed by one thread is inaccessible to other threads due to the allocation strategy.
 Specific examples are presented in later subsections.
 
 
-\subsection{Multi-Threaded Memory-Allocator Features}
-\label{s:MultiThreadedMemoryAllocatorFeatures}
-
-The following features are used in the construction of multi-threaded memory-allocators: multiple heaps, user-level threading, ownership, object containers, allocation buffer, lock-free operations.
-The first feature, multiple heaps, pertains to different kinds of heaps.
-The second feature, object containers, pertains to the organization of objects within the storage area.
-The remaining features apply to different parts of the allocator design or implementation.
-
+\subsection{Multi-Threaded Allocator Features}
+\label{s:MultiThreadedAllocatorFeatures}
+
+The following features are used in the construction of multi-threaded allocators.
 
 \subsubsection{Multiple Heaps}
 \label{s:MultipleHeaps}
 
-A multi-threaded allocator has potentially multiple threads and heaps.
-The multiple threads cause complexity, and multiple heaps are a mechanism for dealing with the complexity.
-The spectrum ranges from multiple threads using a single heap, denoted as T:1, to multiple threads sharing multiple heaps, denoted as T:H, to one thread per heap, denoted as 1:1, which is almost back to a single-threaded allocator.
+Figure~\ref{f:ThreadHeapRelationship} shows how a multi-threaded allocator can subdivide a single global heap into multiple heaps to reduce contention among threads.
 
 \begin{figure}
@@ -626,19 +576,20 @@
 } % subfloat
 \caption{Multiple Heaps, Thread:Heap Relationship}
-\end{figure}
-
-\paragraph{T:1 model (see Figure~\ref{f:SingleHeap})} where all threads allocate and deallocate objects from one heap.
+\label{f:ThreadHeapRelationship}
+\end{figure}
+
+\begin{description}[leftmargin=*]
+\item[T:1 model (Figure~\ref{f:SingleHeap})] has all threads allocating and deallocating objects from one heap.
 Memory is obtained from the freed objects, or reserved memory in the heap, or from the OS;
 the heap may also return freed memory to the OS.
-The arrows indicate the direction memory conceptually moves for each kind of operation: allocation moves memory along the path from the heap/operating-system to the user application, while deallocation moves memory along the path from the application back to the heap/operating-system.
+The arrows indicate the direction memory moves for each allocation/deallocation operation.
 To safely handle concurrency, a single lock may be used for all heap operations or fine-grained locking for different operations.
-Regardless, a single heap may be a significant source of contention for programs with a large amount of memory allocation.
-
-\paragraph{T:H model (see Figure~\ref{f:SharedHeaps})} where each thread allocates storage from several heaps depending on certain criteria, with the goal of reducing contention by spreading allocations/deallocations across the heaps.
-The decision on when to create a new heap and which heap a thread allocates from depends on the allocator design.
-To determine which heap to access, each thread must point to its associated heap in some way.
-The performance goal is to reduce the ratio of heaps to threads.
-However, the worse case can result in more heaps than threads, \eg if the number of threads is large at startup with many allocations creating a large number of heaps and then the number of threads reduces.
-Locking is required, since more than one thread may concurrently access a heap during its lifetime, but contention is reduced because fewer threads access a specific heap.
+Regardless, a single heap is a significant source of contention for threaded programs with a large number of memory allocations.
+
+\item[T:H model (Figure~\ref{f:SharedHeaps})] subdivides the heap independently from the threads.
+The decision to create a heap and which heap a thread allocates from/deallocates to during its lifetime depends on the allocator design.
+Locking is required within each heap because of multiple thread access, but contention is reduced because fewer threads access a specific heap.
+The goal is to have minimal heaps (storage) and thread contention per heap (time).
+However, the worst case results in more heaps than threads, \eg if the number of threads is large at startup creating a large number of heaps and then the number of threads reduces.
 
 % For example, multiple heaps are managed in a pool, starting with a single or a fixed number of heaps that increase\-/decrease depending on contention\-/space issues.
@@ -693,9 +644,9 @@
 In general, the cost is minimal since the majority of memory operations are completed without the use of the global heap.
 
-\paragraph{1:1 model (see Figure~\ref{f:PerThreadHeap})} where each thread has its own heap eliminating most contention and locking because threads seldom access another thread's heap (see Section~\ref{s:Ownership}).
+\item[1:1 model (Figure~\ref{f:PerThreadHeap})] has each thread with its own heap, eliminating most contention and locking because threads seldom access another thread's heap (see Section~\ref{s:Ownership}).
 An additional benefit of thread heaps is improved locality due to better memory layout.
 As each thread only allocates from its heap, all objects are consolidated in the storage area for that heap, better utilizing each CPUs cache and accessing fewer pages.
 In contrast, the T:H model spreads each thread's objects over a larger area in different heaps.
-Thread heaps can also eliminate allocator-induced active false-sharing, if memory is acquired so it does not overlap at crucial boundaries with memory for another thread's heap.
+Thread heaps can also reduce false-sharing, except at crucial boundaries overlapping memory from another thread's heap.
 For example, assume page boundaries coincide with cache line boundaries, if a thread heap always acquires pages of memory then no two threads share a page or cache line unless pointers are passed among them.
 % Hence, allocator-induced active false-sharing cannot occur because the memory for thread heaps never overlaps.
@@ -706,4 +657,5 @@
 Destroying the thread heap immediately may reduce external fragmentation sooner, since all free objects are freed to the global heap and may be reused by other threads.
 Alternatively, reusing thread heaps may improve performance if the inheriting thread makes similar allocation requests as the thread that previously held the thread heap because any unfreed storage is immediately accessible.
+\end{description}
 
 
Index: doc/papers/llheap/figures/AddressSpace.fig
===================================================================
--- doc/papers/llheap/figures/AddressSpace.fig	(revision 9262fe909e250ac8e6f45eb30157e75ff29969c6)
+++ doc/papers/llheap/figures/AddressSpace.fig	(revision b93c54440a0fc663f1dc3879d4111a307914de1b)
@@ -9,40 +9,40 @@
 1200 2
 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
-	 5700 1350 6600 1350 6600 2100 5700 2100 5700 1350
-2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
-	 1200 1350 2100 1350 2100 2100 1200 2100 1200 1350
-2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
-	 4800 1350 5700 1350 5700 2100 4800 2100 4800 1350
+	 1200 1200 2100 1200 2100 1800 1200 1800 1200 1200
+2 2 0 1 0 7 60 -1 17 0.000 0 0 -1 0 0 5
+	 2100 1200 3000 1200 3000 1800 2100 1800 2100 1200
 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
 	1 1 1.00 45.00 90.00
-	 2100 1725 2400 1725
+	 2100 1500 2400 1500
 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
 	1 1 1.00 45.00 90.00
-	 3000 1725 2700 1725
+	 3000 1500 2700 1500
+2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
+	 3000 1200 3900 1200 3900 1800 3000 1800 3000 1200
+2 2 0 1 0 7 60 -1 17 0.000 0 0 -1 0 0 5
+	 3900 1200 4800 1200 4800 1800 3900 1800 3900 1200
 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
 	1 1 1.00 45.00 90.00
-	 3900 1725 4200 1725
+	 3900 1500 4200 1500
 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
 	1 1 1.00 45.00 90.00
-	 4800 1725 4500 1725
-2 2 0 1 0 7 60 -1 17 0.000 0 0 -1 0 0 5
-	 2100 1350 3000 1350 3000 2100 2100 2100 2100 1350
+	 4800 1500 4500 1500
 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
-	 3000 1350 3900 1350 3900 2100 3000 2100 3000 1350
-2 2 0 1 0 7 60 -1 17 0.000 0 0 -1 0 0 5
-	 3900 1350 4800 1350 4800 2100 3900 2100 3900 1350
-4 0 0 50 -1 0 10 0.0000 2 180 900 1200 2325 high address\001
-4 2 0 50 -1 0 10 0.0000 2 135 855 6600 2325 low address\001
-4 1 0 50 -1 0 10 0.0000 2 120 330 6150 2025 Data\001
-4 1 0 50 -1 0 10 0.0000 2 135 675 6150 1800 Code and\001
-4 1 0 50 -1 0 10 0.0000 2 120 390 6150 1575 Static\001
-4 1 0 50 -1 0 10 0.0000 2 135 390 1650 1800 Stack\001
-4 1 0 50 -1 0 10 0.0000 2 165 615 2550 1950 Memory\001
-4 1 0 50 -1 0 10 0.0000 2 165 615 4350 1950 Memory\001
-4 1 0 50 -1 0 10 0.0000 2 120 315 2550 1650 Free\001
-4 1 0 50 -1 0 10 0.0000 2 120 330 3450 2025 Data\001
-4 1 0 50 -1 0 10 0.0000 2 135 675 3450 1800 Code and\001
-4 1 0 50 -1 0 10 0.0000 2 165 645 3450 1575 Dynamic\001
-4 1 0 50 -1 0 10 0.0000 2 120 315 4350 1650 Free\001
-4 1 0 50 -1 0 10 0.0000 2 120 735 5250 1950 Allocation\001
-4 1 0 50 -1 0 10 0.0000 2 165 645 5250 1650 Dynamic\001
+	 4800 1200 5700 1200 5700 1800 4800 1800 4800 1200
+2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
+	 5700 1200 6600 1200 6600 1800 5700 1800 5700 1200
+4 0 0 50 -1 0 10 0.0000 2 165 870 1200 2025 high address\001
+4 2 0 50 -1 0 10 0.0000 2 120 810 6600 2025 low address\001
+4 1 0 50 -1 0 10 0.0000 2 120 375 1650 1575 Stack\001
+4 1 0 50 -1 0 10 0.0000 2 150 600 2550 1725 Memory\001
+4 1 0 50 -1 0 10 0.0000 2 120 300 2550 1425 Free\001
+4 1 0 50 -1 0 10 0.0000 2 120 660 3450 1575 Code and\001
+4 1 0 50 -1 0 10 0.0000 2 150 630 3450 1350 Dynamic\001
+4 1 0 50 -1 0 10 0.0000 2 120 315 3450 1775 Data\001
+4 1 0 50 -1 0 10 0.0000 2 120 300 4350 1425 Free\001
+4 1 0 50 -1 0 10 0.0000 2 150 600 4350 1725 Memory\001
+4 1 4 50 -1 0 10 0.0000 2 150 630 5250 1425 Dynamic\001
+4 1 0 50 -1 0 10 0.0000 2 120 315 6150 1775 Data\001
+4 1 0 50 -1 0 10 0.0000 2 120 660 6150 1575 Code and\001
+4 1 0 50 -1 0 10 0.0000 2 120 375 6150 1350 Static\001
+4 1 4 50 -1 0 10 0.0000 2 120 720 5250 1725 Allocation\001