Changeset 7a29392f


Ignore:
Timestamp:
Mar 6, 2024, 8:26:49 AM (7 months ago)
Author:
Peter A. Buhr <pabuhr@…>
Branches:
master
Children:
7e13b11
Parents:
647d633
Message:

continue work on memory allocation paper

Location:
doc/papers/llheap
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • doc/papers/llheap/Makefile

    r647d633 r7a29392f  
    1717
    1818FIGURES = ${addsuffix .tex, \
    19 AddressSpace \
    2019AllocatorComponents \
    2120AllocatedObject \
     
    5756
    5857PICTURES = ${addsuffix .pstex, \
     58AddressSpace \
    5959MultipleHeapsOwnershipStorage \
    6060PrivatePublicHeaps \
  • doc/papers/llheap/Paper.tex

    r647d633 r7a29392f  
    8282xleftmargin=\parindentlnth,                             % indent code to paragraph indentation
    8383escapechar=\$,                                                  % LaTeX escape in CFA code
    84 %mathescape=true,                                               % LaTeX math escape in CFA code $...$
     84mathescape=false,                                               % disable LaTeX math escape in CFA code $...$
    8585keepspaces=true,                                                %
    8686showstringspaces=false,                                 % do not show spaces with cup
     
    9090numberstyle=\footnotesize\sf,                   % numbering style
    9191moredelim=**[is][\color{red}]{@}{@},
     92% replace/adjust listing characters that look bad in sanserif
     93literate=
     94%  {-}{\makebox[1ex][c]{\raisebox{0.4ex}{\rule{0.75ex}{0.1ex}}}}1
     95  {-}{\raisebox{-1pt}{\ttfamily-}}1
     96  {^}{\raisebox{0.6ex}{\(\scriptstyle\land\,\)}}1
     97  {~}{\raisebox{0.3ex}{\(\scriptstyle\sim\,\)}}1
     98  {'}{\ttfamily'\hspace*{-0.4ex}}1
     99  {`}{\ttfamily\upshape\hspace*{-0.3ex}`}1
     100  {<-}{$\leftarrow$}2
     101  {=>}{$\Rightarrow$}2
     102%  {->}{\raisebox{-1pt}{\texttt{-}}\kern-0.1ex\textgreater}2,
    92103}% lstset
    93104
     
    95106\lstdefinelanguage{CFA}[ANSI]{C}{
    96107        morekeywords={
    97                 _Alignas, _Alignof, __alignof, __alignof__, asm, __asm, __asm__, __attribute, __attribute__,
    98                 auto, _Bool, catch, catchResume, choose, _Complex, __complex, __complex__, __const, __const__,
    99                 coroutine, disable, dtype, enable, exception, __extension__, fallthrough, fallthru, finally,
    100                 __float80, float80, __float128, float128, forall, ftype, generator, _Generic, _Imaginary, __imag, __imag__,
    101                 inline, __inline, __inline__, __int128, int128, __label__, monitor, mutex, _Noreturn, one_t, or,
    102                 otype, restrict, resume, __restrict, __restrict__, __signed, __signed__, _Static_assert, suspend, thread,
    103                 _Thread_local, throw, throwResume, timeout, trait, try, ttype, typeof, __typeof, __typeof__,
    104                 virtual, __volatile, __volatile__, waitfor, when, with, zero_t},
     108                _Alignas, _Alignof, __alignof, __alignof__, and, asm, __asm, __asm__, _Atomic, __attribute, __attribute__,
     109                __auto_type, basetypeof, _Bool, catch, catchResume, choose, coerce, _Complex, __complex, __complex__, __const, __const__,
     110                coroutine, _Decimal32, _Decimal64, _Decimal128, disable, enable, exception, __extension__, fallthrough, fallthru, finally, fixup,
     111                __float80, float80, __float128, float128, _Float16, _Float32, _Float32x, _Float64, _Float64x, _Float128, _Float128x,
     112                forall, fortran, generator, _Generic, _Imaginary, __imag, __imag__, inline, __inline, __inline__, int128, __int128, __int128_t,
     113                __label__, monitor, mutex, _Noreturn, __builtin_offsetof, one_t, or, recover, report, restrict, __restrict, __restrict__,
     114                __signed, __signed__, _Static_assert, suspend, thread, __thread, _Thread_local, throw, throwResume, timeout, trait, try,
     115                typeof, __typeof, __typeof__, typeid, __uint128_t, __builtin_va_arg, __builtin_va_list, virtual, __volatile, __volatile__,
     116                vtable, waitfor, waituntil, when, with, zero_t,
     117    },
    105118        moredirectives={defined,include_next},
    106         % replace/adjust listing characters that look bad in sanserif
    107         literate={-}{\makebox[1ex][c]{\raisebox{0.5ex}{\rule{0.8ex}{0.1ex}}}}1 {^}{\raisebox{0.6ex}{$\scriptstyle\land\,$}}1
    108                 {~}{\raisebox{0.3ex}{$\scriptstyle\sim\,$}}1 % {`}{\ttfamily\upshape\hspace*{-0.1ex}`}1
    109                 {<}{\textrm{\textless}}1 {>}{\textrm{\textgreater}}1
    110                 {<-}{$\leftarrow$}2 {=>}{$\Rightarrow$}2 {->}{\makebox[1ex][c]{\raisebox{0.5ex}{\rule{0.8ex}{0.075ex}}}\kern-0.2ex{\textrm{\textgreater}}}2,
    111119}
    112120
    113121% uC++ programming language, based on ANSI C++
    114 \lstdefinelanguage{uC++}[ANSI]{C++}{
     122\lstdefinelanguage{uC++}[GNU]{C++}{
    115123        morekeywords={
    116                 _Accept, _AcceptReturn, _AcceptWait, _Actor, _At, _CatchResume, _Cormonitor, _Coroutine, _Disable,
    117                 _Else, _Enable, _Event, _Finally, _Monitor, _Mutex, _Nomutex, _PeriodicTask, _RealTimeTask,
    118                 _Resume, _Select, _SporadicTask, _Task, _Timeout, _When, _With, _Throw},
     124                _Accept, _AcceptReturn, _AcceptWait, _Actor, _At, _Catch, _CatchResume, _CorActor, _Cormonitor, _Coroutine,
     125                _Disable, _Else, _Enable, _Event, _Exception, _Finally, _Monitor, _Mutex, _Nomutex, _PeriodicTask, _RealTimeTask,
     126                _Resume, _ResumeTop, _Select, _SporadicTask, _Task, _Timeout, _When, _With, _Throw},
    119127}
    120128
     
    133141        morestring=[b]",
    134142        morestring=[s]{`}{`},
    135         % replace/adjust listing characters that look bad in sanserif
    136         literate={-}{\makebox[1ex][c]{\raisebox{0.4ex}{\rule{0.8ex}{0.1ex}}}}1 {^}{\raisebox{0.6ex}{$\scriptstyle\land\,$}}1
    137                 {~}{\raisebox{0.3ex}{$\scriptstyle\sim\,$}}1 % {`}{\ttfamily\upshape\hspace*{-0.1ex}`}1
    138                 {<}{\textrm{\textless}}1 {>}{\textrm{\textgreater}}1
    139                 {<-}{\makebox[2ex][c]{\textrm{\textless}\raisebox{0.5ex}{\rule{0.8ex}{0.075ex}}}}2,
    140143}
    141144
    142 \lstnewenvironment{cfa}[1][]
    143 {\lstset{language=CFA,moredelim=**[is][\protect\color{red}]{@}{@}}\lstset{#1}}
    144 {}
    145 \lstnewenvironment{C++}[1][]                            % use C++ style
    146 {\lstset{language=C++,moredelim=**[is][\protect\color{red}]{@}{@}}\lstset{#1}}
    147 {}
    148 \lstnewenvironment{uC++}[1][]
    149 {\lstset{language=uC++,moredelim=**[is][\protect\color{red}]{@}{@}}\lstset{#1}}
    150 {}
    151 \lstnewenvironment{Go}[1][]
    152 {\lstset{language=Golang,moredelim=**[is][\protect\color{red}]{@}{@}}\lstset{#1}}
    153 {}
    154 \lstnewenvironment{python}[1][]
    155 {\lstset{language=python,moredelim=**[is][\protect\color{red}]{@}{@}}\lstset{#1}}
    156 {}
    157 \lstnewenvironment{java}[1][]
    158 {\lstset{language=java,moredelim=**[is][\protect\color{red}]{@}{@}}\lstset{#1}}
    159 {}
     145\lstnewenvironment{cfa}[1][]{\lstset{language=CFA,moredelim=**[is][\protect\color{red}]{@}{@}}\lstset{#1}}{}
     146\lstnewenvironment{C++}[1][]{\lstset{language=C++,moredelim=**[is][\protect\color{red}]{@}{@}}\lstset{#1}}{}
     147\lstnewenvironment{uC++}[1][]{\lstset{language=uC++,moredelim=**[is][\protect\color{red}]{@}{@}}\lstset{#1}}{}
     148\lstnewenvironment{Go}[1][]{\lstset{language=Golang,moredelim=**[is][\protect\color{red}]{@}{@}}\lstset{#1}}{}
     149\lstnewenvironment{python}[1][]{\lstset{language=python,moredelim=**[is][\protect\color{red}]{@}{@}}\lstset{#1}}{}
     150\lstnewenvironment{java}[1][]{\lstset{language=java,moredelim=**[is][\protect\color{red}]{@}{@}}\lstset{#1}}{}
    160151
    161152% inline code @...@
     
    193184
    194185\author[1]{Mubeen Zulfiqar}
     186\author[1]{Ayelet Wasik}
    195187\author[1]{Peter A. Buhr*}
    196 \author[1]{Thierry Delisle}
    197 \author[1]{Ayelet Wasik}
     188\author[2]{Bryan Chan}
    198189\authormark{ZULFIQAR \textsc{et al.}}
    199190
    200191\address[1]{\orgdiv{Cheriton School of Computer Science}, \orgname{University of Waterloo}, \orgaddress{\state{Waterloo, ON}, \country{Canada}}}
     192\address[2]{\orgdiv{Huawei Compiler Lab}, \orgname{Huawei}, \orgaddress{\state{Markham, ON}, \country{Canada}}}
    201193
    202194\corres{*Peter A. Buhr, Cheriton School of Computer Science, University of Waterloo, 200 University Avenue West, Waterloo, ON N2L 3G1, Canada. \email{pabuhr{\char`\@}uwaterloo.ca}}
     
    204196% \fundingInfo{Natural Sciences and Engineering Research Council of Canada}
    205197
    206 \abstract[Summary]{
    207 A new C-based concurrent memory-allocator is presented, called llheap.
     198\abstract[Summary]{%
     199A new C-based concurrent memory-allocator is presented, called llheap (low latency).
    208200It can be used standalone in C/\CC applications with multiple kernel threads, or embedded into high-performance user-threading programming languages.
    209201llheap extends the feature set of existing C allocation by remembering zero-filled (\lstinline{calloc}) and aligned properties (\lstinline{memalign}) in an allocation.
    210202These properties can be queried, allowing programmers to write safer programs by preserving these properties in future allocations.
    211 As well, \lstinline{realloc} preserves these properties when enlarging storage requests, again increasing future allocation safety.
    212 llheap also extends the C allocation API with \lstinline{resize}, extended \lstinline{realloc}, \lstinline{aalloc}, \lstinline{amemalign}, and \lstinline{cmemalign} providing orthongoal ac, so programmers do not make mistakes writing theses useful allocation operations.
    213 It is competitive with the best current memory allocators,
    214 The ability to use \CFA's advanced type-system (and possibly \CC's too) to combine advanced memory operations into one allocation routine using named arguments shows how far the allocation API can be pushed, which increases safety and greatly simplifies programmer's use of dynamic allocation.
    215  low-latency
    216  without a performance loss
    217 The llheap allocator also provides comprehensive statistics for all allocation operations, which are invaluable in understanding and debugging a program's dynamic behaviour.
    218 As well, llheap provides a debugging mode where allocations are checked with internal pre/post conditions and invariants. It is extremely useful, especially for students.
    219 % No other memory allocator examined in the work provides such comprehensive statistics gathering.
     203As well, \lstinline{realloc} preserves these properties when adjusting storage size, again increasing future allocation safety.
     204llheap also extends the C allocation API with \lstinline{aalloc}, \lstinline{amemalign}, \lstinline{cmemalign}, \lstinline{resize}, and extended \lstinline{realloc}, providing orthogonal access to allocation features;
     205hence, programmers do not have to code missing combinations.
     206The llheap allocator also provides a contention-free statistics gathering mode, and a debugging mode for dynamically checking allocation pre/post conditions and invariants.
     207These modes are invaluable for understanding and debugging a program's dynamic allocation behaviour, with low enough cost to be used in production code.
     208The llheap API is further extended with the \CFA advanced type-system, providing a single type-safe allocation routine using named arguments, increasing safety and simplifying usage.
     209Finally, performance results across a number of benchmarks show llheap is competitive with the best memory allocators.
     210}% abstract
     211
    220212% While not as powerful as the \lstinline{valgrind} interpreter, a large number of allocations mistakes are detected.
    221 % Finally, contention-free statistics gathering and debugging have a low enough cost to be used in production code.
    222 %
    223213% A micro-benchmark test-suite is started for comparing allocators, rather than relying on a suite of arbitrary programs. It has been an interesting challenge.
    224214% These micro-benchmarks have adjustment knobs to simulate allocation patterns hard-coded into arbitrary test programs.
    225215% Existing memory allocators, glibc, dlmalloc, hoard, jemalloc, ptmalloc3, rpmalloc, tbmalloc, and the new allocator llheap are all compared using the new micro-benchmark test-suite.
    226 }% aabstract
    227 
    228 \keywords{C \CFA (Cforall) coroutine concurrency generator monitor parallelism runtime thread}
     216
     217\keywords{memory allocation, (user-level) concurrency, type-safety, statistics, debugging, high performance}
    229218
    230219
     
    237226\section{Introduction}
    238227
    239 Memory management takes a sequence of program generated allocation/deallocation requests and attempts to satisfy them within a fixed-sized block of memory while minimizing the total amount of memory used.
    240 A general-purpose dynamic-allocation algorithm cannot anticipate future allocation requests so its output is rarely optimal.
    241 However, memory allocators do take advantage of regularities in allocation patterns for typical programs to produce excellent results, both in time and space (similar to LRU paging).
    242 In general, allocators use a number of similar techniques, each optimizing specific allocation patterns.
    243 Nevertheless, memory allocators are a series of compromises, occasionally with some static or dynamic tuning parameters to optimize specific program-request patterns.
     228Memory management services a series of program allocation/deallocation requests and attempts to satisfy them from a variable-sized block of memory, while minimizing total memory usage.
     229A general-purpose dynamic-allocation algorithm cannot anticipate allocation requests so its time and space performance is rarely optimal.
     230However, allocators take advantage of regular allocation patterns in typical programs to produce excellent results, both in time and space (similar to LRU paging).
     231Allocators use a number of similar techniques, but each optimizes specific allocation patterns.
     232Nevertheless, allocators are a series of compromises, occasionally with some static or dynamic tuning parameters to optimize specific program-request patterns.
    244233
    245234
     
    247236\label{s:MemoryStructure}
    248237
    249 Figure~\ref{f:ProgramAddressSpace} shows the typical layout of a program's address space divided into the following zones (right to left): static code/data, dynamic allocation, dynamic code/data, and stack, with free memory surrounding the dynamic code/data~\cite{memlayout}.
     238Figure~\ref{f:ProgramAddressSpace} shows the typical layout of a program's address space (high to low) divided into a number of zones, with free memory surrounding the dynamic code/data~\cite{memlayout}.
    250239Static code and data are placed into memory at load time from the executable and are fixed-sized at runtime.
    251 Dynamic-allocation memory starts empty and grows/shrinks as the program dynamically creates/deletes variables with independent lifetime.
    252 The programming-language's runtime manages this area, where management complexity is a function of the mechanism for deleting variables.
    253240Dynamic code/data memory is managed by the dynamic loader for libraries loaded at runtime, which is complex especially in a multi-threaded program~\cite{Huang06}.
    254241However, changes to the dynamic code/data space are typically infrequent, many occurring at program startup, and are largely outside of a program's control.
    255242Stack memory is managed by the program call/return-mechanism using a LIFO technique, which works well for sequential programs.
    256243For stackful coroutines and user threads, a new stack is commonly created in the dynamic-allocation memory.
     244The dynamic-allocation memory is often a contiguous area (can be memory mapped as multiple areas), which starts empty and grows/shrinks as the program creates/deletes variables with independent lifetime.
     245The programming-language's runtime manages this area, where management complexity is a function of the mechanism for deleting variables.
    257246This work focuses solely on management of the dynamic-allocation memory.
    258247
    259248\begin{figure}
    260249\centering
    261 \input{AddressSpace}
     250\input{AddressSpace.pstex_t}
    262251\vspace{-5pt}
    263252\caption{Program Address Space Divided into Zones}
     
    269258\label{s:DynamicMemoryManagement}
    270259
    271 Modern programming languages manage dynamic-allocation memory in different ways.
     260Modern programming languages manage dynamic memory in different ways.
    272261Some languages, such as Lisp~\cite{CommonLisp}, Java~\cite{Java}, Haskell~\cite{Haskell}, Go~\cite{Go}, provide explicit allocation but \emph{implicit} deallocation of data through garbage collection~\cite{Wilson92}.
    273262In general, garbage collection supports memory compaction, where dynamic (live) data is moved during runtime to better utilize space.
    274 However, moving data requires finding pointers to it and updating them to reflect new data locations.
     263However, moving data requires finding and updating pointers to it to reflect the new data locations.
    275264Programming languages such as C~\cite{C}, \CC~\cite{C++}, and Rust~\cite{Rust} provide the programmer with explicit allocation \emph{and} deallocation of data.
    276265These languages cannot find and subsequently move live data because pointers can be created to any storage zone, including internal components of allocated objects, and may contain temporary invalid values generated by pointer arithmetic.
    277266Attempts have been made to perform quasi garbage collection in C/\CC~\cite{Boehm88}, but it is a compromise.
    278 This work only examines dynamic memory-management with \emph{explicit} deallocation.
     267This work only examines dynamic management with \emph{explicit} deallocation.
    279268While garbage collection and compaction are not part of this work, many of the results are applicable to the allocation phase in any memory-management approach.
    280269
    281 Most programs use a general-purpose allocator, often the one provided implicitly by the programming-language's runtime.
    282 When this allocator proves inadequate, programmers often write specialize allocators for specific needs.
    283 C and \CC allow easy replacement of the default memory allocator with an alternative specialized or general-purpose memory-allocator.
     270Most programs use a general-purpose allocator, usually the one provided by the programming-language's runtime.
     271In certain languages, programmers can write specialized allocators for specific needs.
     272C and \CC allow easy replacement of the default memory allocator through a standard API.
    284273Jikes RVM MMTk~\cite{MMTk} provides a similar generalization for the Java virtual machine.
    285 However, high-performance memory-allocators for kernel and user multi-threaded programs are still being designed and improved.
    286 For this reason, several alternative general-purpose allocators have been written for C/\CC with the goal of scaling in a multi-threaded program~\cite{Berger00,mtmalloc,streamflow,tcmalloc}.
     274As well, new languages support concurrency (kernel and/or user threading), which must be safely handled by the allocator.
     275Hence, several alternative allocators exist for C/\CC with the goal of scaling in a multi-threaded program~\cite{Berger00,mtmalloc,streamflow,tcmalloc}.
    287276This work examines the design of high-performance allocators for use by kernel and user multi-threaded applications written in C/\CC.
    288277
     
    294283\begin{enumerate}[leftmargin=*,itemsep=0pt]
    295284\item
    296 Implementation of a new stand-alone concurrent low-latency memory-allocator ($\approx$1,200 lines of code) for C/\CC programs using kernel threads (1:1 threading), and specialized versions of the allocator for the programming languages \uC~\cite{uC++} and \CFA~\cite{Moss18,Delisle21} using user-level threads running on multiple kernel threads (M:N threading).
    297 
    298 \item
    299 Extend the standard C heap functionality by preserving with each allocation: its request size plus the amount allocated, whether an allocation is zero fill and/or allocation alignment.
     285Implementation of a new stand-alone concurrent low-latency memory-allocator ($\approx$1,200 lines of code) for C/\CC programs using kernel threads (1:1 threading), and specialized versions for the concurrent languages \uC~\cite{uC++} and \CFA~\cite{Moss18,Delisle21} using user-level threads running on multiple kernel threads (M:N threading).
     286
     287\item
     288Extend the standard C heap functionality by preserving with each allocation its request size, the amount allocated, whether it is zero fill, and its alignment.
    300289
    301290\item
    302291Use the preserved zero fill and alignment as \emph{sticky} properties for @realloc@ to zero-fill and align when storage is extended or copied.
    303 Without this extension, it is unsafe to @realloc@ storage initially allocated with zero-fill/alignment as these properties are not preserved when copying.
    304 This silent generation of a problem is unintuitive to programmers and difficult to locate because it is transient.
    305 
    306 \item
    307 Provide additional heap operations to complete programmer expectation with respect to accessing different allocation properties.
    308 \begin{itemize}[topsep=3pt,itemsep=2pt,parsep=0pt]
     292Without this extension, it is unsafe to @realloc@ storage initially allocated with zero-fill/alignment, as these properties are not preserved when copying.
     293This silent problem is unintuitive to programmers and difficult to locate because it is transient.
     294
     295\item
     296Provide additional heap operations to make allocation properties orthogonally accessible.
     297\begin{itemize}[topsep=2pt,itemsep=2pt,parsep=0pt]
     298\item
     299@aalloc( dim, elemSize )@ same as @calloc@ except memory is \emph{not} zero filled.
     300\item
     301@amemalign( alignment, dim, elemSize )@ same as @aalloc@ with memory alignment.
     302\item
     303@cmemalign( alignment, dim, elemSize )@ same as @calloc@ with memory alignment.
    309304\item
    310305@resize( oaddr, size )@ re-purpose an old allocation for a new type \emph{without} preserving fill or alignment.
     
    313308\item
    314309@realloc( oaddr, alignment, size )@ same as @realloc@ but adding or changing alignment.
    315 \item
    316 @aalloc( dim, elemSize )@ same as @calloc@ except memory is \emph{not} zero filled.
    317 \item
    318 @amemalign( alignment, dim, elemSize )@ same as @aalloc@ with memory alignment.
    319 \item
    320 @cmemalign( alignment, dim, elemSize )@ same as @calloc@ with memory alignment.
    321310\end{itemize}
    322 
    323 \item
    324 Provide additional heap wrapper functions in \CFA creating a more usable set of allocation operations and properties.
    325311
    326312\item
     
    328314\begin{itemize}[topsep=3pt,itemsep=2pt,parsep=0pt]
    329315\item
    330 @malloc_alignment( addr )@ returns the alignment of the allocation pointed-to by @addr@.
     316@malloc_alignment( addr )@ returns the alignment of the allocation.
    331317If the allocation is not aligned or @addr@ is @NULL@, the minimal alignment is returned.
    332318\item
    333 @malloc_zero_fill( addr )@ returns a boolean result indicating if the memory pointed-to by @addr@ is allocated with zero fill, e.g., by @calloc@/@cmemalign@.
    334 \item
    335 @malloc_size( addr )@ returns the size of the memory allocation pointed-to by @addr@.
    336 \item
    337 @malloc_usable_size( addr )@ returns the usable (total) size of the memory pointed-to by @addr@, i.e., the bin size containing the allocation, where @malloc_size( addr )@ $\le$ @malloc_usable_size( addr )@.
     319@malloc_zero_fill( addr )@ returns a boolean result indicating if the memory is allocated with zero fill, e.g., by @calloc@/@cmemalign@.
     320\item
     321@malloc_size( addr )@ returns the size of the memory allocation.
     322\item
     323@malloc_usable_size( addr )@ returns the usable (total) size of the memory, i.e., the bin size containing the allocation, where @malloc_size( addr )@ $\le$ @malloc_usable_size( addr )@.
    338324\end{itemize}
    339325
    340326\item
    341 Provide complete, fast, and contention-free allocation statistics to help understand allocation behaviour:
     327Provide optional extensive, fast, and contention-free allocation statistics to understand allocation behaviour, accessed by:
    342328\begin{itemize}[topsep=3pt,itemsep=2pt,parsep=0pt]
    343329\item
    344 @malloc_stats()@ print memory-allocation statistics on the file-descriptor set by @malloc_stats_fd@.
    345 \item
    346 @malloc_info( options, stream )@ print memory-allocation statistics as an XML string on the specified file-descriptor set by @malloc_stats_fd@.
    347 \item
    348 @malloc_stats_fd( fd )@ set file-descriptor number for printing memory-allocation statistics (default @STDERR_FILENO@).
     330@malloc_stats()@ print memory-allocation statistics on the file-descriptor set by @malloc_stats_fd@ (default @stderr@).
     331\item
     332@malloc_info( options, stream )@ print memory-allocation statistics as an XML string on the specified file-descriptor set by @malloc_stats_fd@ (default @stderr@).
     333\item
     334@malloc_stats_fd( fd )@ set file-descriptor number for printing memory-allocation statistics (default @stderr@).
    349335This file descriptor is used implicitly by @malloc_stats@ and @malloc_info@.
    350336\end{itemize}
     
    355341\item
    356342Build 8 different versions of the allocator: static or dynamic linking, with or without statistics or debugging.
    357 A program may link to any of these 8 versions of the allocator often without recompilation.
     343A program may link to any of these 8 versions of the allocator often without recompilation (@LD_PRELOAD@).
     344
     345\item
     346Provide additional heap wrapper functions in \CFA creating a more usable set of allocation operations and properties.
    358347
    359348\item
     
    365354\section{Background}
    366355
    367 The following discussion is a quick overview of the moving-pieces that affect the design of a memory allocator and its performance.
    368 Dynamic acquires and releases obtain storage for a program variable, called an \newterm{object}, through calls such as @malloc@ and @free@ in C, and @new@ and @delete@ in \CC.
    369 Space for each allocated object comes from the dynamic-allocation zone.
    370 
     356The following is a quick overview of allocator design options that affect memory usage and performance (see~\cite{Zulfiqar22} for more details).
     357Dynamic acquires and releases obtain storage for a program variable, called an \newterm{object}, through calls such as @malloc@/@new@ and @free@/@delete@ in C/\CC.
    371358A \newterm{memory allocator} contains a complex data-structure and code that manages the layout of objects in the dynamic-allocation zone.
    372359The management goals are to make allocation/deallocation operations as fast as possible while densely packing objects to make efficient use of memory.
    373 Objects in C/\CC cannot be moved to aid the packing process, only adjacent free storage can be \newterm{coalesced} into larger free areas.
     360Since objects in C/\CC cannot be moved to aid the packing process, only adjacent free storage can be \newterm{coalesced} into larger free areas.
    374361The allocator grows or shrinks the dynamic-allocation zone to obtain storage for objects and reduce memory usage via operating-system calls, such as @mmap@ or @sbrk@ in UNIX.
    375362
     
    383370The \newterm{storage data} is composed of allocated and freed objects, and \newterm{reserved memory}.
    384371Allocated objects (light grey) are variable sized, and are allocated and maintained by the program;
    385 \ie only the program knows the location of allocated storage not the memory allocator.
    386 Freed objects (white) represent memory deallocated by the program, which are linked into one or more lists facilitating easy location of new allocations.
     372\ie only the program knows the location of allocated storage.
     373Freed objects (white) represent memory deallocated by the program, which are linked into one or more lists facilitating location of new allocations.
    387374Reserved memory (dark grey) is one or more blocks of memory obtained from the \newterm{operating system} (OS) but not yet allocated to the program;
    388375if there are multiple reserved blocks, they are also chained together.
     
    401388An object may be preceded by padding to ensure proper alignment.
    402389Some algorithms quantize allocation requests, resulting in additional space after an object less than the quantized value.
    403 % The buckets are often organized as an array of ascending bucket sizes for fast searching, \eg binary search, and the array is stored in the heap management-area, where each bucket is a top point to the freed objects of that size.
    404390When padding and spacing are necessary, neither can be used to satisfy a future allocation request while the current allocation exists.
    405391
     
    407393Often the free list is chained internally so it does not consume additional storage, \ie the link fields are placed at known locations in the unused memory blocks.
    408394For internal chaining, the amount of management data for a free node defines the minimum allocation size, \eg if 16 bytes are needed for a free-list node, allocation requests less than 16 bytes are rounded up.
     395Often the minimum storage alignment and free-node size are the same.
    409396The information in an allocated or freed object is overwritten when it transitions from allocated to freed and vice-versa by new program data and/or management information.
    410397
     
    420407\label{s:SingleThreadedMemoryAllocator}
    421408
    422 A single-threaded memory-allocator does not run any threads itself, but is used by a single-threaded program.
    423 Because the memory allocator is only executed by a single thread, concurrency issues do not exist.
    424 The primary issues in designing a single-threaded memory-allocator are fragmentation and locality.
    425 
     409In a sequential (single threaded) program, the program thread performs all allocation operations and concurrency issues do not exist.
     410However, interrupts logically introduce concurrency, if the signal handler performs allocation/deallocation (serially reusable problem~\cite{SeriallyReusable}).
     411In general, the primary issues in a single-threaded allocator are fragmentation and locality.
    426412
    427413\subsubsection{Fragmentation}
    428414\label{s:Fragmentation}
    429415
    430 Fragmentation is memory requested from the OS but not used by the program;
    431 hence, allocated objects are not fragmentation.
    432 Figure~\ref{f:InternalExternalFragmentation} shows fragmentation is divided into two forms: internal or external.
     416Fragmentation is memory requested from the OS but not used by allocated objects in the program.
     417Figure~\ref{f:InternalExternalFragmentation} shows fragmentation is divided into two forms: \emph{internal} or \emph{external}.
    433418
    434419\begin{figure}
     
    439424\end{figure}
    440425
    441 \newterm{Internal fragmentation} is memory space that is allocated to the program, but is not intended to be accessed by the program, such as headers, trailers, padding, and spacing around an allocated object.
    442 Internal fragmentation is problematic when management space is a significant proportion of an allocated object, \eg for small objects ($<$16 bytes), memory usage is doubled.
    443 An allocator should strive to keep internal management information to a minimum.
    444 
    445 \newterm{External fragmentation} is all memory space reserved from the OS but not allocated to the program~\cite{Wilson95,Lim98,Siebert00}, which includes all external management data, freed objects, and reserved memory.
     426\newterm{Internal fragmentation} is inaccessible allocated memory, such as headers, trailers, padding, and spacing around an allocated object.
     427Internal fragmentation is problematic when management space becomes a significant proportion of an allocated object, \eg for objects $<$16 bytes, memory usage doubles.
     428An allocator strives to keep internal management information to a minimum.
     429
     430\newterm{External fragmentation} is memory not allocated in the program~\cite{Wilson95,Lim98,Siebert00}, which includes all external management data, freed objects, and reserved memory.
    446431This memory is problematic in two ways: heap blowup and highly fragmented memory.
    447432\newterm{Heap blowup} occurs when freed memory cannot be reused for future allocations leading to potentially unbounded external fragmentation growth~\cite{Berger00}.
    448 Memory can become \newterm{highly fragmented} after multiple allocations and deallocations of objects, resulting in a checkerboard of adjacent allocated and free areas, where the free blocks have become to small to service requests.
     433Memory can become \newterm{highly fragmented} after multiple allocations and deallocations of objects, resulting in a checkerboard of adjacent allocated and free areas, where the free blocks are too small to service requests.
    449434% Figure~\ref{f:MemoryFragmentation} shows an example of how a small block of memory fragments as objects are allocated and deallocated over time.
    450 Heap blowup can occur due to allocator policies that are too restrictive in reusing freed memory (the allocated size cannot use a larger free block) and/or no coalescing of free storage.
     435Heap blowup occurs with allocator policies that are too restrictive in reusing freed memory, \eg the allocated size cannot use a larger free block and/or no coalescing of free storage.
    451436% Blocks of free memory become smaller and non-contiguous making them less useful in serving allocation requests.
    452437% Memory is highly fragmented when most free blocks are unusable because of their sizes.
     
    479464
    480465The second approach is a \newterm{segregated} or \newterm{binning algorithm} with a set of lists for different sized freed objects.
    481 When an object is allocated, the requested size is rounded up to the nearest bin-size, often leading to spacing after the object.
     466When an object is allocated, the requested size is rounded up to the nearest bin-size, often leading to space after the object.
    482467A binning algorithm is fast at finding free memory of the appropriate size and allocating it, since the first free object on the free list is used.
    483 The fewer bin sizes, the fewer lists need to be searched and maintained;
    484 however, unusable space after object increases, leading to more internal fragmentation.
    485 The more bin sizes, the longer the search and the less likely a matching free objects is found, leading to more external fragmentation and potentially heap blowup.
    486 A variation of the binning algorithm allows objects to be allocated from larger bin sizes when the matching bins is empty, and the freed object can be returned to the matching or larger bin (some advantages to either scheme).
     468Fewer bin sizes means a faster search to find a matching bin, but larger differences between allocation and bin size, which increases unusable space after objects (internal fragmentation).
     469More bin sizes means a slower search but smaller differences between allocation and bin size, resulting in less internal fragmentation but more external fragmentation if larger bins cannot service smaller requests.
     470Allowing larger bins to service smaller allocations when the matching bin is empty means the freed object can be returned to the matching or larger bin (some advantages to either scheme).
    487471% For example, with bin sizes of 8 and 16 bytes, a request for 12 bytes allocates only 12 bytes, but when the object is freed, it is placed on the 8-byte bin-list.
    488472% For subsequent requests, the bin free-lists contain objects of different sizes, ranging from one bin-size to the next (8-16 in this example), and a sequential-fit algorithm may be used to find an object large enough for the requested size on the associated bin list.
    489473
    490 The third approach is \newterm{splitting} and \newterm{coalescing algorithms}.
    491 When an object is allocated, if there are no free objects of the requested size, a larger free object is split into two smaller objects to satisfy the allocation request rather than obtaining more memory from the OS.
    492 For example, in the \newterm{buddy system}, a block of free memory is split into equal chunks, one of those chunks is again split, and so on until a minimal block is created that fits the requested object.
    493 When an object is deallocated, it is coalesced with the objects immediately before and after it in memory, if they are free, turning them into one larger block.
     474The third approach is a \newterm{splitting} and \newterm{coalescing} algorithm.
     475When an object is allocated, if there is no matching free storage, a larger free object is split into two smaller objects, one matching the allocation size.
     476For example, in the \newterm{buddy system}, a block of free memory is split into equal chunks, and splitting continues until a minimal block is created that fits the allocation.
     477When an object is deallocated, it is coalesced with the objects immediately before/after it in memory, if they are free, turning them into a larger block.
    494478Coalescing can be done eagerly at each deallocation or lazily when an allocation cannot be fulfilled.
    495 In all cases, coalescing increases allocation latency, hence some allocations can cause unbounded delays.
     479However, coalescing increases allocation latency (unbounded delays), both for allocation and deallocation.
    496480While coalescing does not reduce external fragmentation, the coalesced blocks improve fragmentation quality so future allocations are less likely to cause heap blowup.
    497481% Splitting and coalescing can be used with other algorithms to avoid highly fragmented memory.
     
    504488% Temporal clustering implies a group of objects are accessed repeatedly within a short time period, while spatial clustering implies a group of objects physically close together (nearby addresses) are accessed repeatedly within a short time period.
    505489% Temporal locality commonly occurs during an iterative computation with a fixed set of disjoint variables, while spatial locality commonly occurs when traversing an array.
    506 Hardware takes advantage of the working set through multiple levels of caching, \ie memory hierarchy.
     490Hardware takes advantage of the working set through multiple levels of caching and paging, \ie memory hierarchy.
    507491% When an object is accessed, the memory physically located around the object is also cached with the expectation that the current and nearby objects will be referenced within a short period of time.
    508492For example, entire cache lines are transferred between cache and memory, and entire virtual-memory pages are transferred between memory and disk.
    509493% A program exhibiting good locality has better performance due to fewer cache misses and page faults\footnote{With the advent of large RAM memory, paging is becoming less of an issue in modern programming.}.
    510494
    511 Temporal locality is largely controlled by how a program accesses its variables~\cite{Feng05}.
    512 Nevertheless, a memory allocator can have some indirect influence on temporal locality and largely dictates spatial locality.
    513 For temporal locality, an allocator can return storage for new allocations that was just freed as these memory locations are still \emph{warm} in the memory hierarchy.
    514 For spatial locality, an allocator can place objects used together close together in memory, so the working set of the program fits into the fewest possible cache lines and pages.
     495Temporal locality is largely controlled by program accesses to its variables~\cite{Feng05}.
     496An allocator has only indirect influence on temporal locality but largely dictates spatial locality.
     497For temporal locality, an allocator tries to return recently freed storage for new allocations, as this memory is still \emph{warm} in the memory hierarchy.
     498For spatial locality, an allocator places objects used together close together in memory, so the working set of the program fits into the fewest possible cache lines and pages.
    515499% However, usage patterns are different for every program as is the underlying hardware memory architecture;
    516500% hence, no general-purpose memory-allocator can provide ideal locality for every program on every computer.
    517501
    518 There are a number of ways a memory allocator can degrade locality by increasing the working set.
    519 For example, a memory allocator may access multiple free objects before finding one to satisfy an allocation request, \eg sequential-fit algorithm, which can perturb the program's memory hierarchy causing multiple cache or page misses~\cite{Grunwald93}.
    520 Another way locality can be degraded is by spatially separating related data.
    521 For example, in a binning allocator, objects of different sizes are allocated from different bins that may be located in different pages of memory.
     502An allocator can easily degrade locality by increasing the working set.
     503An allocator can access an unbounded number of free objects when matching an allocation or coalescing, causing multiple cache or page misses~\cite{Grunwald93}.
     504An allocator can spatially separate related data by binning free storage anywhere in memory, so the related objects are highly separated.
    522505
    523506
     
    525508\label{s:MultiThreadedMemoryAllocator}
    526509
    527 A multi-threaded memory-allocator does not run any threads itself, but is used by a multi-threaded program.
    528 In addition to single-threaded design issues of fragmentation and locality, a multi-threaded allocator is simultaneously accessed by multiple threads, and hence, must deal with concurrency issues such as mutual exclusion, false sharing, and additional forms of heap blowup.
     510In a concurrent (multi-threaded) program, multiple program threads perform allocation operations and all concurrency issues arise.
     511Along with fragmentation and locality issues, a multi-threaded allocator must deal with mutual exclusion, false sharing, and additional forms of heap blowup.
    529512
    530513
     
    534517\newterm{Mutual exclusion} provides sequential access to the shared-management data of the heap.
    535518There are two performance issues for mutual exclusion.
    536 First is the overhead necessary to perform (at least) a hardware atomic operation every time a shared resource is accessed.
    537 Second is when multiple threads contend for a shared resource simultaneously, and hence, some threads must wait until the resource is released.
     519First is the cost of performing at least one hardware atomic operation every time a shared resource is accessed.
     520Second is \emph{contention} on simultaneous access, so some threads must wait until the resource is released.
    538521Contention can be reduced in a number of ways:
    539 1) Using multiple fine-grained locks versus a single lock to spread the contention across a number of locks.
    540 2) Using trylock and generating new storage if the lock is busy, yielding a classic space versus time tradeoff.
     5221) Using multiple fine-grained locks versus a single lock to spread the contention across the locks.
     5232) Using trylock and generating new storage if the lock is busy (classic space versus time tradeoff).
    5415243) Using one of the many lock-free approaches for reducing contention on basic data-structure operations~\cite{Oyama99}.
    542 However, all of these approaches have degenerate cases where program contention is high, which occurs outside of the allocator.
     525However, all approaches have degenerate cases where program contention to the heap is high, which is beyond the allocator's control.
    543526
    544527
     
    546529\label{s:FalseSharing}
    547530
    548 False sharing is a dynamic phenomenon leading to cache thrashing.
    549 When two or more threads on separate CPUs simultaneously change different objects sharing a cache line, the change invalidates the other thread's associated cache, even though these threads may be uninterested in the other modified object.
    550 False sharing can occur in three different ways: program induced, allocator-induced active, and allocator-induced passive;
    551 a memory allocator can only affect the latter two.
    552 
    553 Specifically, assume two objects, O$_1$ and O$_2$, share a cache line, with threads, T$_1$ and T$_2$.
    554 \newterm{Program-induced false-sharing} occurs when T$_1$ passes a reference to O$_2$ to T$_2$, and then T$_1$ modifies O$_1$ while T$_2$ modifies O$_2$.
    555 % Figure~\ref{f:ProgramInducedFalseSharing} shows when Thread$_1$ passes Object$_2$ to Thread$_2$, a false-sharing situation forms when Thread$_1$ modifies Object$_1$ and Thread$_2$ modifies Object$_2$.
    556 % Changes to Object$_1$ invalidate CPU$_2$'s cache line, and changes to Object$_2$ invalidate CPU$_1$'s cache line.
    557 % \begin{figure}
    558 % \centering
    559 % \subfloat[Program-Induced False-Sharing]{
    560 %       \input{ProgramFalseSharing}
    561 %       \label{f:ProgramInducedFalseSharing}
    562 % } \\
    563 % \vspace{5pt}
    564 % \subfloat[Allocator-Induced Active False-Sharing]{
    565 %       \input{AllocInducedActiveFalseSharing}
    566 %       \label{f:AllocatorInducedActiveFalseSharing}
    567 % } \\
    568 % \vspace{5pt}
    569 % \subfloat[Allocator-Induced Passive False-Sharing]{
    570 %       \input{AllocInducedPassiveFalseSharing}
    571 %       \label{f:AllocatorInducedPassiveFalseSharing}
    572 % } subfloat
    573 % \caption{False Sharing}
    574 % \label{f:FalseSharing}
    575 % \end{figure}
    576 \newterm{Allocator-induced active false-sharing}\label{s:AllocatorInducedActiveFalseSharing} occurs when O$_1$ and O$_2$ are heap allocated and their references are passed to T$_1$ and T$_2$, which modify the objects.
    577 % For example, in Figure~\ref{f:AllocatorInducedActiveFalseSharing}, each thread allocates an object and loads a cache-line of memory into its associated cache.
    578 % Again, changes to Object$_1$ invalidate CPU$_2$'s cache line, and changes to Object$_2$ invalidate CPU$_1$'s cache line.
    579 \newterm{Allocator-induced passive false-sharing}\label{s:AllocatorInducedPassiveFalseSharing} occurs
    580 % is another form of allocator-induced false-sharing caused by program-induced false-sharing.
    581 % When an object in a program-induced false-sharing situation is deallocated, a future allocation of that object may cause passive false-sharing.
    582 when T$_1$ passes O$_2$ to T$_2$, and T$_2$ subsequently deallocates O$_2$, and then O$_2$ is reallocated to T$_2$ while T$_1$ is still using O$_1$.
     531False sharing occurs when two or more threads simultaneously modify different objects sharing a cache line.
     532Changes now invalidate each thread's cache, even though the threads may be uninterested in the other modified object.
     533False sharing can occur three ways:
     5341) Thread T$_1$ allocates objects O$_1$ and O$_2$ on the same cache line and passes O$_2$'s reference to thread T$_2$;
     535both threads now simultaneously modify the objects on the same cache line.
     5362) Objects O$_1$ and O$_2$ are allocated on the same cache line by thread T$_3$ and their references are passed to T$_1$ and T$_2$, which simultaneously modify the objects.
     5373) T$_2$ deallocates O$_2$, T$_1$ allocates O$_1$ on the same cache line as O$_2$, and T$_2$ reallocates O$_2$ while T$_1$ is using O$_1$.
     538In all three cases, the allocator performs a hidden and possibly transient (non-deterministic) operation, making it extremely difficult to find and fix the issue.
    583539
    584540
     
    586542\label{s:HeapBlowup}
    587543
    588 In a multi-threaded program, heap blowup can occur when memory freed by one thread is inaccessible to other threads due to the allocation strategy.
     544In a multi-threaded program, heap blowup occurs when memory freed by one thread is inaccessible to other threads due to the allocation strategy.
    589545Specific examples are presented in later subsections.
    590546
    591547
    592 \subsection{Multi-Threaded Memory-Allocator Features}
    593 \label{s:MultiThreadedMemoryAllocatorFeatures}
    594 
    595 The following features are used in the construction of multi-threaded memory-allocators: multiple heaps, user-level threading, ownership, object containers, allocation buffer, lock-free operations.
    596 The first feature, multiple heaps, pertains to different kinds of heaps.
    597 The second feature, object containers, pertains to the organization of objects within the storage area.
    598 The remaining features apply to different parts of the allocator design or implementation.
    599 
     548\subsection{Multi-Threaded Allocator Features}
     549\label{s:MultiThreadedAllocatorFeatures}
     550
     551The following features are used in the construction of multi-threaded allocators.
    600552
    601553\subsubsection{Multiple Heaps}
    602554\label{s:MultipleHeaps}
    603555
    604 A multi-threaded allocator has potentially multiple threads and heaps.
    605 The multiple threads cause complexity, and multiple heaps are a mechanism for dealing with the complexity.
    606 The spectrum ranges from multiple threads using a single heap, denoted as T:1, to multiple threads sharing multiple heaps, denoted as T:H, to one thread per heap, denoted as 1:1, which is almost back to a single-threaded allocator.
     556Figure~\ref{f:ThreadHeapRelationship} shows how a multi-threaded allocator can subdivide a single global heap into multiple heaps to reduce contention among threads.
    607557
    608558\begin{figure}
     
    626576} % subfloat
    627577\caption{Multiple Heaps, Thread:Heap Relationship}
    628 \end{figure}
    629 
    630 \paragraph{T:1 model (see Figure~\ref{f:SingleHeap})} where all threads allocate and deallocate objects from one heap.
     578\label{f:ThreadHeapRelationship}
     579\end{figure}
     580
     581\begin{description}[leftmargin=*]
     582\item[T:1 model (Figure~\ref{f:SingleHeap})] has all threads allocating and deallocating objects from one heap.
    631583Memory is obtained from the freed objects, or reserved memory in the heap, or from the OS;
    632584the heap may also return freed memory to the OS.
    633 The arrows indicate the direction memory conceptually moves for each kind of operation: allocation moves memory along the path from the heap/operating-system to the user application, while deallocation moves memory along the path from the application back to the heap/operating-system.
     585The arrows indicate the direction memory moves for each allocation/deallocation operation.
    634586To safely handle concurrency, a single lock may be used for all heap operations or fine-grained locking for different operations.
    635 Regardless, a single heap may be a significant source of contention for programs with a large amount of memory allocation.
    636 
    637 \paragraph{T:H model (see Figure~\ref{f:SharedHeaps})} where each thread allocates storage from several heaps depending on certain criteria, with the goal of reducing contention by spreading allocations/deallocations across the heaps.
    638 The decision on when to create a new heap and which heap a thread allocates from depends on the allocator design.
    639 To determine which heap to access, each thread must point to its associated heap in some way.
    640 The performance goal is to reduce the ratio of heaps to threads.
    641 However, the worse case can result in more heaps than threads, \eg if the number of threads is large at startup with many allocations creating a large number of heaps and then the number of threads reduces.
    642 Locking is required, since more than one thread may concurrently access a heap during its lifetime, but contention is reduced because fewer threads access a specific heap.
     587Regardless, a single heap is a significant source of contention for threaded programs with a large number of memory allocations.
     588
     589\item[T:H model (Figure~\ref{f:SharedHeaps})] subdivides the heap independently from the threads.
     590The decision to create a heap and which heap a thread allocates/deallocates during its lifetime depends on the allocator design.
     591Locking is required within each heap because of multiple thread access, but contention is reduced because fewer threads access a specific heap.
     592The goal is to have minimal heaps (storage) and thread contention per heap (time).
     593However, the worst case results in more heaps than threads, \eg if the number of threads is large at startup creating a large number of heaps and then the number of threads reduces.
    643594
    644595% For example, multiple heaps are managed in a pool, starting with a single or a fixed number of heaps that increase\-/decrease depending on contention\-/space issues.
     
    693644In general, the cost is minimal since the majority of memory operations are completed without the use of the global heap.
    694645
    695 \paragraph{1:1 model (see Figure~\ref{f:PerThreadHeap})} where each thread has its own heap eliminating most contention and locking because threads seldom access another thread's heap (see Section~\ref{s:Ownership}).
     646\item[1:1 model (Figure~\ref{f:PerThreadHeap})] has each thread with its own heap, eliminating most contention and locking because threads seldom access another thread's heap (see Section~\ref{s:Ownership}).
    696647An additional benefit of thread heaps is improved locality due to better memory layout.
    697648As each thread only allocates from its heap, all objects are consolidated in the storage area for that heap, better utilizing each CPU's cache and accessing fewer pages.
    698649In contrast, the T:H model spreads each thread's objects over a larger area in different heaps.
    699 Thread heaps can also eliminate allocator-induced active false-sharing, if memory is acquired so it does not overlap at crucial boundaries with memory for another thread's heap.
     650Thread heaps can also reduce false-sharing, except at crucial boundaries overlapping memory from another thread's heap.
    700651For example, assume page boundaries coincide with cache line boundaries, if a thread heap always acquires pages of memory then no two threads share a page or cache line unless pointers are passed among them.
    701652% Hence, allocator-induced active false-sharing cannot occur because the memory for thread heaps never overlaps.
     
    706657Destroying the thread heap immediately may reduce external fragmentation sooner, since all free objects are freed to the global heap and may be reused by other threads.
    707658Alternatively, reusing thread heaps may improve performance if the inheriting thread makes similar allocation requests as the thread that previously held the thread heap because any unfreed storage is immediately accessible.
     659\end{description}
    708660
    709661
  • doc/papers/llheap/figures/AddressSpace.fig

    r647d633 r7a29392f  
    991200 2
    10102 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
    11          5700 1350 6600 1350 6600 2100 5700 2100 5700 1350
    12 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
    13          1200 1350 2100 1350 2100 2100 1200 2100 1200 1350
    14 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
    15          4800 1350 5700 1350 5700 2100 4800 2100 4800 1350
     11         1200 1200 2100 1200 2100 1800 1200 1800 1200 1200
     122 2 0 1 0 7 60 -1 17 0.000 0 0 -1 0 0 5
     13         2100 1200 3000 1200 3000 1800 2100 1800 2100 1200
    16142 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
    1715        1 1 1.00 45.00 90.00
    18          2100 1725 2400 1725
     16         2100 1500 2400 1500
    19172 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
    2018        1 1 1.00 45.00 90.00
    21          3000 1725 2700 1725
     19         3000 1500 2700 1500
     202 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
     21         3000 1200 3900 1200 3900 1800 3000 1800 3000 1200
     222 2 0 1 0 7 60 -1 17 0.000 0 0 -1 0 0 5
     23         3900 1200 4800 1200 4800 1800 3900 1800 3900 1200
    22242 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
    2325        1 1 1.00 45.00 90.00
    24          3900 1725 4200 1725
     26         3900 1500 4200 1500
    25272 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
    2628        1 1 1.00 45.00 90.00
    27          4800 1725 4500 1725
    28 2 2 0 1 0 7 60 -1 17 0.000 0 0 -1 0 0 5
    29          2100 1350 3000 1350 3000 2100 2100 2100 2100 1350
     29         4800 1500 4500 1500
    30302 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
    31          3000 1350 3900 1350 3900 2100 3000 2100 3000 1350
    32 2 2 0 1 0 7 60 -1 17 0.000 0 0 -1 0 0 5
    33          3900 1350 4800 1350 4800 2100 3900 2100 3900 1350
    34 4 0 0 50 -1 0 10 0.0000 2 180 900 1200 2325 high address\001
    35 4 2 0 50 -1 0 10 0.0000 2 135 855 6600 2325 low address\001
    36 4 1 0 50 -1 0 10 0.0000 2 120 330 6150 2025 Data\001
    37 4 1 0 50 -1 0 10 0.0000 2 135 675 6150 1800 Code and\001
    38 4 1 0 50 -1 0 10 0.0000 2 120 390 6150 1575 Static\001
    39 4 1 0 50 -1 0 10 0.0000 2 135 390 1650 1800 Stack\001
    40 4 1 0 50 -1 0 10 0.0000 2 165 615 2550 1950 Memory\001
    41 4 1 0 50 -1 0 10 0.0000 2 165 615 4350 1950 Memory\001
    42 4 1 0 50 -1 0 10 0.0000 2 120 315 2550 1650 Free\001
    43 4 1 0 50 -1 0 10 0.0000 2 120 330 3450 2025 Data\001
    44 4 1 0 50 -1 0 10 0.0000 2 135 675 3450 1800 Code and\001
    45 4 1 0 50 -1 0 10 0.0000 2 165 645 3450 1575 Dynamic\001
    46 4 1 0 50 -1 0 10 0.0000 2 120 315 4350 1650 Free\001
    47 4 1 0 50 -1 0 10 0.0000 2 120 735 5250 1950 Allocation\001
    48 4 1 0 50 -1 0 10 0.0000 2 165 645 5250 1650 Dynamic\001
     31         4800 1200 5700 1200 5700 1800 4800 1800 4800 1200
     322 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
     33         5700 1200 6600 1200 6600 1800 5700 1800 5700 1200
     344 0 0 50 -1 0 10 0.0000 2 165 870 1200 2025 high address\001
     354 2 0 50 -1 0 10 0.0000 2 120 810 6600 2025 low address\001
     364 1 0 50 -1 0 10 0.0000 2 120 375 1650 1575 Stack\001
     374 1 0 50 -1 0 10 0.0000 2 150 600 2550 1725 Memory\001
     384 1 0 50 -1 0 10 0.0000 2 120 300 2550 1425 Free\001
     394 1 0 50 -1 0 10 0.0000 2 120 660 3450 1575 Code and\001
     404 1 0 50 -1 0 10 0.0000 2 150 630 3450 1350 Dynamic\001
     414 1 0 50 -1 0 10 0.0000 2 120 315 3450 1775 Data\001
     424 1 0 50 -1 0 10 0.0000 2 120 300 4350 1425 Free\001
     434 1 0 50 -1 0 10 0.0000 2 150 600 4350 1725 Memory\001
     444 1 4 50 -1 0 10 0.0000 2 150 630 5250 1425 Dynamic\001
     454 1 0 50 -1 0 10 0.0000 2 120 315 6150 1775 Data\001
     464 1 0 50 -1 0 10 0.0000 2 120 660 6150 1575 Code and\001
     474 1 0 50 -1 0 10 0.0000 2 120 375 6150 1350 Static\001
     484 1 4 50 -1 0 10 0.0000 2 120 720 5250 1725 Allocation\001
Note: See TracChangeset for help on using the changeset viewer.