source: doc/theses/mubeen_zulfiqar_MMath/performance.tex @ 4b2ea0d

ADTast-experimentalpthread-emulationqualifiedEnum
Last change on this file since 4b2ea0d was 4b2ea0d, checked in by m3zulfiq <m3zulfiq@…>, 2 years ago

added analysis for benchmark results

  • Property mode set to 100644
File size: 30.1 KB
Line 
1\chapter{Performance}
2\label{c:Performance}
3
4This chapter uses the micro-benchmarks from \VRef[Chapter]{s:Benchmarks} to test a number of current memory allocators, including llheap.
5The goal is to see if llheap is competitive with the current best memory allocators.
6
7
8\section{Machine Specification}
9
10The performance experiments were run on two different multi-core architectures (x64 and ARM) to determine if there is consistency across platforms:
11\begin{itemize}
12\item
13\textbf{Nasus} AMD EPYC 7662, 64-core socket $\times$ 2, 2.0 GHz, GCC version 9.3.0
14\item
15\textbf{Algol} Huawei ARM TaiShan 2280 V2 Kunpeng 920, 24-core socket $\times$ 4, 2.6 GHz, GCC version 9.4.0
16\end{itemize}
17
18
19\section{Existing Memory Allocators}
20\label{sec:curAllocatorSec}
21
22With dynamic allocation being an important feature of C, there are many stand-alone memory allocators that have been designed for different purposes.
23For this thesis, 7 of the most popular and widely used memory allocators were selected for comparison, along with llheap.
24
25\paragraph{llheap (\textsf{llh})}
26is the thread-safe allocator from \VRef[Chapter]{c:Allocator}.
27\\
28\textbf{Version:} 1.0\\
29\textbf{Configuration:} Compiled with dynamic linking, but without statistics or debugging.\\
30\textbf{Compilation command:} @make@
31
32\paragraph{glibc (\textsf{glc})}
33\cite{glibc} is the default gcc thread-safe allocator.
34\\
35\textbf{Version:} Ubuntu GLIBC 2.31-0ubuntu9.7 2.31\\
36\textbf{Configuration:} Compiled by Ubuntu 20.04.\\
37\textbf{Compilation command:} N/A
38
39\paragraph{dlmalloc (\textsf{dl})}
40\cite{dlmalloc} is a thread-safe allocator that is single threaded and single heap.
41It maintains free-lists of different sizes to store freed dynamic memory.
42\\
43\textbf{Version:} 2.8.6\\
44\textbf{Configuration:} Compiled with preprocessor @USE_LOCKS@.\\
45\textbf{Compilation command:} @gcc -g3 -O3 -Wall -Wextra -fno-builtin-malloc -fno-builtin-calloc@ @-fno-builtin-realloc -fno-builtin-free -fPIC -shared -DUSE_LOCKS -o libdlmalloc.so malloc-2.8.6.c@
46
47\paragraph{hoard (\textsf{hrd})}
48\cite{hoard} is a thread-safe allocator that is multi-threaded and uses a heap-layer framework. It has per-thread heaps that have thread-local free-lists, and a global shared heap.
49\\
50\textbf{Version:} 3.13\\
51\textbf{Configuration:} Compiled with hoard's default configurations and @Makefile@.\\
52\textbf{Compilation command:} @make all@
53
54\paragraph{jemalloc (\textsf{je})}
55\cite{jemalloc} is a thread-safe allocator that uses multiple arenas. Each thread is assigned an arena.
56Each arena has chunks that contain contiguous memory regions of the same size. An arena has multiple chunks that contain regions of multiple sizes.
57\\
58\textbf{Version:} 5.2.1\\
59\textbf{Configuration:} Compiled with jemalloc's default configurations and @Makefile@.\\
60\textbf{Compilation command:} @autogen.sh; configure; make; make install@
61
62\paragraph{ptmalloc3 (\textsf{pt3})}
63\cite{ptmalloc3} is a modification of dlmalloc.
64It is a thread-safe multi-threaded memory allocator that uses multiple heaps.
65A ptmalloc3 heap has a similar design to dlmalloc's heap.
66\\
67\textbf{Version:} 1.8\\
68\textbf{Configuration:} Compiled with ptmalloc3's @Makefile@ using option ``linux-shared''.\\
69\textbf{Compilation command:} @make linux-shared@
70
71\paragraph{rpmalloc (\textsf{rp})}
72\cite{rpmalloc} is a thread-safe allocator that is multi-threaded and uses a per-thread heap.
73Each heap has multiple size-classes and each size-class contains memory regions of the relevant size.
74\\
75\textbf{Version:} 1.4.1\\
76\textbf{Configuration:} Compiled with rpmalloc's default configurations and ninja build system.\\
77\textbf{Compilation command:} @python3 configure.py; ninja@
78
79\paragraph{tbb malloc (\textsf{tbb})}
80\cite{tbbmalloc} is a thread-safe allocator that is multi-threaded and uses a private heap for each thread.
81Each private-heap has multiple bins of different sizes. Each bin contains free regions of the same size.
82\\
83\textbf{Version:} intel tbb 2020 update 2, tbb\_interface\_version == 11102\\
84\textbf{Configuration:} Compiled with tbbmalloc's default configurations and @Makefile@.\\
85\textbf{Compilation command:} @make@
86
87% \section{Experiment Environment}
88% We used our micro benchmark suite (FIX ME: cite mbench) to evaluate these memory allocators \ref{sec:curAllocatorSec} and our own memory allocator uHeap \ref{sec:allocatorSec}.
89
90\section{Experiments}
91
92Each micro-benchmark is configured and run with each of the allocators.
93The less time an allocator takes to complete a benchmark the better, so lower in the graphs is better.
94
95%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
96%% CHURN
97%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
98
99\subsection{Churn Micro-Benchmark}
100
101Churn tests allocators for speed under intensive dynamic memory usage (see \VRef{s:ChurnBenchmark}).
102This experiment was run with the following configurations:
103\begin{description}[itemsep=0pt,parsep=0pt]
104\item[thread:]
1051, 2, 4, 8, 16
106\item[spots:]
10716
108\item[obj:]
109100,000
110\item[max:]
111500
112\item[min:]
11350
114\item[step:]
11550
116\item[distro:]
117fisher
118\end{description}
119
120% -maxS          : 500
121% -minS          : 50
122% -stepS                 : 50
123% -distroS       : fisher
124% -objN          : 100000
125% -cSpots                : 16
126% -threadN       : 1, 2, 4, 8, 16
127
128\VRef[Figure]{fig:churn} shows the results for algol and nasus.
129The X-axis shows the number of threads;
130the Y-axis shows the total experiment time.
131Each allocator's performance for each thread is shown in different colors.
132
133\begin{figure}
134\centering
135    \subfigure[Algol]{ \includegraphics[width=0.95\textwidth]{evaluations/algol-perf-eps/churn} }
136    \subfigure[Nasus]{ \includegraphics[width=0.95\textwidth]{evaluations/nasus-perf-eps/churn} }
137\caption{Churn}
138\label{fig:churn}
139\end{figure}
140
141All allocators did well in this micro-benchmark, except for \textsf{dl} on the ARM.
142\textsf{dl}'s performance decreases and the difference with the other allocators increases as the number of worker threads increases.
143\textsf{je} was the fastest, although there is not much difference between \textsf{je} and the rest of the allocators.
144
145llheap is slightly slower because it uses ownership, where many of the allocations have remote frees, which requires locking.
146When llheap is compiled without ownership, its performance is the same as the other allocators (not shown).
147
148%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
149%% THRASH
150%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
151
152\subsection{Cache Thrash}
153\label{sec:cache-thrash-perf}
154
155Thrash tests memory allocators for active false sharing (see \VRef{sec:benchThrashSec}).
156This experiment was run with the following configurations:
157\begin{description}[itemsep=0pt,parsep=0pt]
158\item[threads:]
1591, 2, 4, 8, 16
160\item[iterations:]
1611,000
162\item[cacheRW:]
1631,000,000
164\item[size:]
1651
166\end{description}
167
168% * Each allocator was tested for its performance across different number of threads.
169% Experiment was repeated for each allocator for 1, 2, 4, 8, and 16 threads by setting the configuration -threadN.
170
171\VRef[Figure]{fig:cacheThrash} shows the results for algol and nasus.
172The X-axis shows the number of threads;
173the Y-axis shows the total experiment time.
174Each allocator's performance for each thread is shown in different colors.
175
176\begin{figure}
177\centering
178    \subfigure[Algol]{ \includegraphics[width=0.95\textwidth]{evaluations/algol-perf-eps/cache_thrash_0-thrash} }
179    \subfigure[Nasus]{ \includegraphics[width=0.95\textwidth]{evaluations/nasus-perf-eps/cache_thrash_0-thrash} }
180\caption{Cache Thrash}
181\label{fig:cacheThrash}
182\end{figure}
183
184All allocators did well in this micro-benchmark, except for \textsf{dl} and \textsf{pt3}.
185\textsf{dl} uses a single heap for all threads, so it is understandable that it generates so much active false-sharing.
186Requests from different threads are handled sequentially by a single heap using locks, which can allocate objects for different threads on the same cache line.
187\textsf{pt3} uses multiple heaps, but it does not have exactly one heap per thread.
188So, it is possible that multiple threads using one heap can get objects allocated on the same cache line, which might be causing active false-sharing.
189The rest of the memory allocators generate little or no active false-sharing.
190
191%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
192%% SCRATCH
193%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
194
195\subsection{Cache Scratch}
196
197Scratch tests memory allocators for program-induced allocator-preserved passive false-sharing (see \VRef{s:CacheScratch}).
198This experiment was run with the following configurations:
199\begin{description}[itemsep=0pt,parsep=0pt]
200\item[threads:]
2011, 2, 4, 8, 16
202\item[iterations:]
2031,000
204\item[cacheRW:]
2051,000,000
206\item[size:]
2071
208\end{description}
209
210% * Each allocator was tested for its performance across different number of threads.
211% Experiment was repeated for each allocator for 1, 2, 4, 8, and 16 threads by setting the configuration -threadN.
212
213\VRef[Figure]{fig:cacheScratch} shows the results for algol and nasus.
214The X-axis shows the number of threads;
215the Y-axis shows the total experiment time.
216Each allocator's performance for each thread is shown in different colors.
217
218\begin{figure}
219\centering
220    \subfigure[Algol]{ \includegraphics[width=0.95\textwidth]{evaluations/algol-perf-eps/cache_scratch_0-scratch} }
221    \subfigure[Nasus]{ \includegraphics[width=0.95\textwidth]{evaluations/nasus-perf-eps/cache_scratch_0-scratch} }
222\caption{Cache Scratch}
223\label{fig:cacheScratch}
224\end{figure}
225
226This micro-benchmark divided the allocators into two groups.
227The first is the group of best performers: \textsf{llh}, \textsf{je}, and \textsf{rp}.
228These memory allocators generate little or no passive false-sharing and their performance difference is negligible.
229The second is the group of low performers, which includes the rest of the memory allocators.
230These memory allocators seem to preserve program-induced passive false-sharing.
231\textsf{hrd}'s performance keeps getting worse as the number of threads increases.
232
233Interestingly, allocators such as \textsf{hrd} and \textsf{glc} were among the best performers in the cache-thrash micro-benchmark, as described in Section~\ref{sec:cache-thrash-perf}.
234But, these allocators were among the low performers in this micro-benchmark.
235It tells us that these allocators do not actively produce false-sharing but they may preserve program-induced passive false sharing.
236
237%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
238%% SPEED
239%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
240
241\subsection{Speed Micro-Benchmark}
242
243Speed tests memory allocators for runtime latency (see \VRef{s:SpeedMicroBenchmark}).
244This experiment was run with the following configurations:
245\begin{description}[itemsep=0pt,parsep=0pt]
246\item[max:]
247500
248\item[min:]
24950
250\item[step:]
25150
252\item[distro:]
253fisher
254\item[objects:]
2551,000,000
256\item[workers:]
2571, 2, 4, 8, 16
258\end{description}
259
260% -maxS    :  500
261% -minS    :  50
262% -stepS   :  50
263% -distroS :  fisher
264% -objN    :  1000000
265% -threadN    : \{ 1, 2, 4, 8, 16 \} *
266
267%* Each allocator was tested for its performance across different number of threads.
268%Experiment was repeated for each allocator for 1, 2, 4, 8, and 16 threads by setting the configuration -threadN.
269
270\VRefrange[Figures]{fig:speed-3-malloc}{fig:speed-14-malloc-calloc-realloc-free} show 12 figures, one figure for each chain of the speed benchmark.
271The X-axis shows the number of threads;
272the Y-axis shows the total experiment time.
273Each allocator's performance for each thread is shown in different colors.
274
275\begin{itemize}
276\item \VRef[Figure]{fig:speed-3-malloc} shows results for chain: malloc
277\item \VRef[Figure]{fig:speed-4-realloc} shows results for chain: realloc
278\item \VRef[Figure]{fig:speed-5-free} shows results for chain: free
279\item \VRef[Figure]{fig:speed-6-calloc} shows results for chain: calloc
280\item \VRef[Figure]{fig:speed-7-malloc-free} shows results for chain: malloc-free
281\item \VRef[Figure]{fig:speed-8-realloc-free} shows results for chain: realloc-free
282\item \VRef[Figure]{fig:speed-9-calloc-free} shows results for chain: calloc-free
283\item \VRef[Figure]{fig:speed-10-malloc-realloc} shows results for chain: malloc-realloc
284\item \VRef[Figure]{fig:speed-11-calloc-realloc} shows results for chain: calloc-realloc
285\item \VRef[Figure]{fig:speed-12-malloc-realloc-free} shows results for chain: malloc-realloc-free
286\item \VRef[Figure]{fig:speed-13-calloc-realloc-free} shows results for chain: calloc-realloc-free
287\item \VRef[Figure]{fig:speed-14-malloc-calloc-realloc-free} shows results for chain: malloc-calloc-realloc-free
288\end{itemize}
289
290All allocators did well in this micro-benchmark across all allocation chains, except for \textsf{dl} and \textsf{pt3}.
291\textsf{dl} performed the worst overall and its performance kept getting worse with an increasing number of threads.
292\textsf{dl} uses a single heap with a global lock that can become a bottleneck.
293Multiple threads doing memory allocation in parallel can create contention on \textsf{dl}'s single heap.
294\textsf{pt3} which is a modification of \textsf{dl} for multi-threaded applications does not use per-thread heaps and may also have similar bottlenecks.
295
296There is a sudden increase in program completion time for chains that include \textsf{calloc}, and all allocators perform relatively slower in these chains.
297\textsf{calloc} uses \textsf{memset} to set the allocated memory to zero.
298\textsf{memset} is a slow routine which takes a long time compared to the actual memory allocation.
299So, a major part of the time is taken for \textsf{memset} in performance of chains that include \textsf{calloc}.
300But the relative difference among the different memory allocators running the same chain of memory allocation operations still gives us an idea of their relative performance.
301
302%speed-3-malloc.eps
303\begin{figure}
304\centering
305    \subfigure[Algol]{ \includegraphics[width=0.95\textwidth]{evaluations/algol-perf-eps/speed-3-malloc} }
306    \subfigure[Nasus]{ \includegraphics[width=0.95\textwidth]{evaluations/nasus-perf-eps/speed-3-malloc} }
307\caption{Speed benchmark chain: malloc}
308\label{fig:speed-3-malloc}
309\end{figure}
310
311%speed-4-realloc.eps
312\begin{figure}
313\centering
314    \subfigure[Algol]{ \includegraphics[width=0.95\textwidth]{evaluations/algol-perf-eps/speed-4-realloc} }
315    \subfigure[Nasus]{ \includegraphics[width=0.95\textwidth]{evaluations/nasus-perf-eps/speed-4-realloc} }
316\caption{Speed benchmark chain: realloc}
317\label{fig:speed-4-realloc}
318\end{figure}
319
320%speed-5-free.eps
321\begin{figure}
322\centering
323    \subfigure[Algol]{ \includegraphics[width=0.95\textwidth]{evaluations/algol-perf-eps/speed-5-free} }
324    \subfigure[Nasus]{ \includegraphics[width=0.95\textwidth]{evaluations/nasus-perf-eps/speed-5-free} }
325\caption{Speed benchmark chain: free}
326\label{fig:speed-5-free}
327\end{figure}
328
329%speed-6-calloc.eps
330\begin{figure}
331\centering
332    \subfigure[Algol]{ \includegraphics[width=0.95\textwidth]{evaluations/algol-perf-eps/speed-6-calloc} }
333    \subfigure[Nasus]{ \includegraphics[width=0.95\textwidth]{evaluations/nasus-perf-eps/speed-6-calloc} }
334\caption{Speed benchmark chain: calloc}
335\label{fig:speed-6-calloc}
336\end{figure}
337
338%speed-7-malloc-free.eps
339\begin{figure}
340\centering
341    \subfigure[Algol]{ \includegraphics[width=0.95\textwidth]{evaluations/algol-perf-eps/speed-7-malloc-free} }
342    \subfigure[Nasus]{ \includegraphics[width=0.95\textwidth]{evaluations/nasus-perf-eps/speed-7-malloc-free} }
343\caption{Speed benchmark chain: malloc-free}
344\label{fig:speed-7-malloc-free}
345\end{figure}
346
347%speed-8-realloc-free.eps
348\begin{figure}
349\centering
350    \subfigure[Algol]{ \includegraphics[width=0.95\textwidth]{evaluations/algol-perf-eps/speed-8-realloc-free} }
351    \subfigure[Nasus]{ \includegraphics[width=0.95\textwidth]{evaluations/nasus-perf-eps/speed-8-realloc-free} }
352\caption{Speed benchmark chain: realloc-free}
353\label{fig:speed-8-realloc-free}
354\end{figure}
355
356%speed-9-calloc-free.eps
357\begin{figure}
358\centering
359    \subfigure[Algol]{ \includegraphics[width=0.95\textwidth]{evaluations/algol-perf-eps/speed-9-calloc-free} }
360    \subfigure[Nasus]{ \includegraphics[width=0.95\textwidth]{evaluations/nasus-perf-eps/speed-9-calloc-free} }
361\caption{Speed benchmark chain: calloc-free}
362\label{fig:speed-9-calloc-free}
363\end{figure}
364
365%speed-10-malloc-realloc.eps
366\begin{figure}
367\centering
368    \subfigure[Algol]{ \includegraphics[width=0.95\textwidth]{evaluations/algol-perf-eps/speed-10-malloc-realloc} }
369    \subfigure[Nasus]{ \includegraphics[width=0.95\textwidth]{evaluations/nasus-perf-eps/speed-10-malloc-realloc} }
370\caption{Speed benchmark chain: malloc-realloc}
371\label{fig:speed-10-malloc-realloc}
372\end{figure}
373
374%speed-11-calloc-realloc.eps
375\begin{figure}
376\centering
377    \subfigure[Algol]{ \includegraphics[width=0.95\textwidth]{evaluations/algol-perf-eps/speed-11-calloc-realloc} }
378    \subfigure[Nasus]{ \includegraphics[width=0.95\textwidth]{evaluations/nasus-perf-eps/speed-11-calloc-realloc} }
379\caption{Speed benchmark chain: calloc-realloc}
380\label{fig:speed-11-calloc-realloc}
381\end{figure}
382
383%speed-12-malloc-realloc-free.eps
384\begin{figure}
385\centering
386    \subfigure[Algol]{ \includegraphics[width=0.95\textwidth]{evaluations/algol-perf-eps/speed-12-malloc-realloc-free} }
387    \subfigure[Nasus]{ \includegraphics[width=0.95\textwidth]{evaluations/nasus-perf-eps/speed-12-malloc-realloc-free} }
388\caption{Speed benchmark chain: malloc-realloc-free}
389\label{fig:speed-12-malloc-realloc-free}
390\end{figure}
391
392%speed-13-calloc-realloc-free.eps
393\begin{figure}
394\centering
395    \subfigure[Algol]{ \includegraphics[width=0.95\textwidth]{evaluations/algol-perf-eps/speed-13-calloc-realloc-free} }
396    \subfigure[Nasus]{ \includegraphics[width=0.95\textwidth]{evaluations/nasus-perf-eps/speed-13-calloc-realloc-free} }
397\caption{Speed benchmark chain: calloc-realloc-free}
398\label{fig:speed-13-calloc-realloc-free}
399\end{figure}
400
401%speed-14-{m,c,re}alloc-free.eps
402\begin{figure}
403\centering
404    \subfigure[Algol]{ \includegraphics[width=0.95\textwidth]{evaluations/algol-perf-eps/speed-14-m-c-re-alloc-free} }
405    \subfigure[Nasus]{ \includegraphics[width=0.95\textwidth]{evaluations/nasus-perf-eps/speed-14-m-c-re-alloc-free} }
406\caption{Speed benchmark chain: malloc-calloc-realloc-free}
407\label{fig:speed-14-malloc-calloc-realloc-free}
408\end{figure}
409
410%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
411%% MEMORY
412%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
413
414\newpage
415\subsection{Memory Micro-Benchmark}
416
417This experiment is run with the following two configurations for each allocator.
418The difference between the two configurations is the number of producers and consumers.
419Configuration 1 has one producer and one consumer, and configuration 2 has 4 producers, where each producer has 4 consumers.
420
421\noindent
422Configuration 1:
423\begin{description}[itemsep=0pt,parsep=0pt]
424\item[producer (K):]
4251
426\item[consumer (M):]
4271
428\item[round:]
429100,000
430\item[max:]
431500
432\item[min:]
43350
434\item[step:]
43550
436\item[distro:]
437fisher
438\item[objects (N):]
439100,000
440\end{description}
441
442% -threadA :  1
443% -threadF :  1
444% -maxS    :  500
445% -minS    :  50
446% -stepS   :  50
447% -distroS :  fisher
448% -objN    :  100000
449% -consumeS:  100000
450
451\noindent
452Configuration 2:
453\begin{description}[itemsep=0pt,parsep=0pt]
454\item[producer (K):]
4554
456\item[consumer (M):]
4574
458\item[round:]
459100,000
460\item[max:]
461500
462\item[min:]
46350
464\item[step:]
46550
466\item[distro:]
467fisher
468\item[objects (N):]
469100,000
470\end{description}
471
472% -threadA :  4
473% -threadF :  4
474% -maxS    :  500
475% -minS    :  50
476% -stepS   :  50
477% -distroS :  fisher
478% -objN    :  100000
479% -consumeS:  100000
480
481% \begin{table}[b]
482% \centering
483%     \begin{tabular}{ |c|c|c| }
484%      \hline
485%     Memory Allocator & Configuration 1 Result & Configuration 2 Result\\
486%      \hline
487%     llh & \VRef[Figure]{fig:mem-1-prod-1-cons-100-llh} & \VRef[Figure]{fig:mem-4-prod-4-cons-100-llh}\\
488%      \hline
489%     dl & \VRef[Figure]{fig:mem-1-prod-1-cons-100-dl} & \VRef[Figure]{fig:mem-4-prod-4-cons-100-dl}\\
490%      \hline
491%     glibc & \VRef[Figure]{fig:mem-1-prod-1-cons-100-glc} & \VRef[Figure]{fig:mem-4-prod-4-cons-100-glc}\\
492%      \hline
493%     hoard & \VRef[Figure]{fig:mem-1-prod-1-cons-100-hrd} & \VRef[Figure]{fig:mem-4-prod-4-cons-100-hrd}\\
494%      \hline
495%     je & \VRef[Figure]{fig:mem-1-prod-1-cons-100-je} & \VRef[Figure]{fig:mem-4-prod-4-cons-100-je}\\
496%      \hline
497%     pt3 & \VRef[Figure]{fig:mem-1-prod-1-cons-100-pt3} & \VRef[Figure]{fig:mem-4-prod-4-cons-100-pt3}\\
498%      \hline
499%     rp & \VRef[Figure]{fig:mem-1-prod-1-cons-100-rp} & \VRef[Figure]{fig:mem-4-prod-4-cons-100-rp}\\
500%      \hline
501%     tbb & \VRef[Figure]{fig:mem-1-prod-1-cons-100-tbb} & \VRef[Figure]{fig:mem-4-prod-4-cons-100-tbb}\\
502%      \hline
503%     \end{tabular}
504% \caption{Memory benchmark results}
505% \label{table:mem-benchmark-figs}
506% \end{table}
507% Table \ref{table:mem-benchmark-figs} shows the list of figures that contain memory benchmark results.
508
509\VRefrange[Figures]{fig:mem-1-prod-1-cons-100-llh}{fig:mem-4-prod-4-cons-100-tbb} show 16 figures, two figures for each of the 8 allocators, one for each configuration.
510Each figure has 2 graphs, one for each experiment environment.
511Each graph has the following 5 subgraphs that show memory usage and statistics throughout the micro-benchmark's lifetime.
512\begin{itemize}
513\item \textit{\textbf{current\_req\_mem(B)}} shows the amount of dynamic memory requested by the benchmark that is currently in use.
514\item \textit{\textbf{heap}}* shows the memory requested by the program (allocator) from the system that lies in the heap (@sbrk@) area.
515\item \textit{\textbf{mmap\_so}}* shows the memory requested by the program (allocator) from the system that lies in the @mmap@ area.
516\item \textit{\textbf{mmap}}* shows the memory requested by the program (allocator or shared libraries) from the system that lies in the @mmap@ area.
517\item \textit{\textbf{total\_dynamic}} shows the total usage of dynamic memory by the benchmark program, which is a sum of \textit{heap}, \textit{mmap}, and \textit{mmap\_so}.
518\end{itemize}
519* These statistics are gathered by monitoring a process's @/proc/self/maps@ file.
520
521The X-axis shows the time when the memory information is polled.
522The Y-axis shows the memory usage in bytes.
523
524For the experiment, at a given time in the program's life, the difference between the memory requested by the benchmark (\textit{current\_req\_mem(B)}) and the memory that the process has received from the system (\textit{heap}, \textit{mmap}) should be minimal.
525This difference is the memory overhead caused by the allocator and shows the level of fragmentation in the allocator.
526
527First, the difference in the shape of the curves between architectures (top ARM, bottom x64) is small; the differences are mainly in the amount of memory used.
528Hence, it is possible to focus on either the top or bottom graph.
529The heap curve remains zero for 4 memory allocators: \textsf{hrd}, \textsf{je}, \textsf{pt3}, and \textsf{rp}.
530These memory allocators do not use the sbrk area; instead, they only use mmap to get memory from the system.
531
532\textsf{hrd} and \textsf{tbb} have a higher memory footprint than the others as they use more total dynamic memory.
533One reason for that can be the usage of superblocks as both of these memory allocators create superblocks where each block contains objects of the same size.
534These superblocks are maintained throughout the life of the program.
535
536\textsf{pt3} is the only memory allocator for which the total dynamic memory goes down in the second half of the program lifetime when the memory is freed by the benchmark program.
537This makes \textsf{pt3} the only memory allocator that gives memory back to the operating system as it is freed by the program.
538
539% FOR 1 THREAD
540
541%mem-1-prod-1-cons-100-llh.eps
542\begin{figure}
543\centering
544    \subfigure[Algol]{ \includegraphics[width=0.95\textwidth]{evaluations/algol-perf-eps/mem-1-prod-1-cons-100-llh} }
545    \subfigure[Nasus]{ \includegraphics[width=0.95\textwidth]{evaluations/nasus-perf-eps/mem-1-prod-1-cons-100-llh} }
546\caption{Memory benchmark results with Configuration-1 for llh memory allocator}
547\label{fig:mem-1-prod-1-cons-100-llh}
548\end{figure}
549
550%mem-1-prod-1-cons-100-dl.eps
551\begin{figure}
552\centering
553    \subfigure[Algol]{ \includegraphics[width=0.95\textwidth]{evaluations/algol-perf-eps/mem-1-prod-1-cons-100-dl} }
554    \subfigure[Nasus]{ \includegraphics[width=0.95\textwidth]{evaluations/nasus-perf-eps/mem-1-prod-1-cons-100-dl} }
555\caption{Memory benchmark results with Configuration-1 for dl memory allocator}
556\label{fig:mem-1-prod-1-cons-100-dl}
557\end{figure}
558
559%mem-1-prod-1-cons-100-glc.eps
560\begin{figure}
561\centering
562    \subfigure[Algol]{ \includegraphics[width=0.95\textwidth]{evaluations/algol-perf-eps/mem-1-prod-1-cons-100-glc} }
563    \subfigure[Nasus]{ \includegraphics[width=0.95\textwidth]{evaluations/nasus-perf-eps/mem-1-prod-1-cons-100-glc} }
564\caption{Memory benchmark results with Configuration-1 for glibc memory allocator}
565\label{fig:mem-1-prod-1-cons-100-glc}
566\end{figure}
567
568%mem-1-prod-1-cons-100-hrd.eps
569\begin{figure}
570\centering
571    \subfigure[Algol]{ \includegraphics[width=0.95\textwidth]{evaluations/algol-perf-eps/mem-1-prod-1-cons-100-hrd} }
572    \subfigure[Nasus]{ \includegraphics[width=0.95\textwidth]{evaluations/nasus-perf-eps/mem-1-prod-1-cons-100-hrd} }
573\caption{Memory benchmark results with Configuration-1 for hoard memory allocator}
574\label{fig:mem-1-prod-1-cons-100-hrd}
575\end{figure}
576
577%mem-1-prod-1-cons-100-je.eps
578\begin{figure}
579\centering
580    \subfigure[Algol]{ \includegraphics[width=0.95\textwidth]{evaluations/algol-perf-eps/mem-1-prod-1-cons-100-je} }
581    \subfigure[Nasus]{ \includegraphics[width=0.95\textwidth]{evaluations/nasus-perf-eps/mem-1-prod-1-cons-100-je} }
582\caption{Memory benchmark results with Configuration-1 for je memory allocator}
583\label{fig:mem-1-prod-1-cons-100-je}
584\end{figure}
585
586%mem-1-prod-1-cons-100-pt3.eps
587\begin{figure}
588\centering
589    \subfigure[Algol]{ \includegraphics[width=0.95\textwidth]{evaluations/algol-perf-eps/mem-1-prod-1-cons-100-pt3} }
590    \subfigure[Nasus]{ \includegraphics[width=0.95\textwidth]{evaluations/nasus-perf-eps/mem-1-prod-1-cons-100-pt3} }
591\caption{Memory benchmark results with Configuration-1 for pt3 memory allocator}
592\label{fig:mem-1-prod-1-cons-100-pt3}
593\end{figure}
594
595%mem-1-prod-1-cons-100-rp.eps
596\begin{figure}
597\centering
598    \subfigure[Algol]{ \includegraphics[width=0.95\textwidth]{evaluations/algol-perf-eps/mem-1-prod-1-cons-100-rp} }
599    \subfigure[Nasus]{ \includegraphics[width=0.95\textwidth]{evaluations/nasus-perf-eps/mem-1-prod-1-cons-100-rp} }
600\caption{Memory benchmark results with Configuration-1 for rp memory allocator}
601\label{fig:mem-1-prod-1-cons-100-rp}
602\end{figure}
603
604%mem-1-prod-1-cons-100-tbb.eps
605\begin{figure}
606\centering
607    \subfigure[Algol]{ \includegraphics[width=0.95\textwidth]{evaluations/algol-perf-eps/mem-1-prod-1-cons-100-tbb} }
608    \subfigure[Nasus]{ \includegraphics[width=0.95\textwidth]{evaluations/nasus-perf-eps/mem-1-prod-1-cons-100-tbb} }
609\caption{Memory benchmark results with Configuration-1 for tbb memory allocator}
610\label{fig:mem-1-prod-1-cons-100-tbb}
611\end{figure}
612
613% FOR 4 THREADS
614
615%mem-4-prod-4-cons-100-llh.eps
616\begin{figure}
617\centering
618    \subfigure[Algol]{ \includegraphics[width=0.95\textwidth]{evaluations/algol-perf-eps/mem-4-prod-4-cons-100-llh} }
619    \subfigure[Nasus]{ \includegraphics[width=0.95\textwidth]{evaluations/nasus-perf-eps/mem-4-prod-4-cons-100-llh} }
620\caption{Memory benchmark results with Configuration-2 for llh memory allocator}
621\label{fig:mem-4-prod-4-cons-100-llh}
622\end{figure}
623
624%mem-4-prod-4-cons-100-dl.eps
625\begin{figure}
626\centering
627    \subfigure[Algol]{ \includegraphics[width=0.95\textwidth]{evaluations/algol-perf-eps/mem-4-prod-4-cons-100-dl} }
628    \subfigure[Nasus]{ \includegraphics[width=0.95\textwidth]{evaluations/nasus-perf-eps/mem-4-prod-4-cons-100-dl} }
629\caption{Memory benchmark results with Configuration-2 for dl memory allocator}
630\label{fig:mem-4-prod-4-cons-100-dl}
631\end{figure}
632
633%mem-4-prod-4-cons-100-glc.eps
634\begin{figure}
635\centering
636    \subfigure[Algol]{ \includegraphics[width=0.95\textwidth]{evaluations/algol-perf-eps/mem-4-prod-4-cons-100-glc} }
637    \subfigure[Nasus]{ \includegraphics[width=0.95\textwidth]{evaluations/nasus-perf-eps/mem-4-prod-4-cons-100-glc} }
638\caption{Memory benchmark results with Configuration-2 for glibc memory allocator}
639\label{fig:mem-4-prod-4-cons-100-glc}
640\end{figure}
641
642%mem-4-prod-4-cons-100-hrd.eps
643\begin{figure}
644\centering
645    \subfigure[Algol]{ \includegraphics[width=0.95\textwidth]{evaluations/algol-perf-eps/mem-4-prod-4-cons-100-hrd} }
646    \subfigure[Nasus]{ \includegraphics[width=0.95\textwidth]{evaluations/nasus-perf-eps/mem-4-prod-4-cons-100-hrd} }
647\caption{Memory benchmark results with Configuration-2 for hoard memory allocator}
648\label{fig:mem-4-prod-4-cons-100-hrd}
649\end{figure}
650
651%mem-4-prod-4-cons-100-je.eps
652\begin{figure}
653\centering
654    \subfigure[Algol]{ \includegraphics[width=0.95\textwidth]{evaluations/algol-perf-eps/mem-4-prod-4-cons-100-je} }
655    \subfigure[Nasus]{ \includegraphics[width=0.95\textwidth]{evaluations/nasus-perf-eps/mem-4-prod-4-cons-100-je} }
656\caption{Memory benchmark results with Configuration-2 for je memory allocator}
657\label{fig:mem-4-prod-4-cons-100-je}
658\end{figure}
659
660%mem-4-prod-4-cons-100-pt3.eps
661\begin{figure}
662\centering
663    \subfigure[Algol]{ \includegraphics[width=0.95\textwidth]{evaluations/algol-perf-eps/mem-4-prod-4-cons-100-pt3} }
664    \subfigure[Nasus]{ \includegraphics[width=0.95\textwidth]{evaluations/nasus-perf-eps/mem-4-prod-4-cons-100-pt3} }
665\caption{Memory benchmark results with Configuration-2 for pt3 memory allocator}
666\label{fig:mem-4-prod-4-cons-100-pt3}
667\end{figure}
668
669%mem-4-prod-4-cons-100-rp.eps
670\begin{figure}
671\centering
672    \subfigure[Algol]{ \includegraphics[width=0.95\textwidth]{evaluations/algol-perf-eps/mem-4-prod-4-cons-100-rp} }
673    \subfigure[Nasus]{ \includegraphics[width=0.95\textwidth]{evaluations/nasus-perf-eps/mem-4-prod-4-cons-100-rp} }
674\caption{Memory benchmark results with Configuration-2 for rp memory allocator}
675\label{fig:mem-4-prod-4-cons-100-rp}
676\end{figure}
677
678%mem-4-prod-4-cons-100-tbb.eps
679\begin{figure}
680\centering
681    \subfigure[Algol]{ \includegraphics[width=0.95\textwidth]{evaluations/algol-perf-eps/mem-4-prod-4-cons-100-tbb} }
682    \subfigure[Nasus]{ \includegraphics[width=0.95\textwidth]{evaluations/nasus-perf-eps/mem-4-prod-4-cons-100-tbb} }
683\caption{Memory benchmark results with Configuration-2 for tbb memory allocator}
684\label{fig:mem-4-prod-4-cons-100-tbb}
685\end{figure}
Note: See TracBrowser for help on using the repository browser.