Changes in / [ffec1bf:9e23b446]
- Files:
-
- 1 added
- 20 deleted
- 106 edited
-
Jenkins/FullBuild (modified) (1 diff)
-
Jenkinsfile (modified) (3 diffs)
-
Makefile.am (modified) (1 diff)
-
benchmark/readyQ/churn.cfa (modified) (2 diffs)
-
benchmark/readyQ/cycle.cfa (modified) (2 diffs)
-
benchmark/readyQ/cycle.cpp (modified) (2 diffs)
-
benchmark/readyQ/locality.cfa (modified) (2 diffs)
-
benchmark/readyQ/locality.cpp (modified) (4 diffs)
-
benchmark/readyQ/yield.cfa (modified) (2 diffs)
-
benchmark/readyQ/yield.cpp (modified) (2 diffs)
-
doc/bibliography/pl.bib (modified) (2 diffs)
-
doc/theses/mike_brooks_MMath/array.tex (modified) (1 diff)
-
doc/theses/thierry_delisle_PhD/thesis/.gitignore (modified) (1 diff)
-
doc/theses/thierry_delisle_PhD/thesis/Makefile (modified) (4 diffs)
-
doc/theses/thierry_delisle_PhD/thesis/fig/MQMS.fig (deleted)
-
doc/theses/thierry_delisle_PhD/thesis/fig/MQMSG.fig (deleted)
-
doc/theses/thierry_delisle_PhD/thesis/fig/SAVE.fig (added)
-
doc/theses/thierry_delisle_PhD/thesis/fig/SQMS.fig (deleted)
-
doc/theses/thierry_delisle_PhD/thesis/fig/base.fig (modified) (2 diffs)
-
doc/theses/thierry_delisle_PhD/thesis/fig/base_avg.fig (modified) (5 diffs)
-
doc/theses/thierry_delisle_PhD/thesis/fig/base_ts2.fig (deleted)
-
doc/theses/thierry_delisle_PhD/thesis/fig/cache-noshare.fig (modified) (1 diff)
-
doc/theses/thierry_delisle_PhD/thesis/fig/cache-share.fig (modified) (1 diff)
-
doc/theses/thierry_delisle_PhD/thesis/fig/cycle.fig (modified) (1 diff)
-
doc/theses/thierry_delisle_PhD/thesis/fig/executionStates.fig (deleted)
-
doc/theses/thierry_delisle_PhD/thesis/fig/idle.fig (modified) (1 diff)
-
doc/theses/thierry_delisle_PhD/thesis/fig/idle1.fig (modified) (1 diff)
-
doc/theses/thierry_delisle_PhD/thesis/fig/idle2.fig (modified) (1 diff)
-
doc/theses/thierry_delisle_PhD/thesis/fig/idle_state.fig (modified) (1 diff)
-
doc/theses/thierry_delisle_PhD/thesis/fig/io_uring.fig (modified) (1 diff)
-
doc/theses/thierry_delisle_PhD/thesis/fig/system.fig (modified) (3 diffs)
-
doc/theses/thierry_delisle_PhD/thesis/local.bib (modified) (20 diffs)
-
doc/theses/thierry_delisle_PhD/thesis/text/core.tex (modified) (7 diffs)
-
doc/theses/thierry_delisle_PhD/thesis/text/eval_micro.tex (modified) (6 diffs)
-
doc/theses/thierry_delisle_PhD/thesis/text/existing.tex (modified) (4 diffs)
-
doc/theses/thierry_delisle_PhD/thesis/text/intro.tex (modified) (1 diff)
-
doc/theses/thierry_delisle_PhD/thesis/text/io.tex (modified) (13 diffs)
-
doc/theses/thierry_delisle_PhD/thesis/text/practice.tex (modified) (2 diffs)
-
doc/theses/thierry_delisle_PhD/thesis/text/runtime.tex (modified) (5 diffs)
-
doc/theses/thierry_delisle_PhD/thesis/thesis.tex (modified) (2 diffs)
-
libcfa/Makefile.am (modified) (1 diff)
-
libcfa/configure.ac (modified) (2 diffs)
-
libcfa/src/Makefile.am (modified) (2 diffs)
-
libcfa/src/concurrency/kernel/fwd.hfa (modified) (2 diffs)
-
libcfa/src/concurrency/locks.cfa (modified) (6 diffs)
-
libcfa/src/concurrency/locks.hfa (modified) (6 diffs)
-
libcfa/src/concurrency/ready_subqueue.hfa (modified) (1 diff)
-
libcfa/src/heap.cfa (modified) (1 diff)
-
src/AST/Convert.cpp (modified) (3 diffs)
-
src/AST/Decl.hpp (modified) (1 diff)
-
src/AST/Expr.cpp (modified) (1 diff)
-
src/AST/Inspect.cpp (deleted)
-
src/AST/Inspect.hpp (deleted)
-
src/AST/Pass.impl.hpp (modified) (1 diff)
-
src/AST/module.mk (modified) (1 diff)
-
src/CodeGen/CodeGenerator.cc (modified) (5 diffs)
-
src/CodeGen/CodeGenerator.h (modified) (3 diffs)
-
src/CodeGen/FixNames.cc (modified) (4 diffs)
-
src/CodeGen/FixNames.h (modified) (2 diffs)
-
src/CodeGen/GenType.cc (modified) (1 diff)
-
src/Common/Eval.cc (modified) (5 diffs)
-
src/Common/ResolvProtoDump.cpp (modified) (1 diff)
-
src/Concurrency/Keywords.h (modified) (1 diff)
-
src/ControlStruct/ExceptDecl.cc (modified) (1 diff)
-
src/ControlStruct/ExceptDecl.h (modified) (2 diffs)
-
src/ControlStruct/ExceptDeclNew.cpp (deleted)
-
src/ControlStruct/HoistControlDecls.hpp (modified) (1 diff)
-
src/ControlStruct/MultiLevelExit.cpp (modified) (6 diffs)
-
src/ControlStruct/module.mk (modified) (1 diff)
-
src/GenPoly/Box.cc (modified) (6 diffs)
-
src/GenPoly/GenPoly.cc (modified) (1 diff)
-
src/InitTweak/GenInit.cc (modified) (2 diffs)
-
src/InitTweak/GenInit.h (modified) (2 diffs)
-
src/Parser/lex.ll (modified) (1 diff)
-
src/Parser/parser.yy (modified) (8 diffs)
-
src/ResolvExpr/CandidateFinder.cpp (modified) (3 diffs)
-
src/ResolvExpr/CurrentObject.cc (modified) (10 diffs)
-
src/SymTab/FixFunction.cc (modified) (3 diffs)
-
src/SymTab/FixFunction.h (modified) (3 diffs)
-
src/SymTab/Mangler.cc (modified) (1 diff)
-
src/SymTab/Validate.cc (modified) (7 diffs)
-
src/SymTab/Validate.h (modified) (2 diffs)
-
src/SymTab/ValidateType.cc (modified) (1 diff)
-
src/SynTree/AggregateDecl.cc (modified) (2 diffs)
-
src/SynTree/Type.h (modified) (2 diffs)
-
src/Tuples/Tuples.cc (modified) (1 diff)
-
src/Tuples/Tuples.h (modified) (1 diff)
-
src/Validate/Autogen.cpp (modified) (4 diffs)
-
src/Validate/Autogen.hpp (modified) (1 diff)
-
src/Validate/CompoundLiteral.hpp (modified) (1 diff)
-
src/Validate/EliminateTypedef.cpp (modified) (3 diffs)
-
src/Validate/EnumAndPointerDecay.cpp (deleted)
-
src/Validate/EnumAndPointerDecay.hpp (deleted)
-
src/Validate/FindSpecialDecls.h (modified) (2 diffs)
-
src/Validate/FixQualifiedTypes.cpp (modified) (2 diffs)
-
src/Validate/FixQualifiedTypes.hpp (modified) (2 diffs)
-
src/Validate/FixReturnTypes.cpp (deleted)
-
src/Validate/FixReturnTypes.hpp (deleted)
-
src/Validate/ForallPointerDecay.hpp (modified) (1 diff)
-
src/Validate/GenericParameter.cpp (modified) (1 diff)
-
src/Validate/GenericParameter.hpp (modified) (2 diffs)
-
src/Validate/HoistStruct.hpp (modified) (1 diff)
-
src/Validate/HoistTypeDecls.cpp (deleted)
-
src/Validate/HoistTypeDecls.hpp (deleted)
-
src/Validate/LabelAddressFixer.cpp (modified) (1 diff)
-
src/Validate/LabelAddressFixer.hpp (modified) (2 diffs)
-
src/Validate/LinkReferenceToTypes.cpp (deleted)
-
src/Validate/LinkReferenceToTypes.hpp (deleted)
-
src/Validate/ReplaceTypedef.cpp (deleted)
-
src/Validate/ReplaceTypedef.hpp (deleted)
-
src/Validate/VerifyCtorDtorAssign.cpp (deleted)
-
src/Validate/VerifyCtorDtorAssign.hpp (deleted)
-
src/Validate/module.mk (modified) (2 diffs)
-
src/Virtual/Tables.h (modified) (1 diff)
-
src/main.cc (modified) (5 diffs)
-
tests/.expect/attributes.nast.arm64.txt (modified) (1 diff)
-
tests/.expect/attributes.nast.x64.txt (modified) (1 diff)
-
tests/.expect/attributes.nast.x86.txt (modified) (1 diff)
-
tests/.expect/attributes.oast.x64.txt (modified) (1 diff)
-
tests/alloc2.cfa (modified) (3 diffs)
-
tests/enum.cfa (modified) (1 diff)
-
tests/enum_tests/structEnum.cfa (modified) (1 diff)
-
tests/pybin/tools.py (modified) (2 diffs)
-
tests/unified_locking/.expect/pthread_locks.txt (modified) (1 diff)
-
tests/unified_locking/mutex_test.hfa (modified) (4 diffs)
-
tests/unified_locking/pthread_locks.cfa (modified) (3 diffs)
-
tools/gdb/utils-gdb.py (modified) (7 diffs)
Legend:
- Unmodified
- Added
- Removed
-
Jenkins/FullBuild
rffec1bf r9e23b446 161 161 <p>${result}</p> 162 162 163 <p>- Performance ---------------------------------------------------------</p> 164 165 <img src="https://cforall.uwaterloo.ca/jenkins/job/Cforall/job/master/plot/Compilation/getPlot?index=0" > 166 <img src="https://cforall.uwaterloo.ca/jenkins/job/Cforall/job/master/plot/Compilation/getPlot?index=1" > 167 163 168 <p>- Logs ----------------------------------------------------------------</p> 164 169 """ -
Jenkinsfile
rffec1bf r9e23b446 209 209 210 210 if( Settings.Publish && !Settings.RunBenchmark ) { echo 'No results to publish!!!' } 211 212 def groupCompile = new PlotGroup('Compilation', 'duration (s) - lower is better', true) 213 def groupConcurrency = new PlotGroup('Concurrency', 'duration (n) - lower is better', false) 214 215 //Then publish the results 216 do_plot(Settings.RunBenchmark && Settings.Publish, 'compile' , groupCompile , false, 'Compilation') 217 do_plot(Settings.RunBenchmark && Settings.Publish, 'compile.diff' , groupCompile , true , 'Compilation (relative)') 218 do_plot(Settings.RunBenchmark && Settings.Publish, 'ctxswitch' , groupConcurrency, false, 'Context Switching') 219 do_plot(Settings.RunBenchmark && Settings.Publish, 'ctxswitch.diff' , groupConcurrency, true , 'Context Switching (relative)') 220 do_plot(Settings.RunBenchmark && Settings.Publish, 'mutex' , groupConcurrency, false, 'Mutual Exclusion') 221 do_plot(Settings.RunBenchmark && Settings.Publish, 'mutex.diff' , groupConcurrency, true , 'Mutual Exclusion (relative)') 222 do_plot(Settings.RunBenchmark && Settings.Publish, 'scheduling' , groupConcurrency, false, 'Internal and External Scheduling') 223 do_plot(Settings.RunBenchmark && Settings.Publish, 'scheduling.diff', groupConcurrency, true , 'Internal and External Scheduling (relative)') 211 224 } 212 225 } … … 363 376 this.GitNewRef = '' 364 377 this.GitOldRef = '' 378 } 379 } 380 381 class PlotGroup implements Serializable { 382 public String name 383 public String unit 384 public boolean log 385 386 PlotGroup(String name, String unit, boolean log) { 387 this.name = name 388 this.unit = unit 389 this.log = log 365 390 } 366 391 } … … 451 476 } 452 477 } 478 479 def do_plot(boolean new_data, String file, PlotGroup group, boolean relative, String title) { 480 481 if(new_data) { 482 echo "Publishing new data" 483 } 484 485 def series = new_data ? 
[[ 486 file: "${file}.csv", 487 exclusionValues: '', 488 displayTableFlag: false, 489 inclusionFlag: 'OFF', 490 url: '' 491 ]] : []; 492 493 echo "file is ${BuildDir}/benchmark/${file}.csv, group ${group}, title ${title}" 494 dir("${BuildDir}/benchmark/") { 495 plot csvFileName: "cforall-${env.BRANCH_NAME}-${file}.csv", 496 csvSeries: series, 497 group: "${group.name}", 498 title: "${title}", 499 style: 'lineSimple', 500 exclZero: false, 501 keepRecords: false, 502 logarithmic: !relative && group.log, 503 numBuilds: '120', 504 useDescr: true, 505 yaxis: group.unit, 506 yaxisMaximum: '', 507 yaxisMinimum: '' 508 } 509 } -
Makefile.am
rffec1bf r9e23b446 52 52 @find libcfa -name config.status -printf "\n%h\n\t" -exec {} --config \; | sed "s/ /\n\t/g; s/\t'/\t/g; s/'\n/\n/g; s/^'//g; s/'$$//g" 53 53 54 @LIBCFA_TARGET_DIRS@:: 55 $(MAKE) -C $@ $(MAKECMDGOALS) 54 mostlyclean-local: @LIBCFA_TARGET_MAKEFILES@ 55 for dir in @LIBCFA_TARGET_DIRS@; do \ 56 $(MAKE) -C $${dir} mostlyclean; \ 57 done 56 58 57 mostlyclean clean distclean maintainer-clean: @LIBCFA_TARGET_DIRS@ 59 clean-local: @LIBCFA_TARGET_MAKEFILES@ 60 for dir in @LIBCFA_TARGET_DIRS@; do \ 61 $(MAKE) -C $${dir} clean; \ 62 done 63 64 distclean-local: @LIBCFA_TARGET_MAKEFILES@ 65 for dir in @LIBCFA_TARGET_DIRS@; do \ 66 $(MAKE) -C $${dir} distclean; \ 67 rm $${dir}/config.data; \ 68 done -
benchmark/readyQ/churn.cfa
rffec1bf r9e23b446 58 58 59 59 threads_left = nthreads; 60 BThrd * * threads = alloc(nthreads);60 BThrd * threads[nthreads]; 61 61 for(i; nthreads ) { 62 62 BThrd & t = *(threads[i] = malloc()); … … 90 90 91 91 free(spots); 92 free(threads);93 92 } 94 93 -
benchmark/readyQ/cycle.cfa
rffec1bf r9e23b446 52 52 { 53 53 threads_left = tthreads; 54 BThrd * * threads = alloc(tthreads);55 Partner * thddata = alloc(tthreads);54 BThrd * threads[tthreads]; 55 Partner thddata[tthreads]; 56 56 for(i; tthreads) { 57 (thddata[i]){};58 57 unsigned pi = (i + nthreads) % tthreads; 59 58 thddata[i].next = &thddata[pi].self; … … 84 83 delete(threads[i]); 85 84 } 86 free(threads);87 free(thddata);88 85 } 89 86 -
benchmark/readyQ/cycle.cpp
rffec1bf r9e23b446 39 39 { 40 40 threads_left = tthreads; 41 Fibre * * threads = new Fibre *[tthreads]();42 Partner * thddata = new Partner[tthreads]();41 Fibre * threads[tthreads]; 42 Partner thddata[tthreads]; 43 43 for(unsigned i = 0; i < tthreads; i++) { 44 44 unsigned pi = (i + nthreads) % tthreads; … … 69 69 global_blocks += thddata[i].blocks; 70 70 } 71 72 delete[](threads);73 delete[](thddata);74 71 } 75 72 -
benchmark/readyQ/locality.cfa
rffec1bf r9e23b446 222 222 threads_left = nprocs; 223 223 { 224 MyThread * * threads = alloc(nthreads);224 MyThread * threads[nthreads]; 225 225 for(i; nthreads) { 226 226 threads[i] = malloc(); … … 259 259 free( threads[i] ); 260 260 } 261 free( threads );262 261 } 263 262 -
benchmark/readyQ/locality.cpp
rffec1bf r9e23b446 217 217 { 218 218 FibreInit(1, nprocs); 219 MyData * * data_arrays = new MyData *[nthreads]();219 MyData * data_arrays[nthreads]; 220 220 for(size_t i = 0; i < nthreads; i++) { 221 221 data_arrays[i] = new MyData( i, wsize ); … … 228 228 229 229 threads_left = nthreads - nspots; 230 Fibre * * threads = new Fibre *[nthreads]();231 MyCtx * * thddata = new MyCtx *[nthreads]();230 Fibre * threads[nthreads]; 231 MyCtx * thddata[nthreads]; 232 232 { 233 233 for(size_t i = 0; i < nthreads; i++) { … … 240 240 i 241 241 ); 242 threads[i] = new Fibre(); 243 threads[i]->run( reinterpret_cast<void (*)(MyCtx*)>(thread_main), thddata[i] ); 242 threads[i] = new Fibre( reinterpret_cast<void (*)(void *)>(thread_main), thddata[i] ); 244 243 } 245 244 … … 268 267 delete( data_arrays[i] ); 269 268 } 270 delete[](data_arrays);271 269 272 270 for(size_t i = 0; i < nspots; i++) { 273 271 delete( spots[i] ); 274 272 } 275 276 delete[](threads);277 delete[](thddata);278 273 } 279 274 -
benchmark/readyQ/yield.cfa
rffec1bf r9e23b446 34 34 { 35 35 threads_left = nthreads; 36 Yielder * threads = alloc(nthreads); 37 for(i; nthreads) { 38 (threads[i]){}; 39 } 40 36 Yielder threads[nthreads]; 41 37 printf("Starting\n"); 42 38 … … 56 52 Yielder & y = join( threads[i] ); 57 53 global_counter += y.count; 58 ^(threads[i]){};59 54 } 60 free(threads);61 55 } 62 56 -
benchmark/readyQ/yield.cpp
rffec1bf r9e23b446 33 33 { 34 34 threads_left = nthreads; 35 Fibre * * threads = new Fibre *[nthreads]();35 Fibre * threads[nthreads]; 36 36 for(unsigned i = 0; i < nthreads; i++) { 37 37 threads[i] = new Fibre(); … … 52 52 fibre_join( threads[i], nullptr ); 53 53 } 54 delete[] threads;55 54 } 56 55 -
doc/bibliography/pl.bib
rffec1bf r9e23b446 2024 2024 @manual{C++20Coroutine19, 2025 2025 keywords = {coroutine}, 2026 key = {Coroutines},2027 2026 contributer = {pabuhr@plg}, 2028 2027 title = {Coroutines (C++20)}, 2029 2028 organization= {cppreference.com}, 2030 month = jun,2031 year = 20 22,2029 month = apr, 2030 year = 2019, 2032 2031 note = {\href{https://en.cppreference.com/w/cpp/language/coroutines}{https://\-en.cppreference.com/\-w/\-cpp/\-language/\-coroutines}}, 2033 2032 } … … 6992 6991 % S 6993 6992 6994 @inproceedings{Imam14,6995 keywords = {actor model, performance comparison, java actor libraries, benchmark suite},6996 contributer = {pabuhr@plg},6997 author = {Shams M. Imam and Vivek Sarkar},6998 title = {Savina - An Actor Benchmark Suite: Enabling Empirical Evaluation of Actor Libraries},6999 year = {2014},7000 publisher = {ACM},7001 address = {New York, NY, USA},7002 booktitle = {Proceedings of the 4th International Workshop on Programming Based on Actors Agents \& Decentralized Control},7003 pages = {67-80},7004 numpages = {14},7005 location = {Portland, Oregon, USA},7006 series = {AGERE! '14}7007 }7008 7009 6993 @manual{Scala, 7010 6994 keywords = {Scala programming language}, -
doc/theses/mike_brooks_MMath/array.tex
rffec1bf r9e23b446 182 182 \CFA's array is also the first extension of C to use its tracked bounds to generate the pointer arithmetic implied by advanced allocation patterns. Other bound-tracked extensions of C either forbid certain C patterns entirely, or address the problem of \emph{verifying} that the user's provided pointer arithmetic is self-consistent. The \CFA array, applied to accordion structures [TOD: cross-reference] \emph{implies} the necessary pointer arithmetic, generated automatically, and not appearing at all in a user's program. 183 183 184 \subs ection{Safety in a padded room}184 \subsction{Safety in a padded room} 185 185 186 186 Java's array [todo:cite] is a straightforward example of assuring safety against undefined behaviour, at a cost of expressiveness for more applied properties. Consider the array parameter declarations in: -
doc/theses/thierry_delisle_PhD/thesis/.gitignore
rffec1bf r9e23b446 1 1 back_text/ 2 SAVE.fig -
doc/theses/thierry_delisle_PhD/thesis/Makefile
rffec1bf r9e23b446 34 34 base \ 35 35 base_avg \ 36 base_ts2 \37 36 cache-share \ 38 37 cache-noshare \ … … 41 40 emptytls \ 42 41 emptytree \ 43 executionStates \44 42 fairness \ 45 43 idle \ … … 49 47 io_uring \ 50 48 pivot_ring \ 51 MQMS \52 MQMSG \53 49 system \ 54 50 cycle \ … … 69 65 result.memcd.rate.qps \ 70 66 result.memcd.rate.99th \ 71 SQMS \72 67 } 73 68 -
doc/theses/thierry_delisle_PhD/thesis/fig/base.fig
rffec1bf r9e23b446 13 13 1 3 0 1 0 0 50 -1 20 0.000 1 0.0000 6975 4200 20 20 6975 4200 6995 4200 14 14 -6 15 6 6 450 5025 6750 517516 1 3 0 1 0 0 50 -1 20 0.000 1 0.0000 6 525 5100 20 20 6525 5100 6545 510017 1 3 0 1 0 0 50 -1 20 0.000 1 0.0000 6 600 5100 20 20 6600 5100 6620 510018 1 3 0 1 0 0 50 -1 20 0.000 1 0.0000 66 75 5100 20 20 6675 5100 6695 510015 6 6375 5100 6675 5250 16 1 3 0 1 0 0 50 -1 20 0.000 1 0.0000 6450 5175 20 20 6450 5175 6470 5175 17 1 3 0 1 0 0 50 -1 20 0.000 1 0.0000 6525 5175 20 20 6525 5175 6545 5175 18 1 3 0 1 0 0 50 -1 20 0.000 1 0.0000 6600 5175 20 20 6600 5175 6620 5175 19 19 -6 20 20 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 3900 2400 300 300 3900 2400 4200 2400 … … 80 80 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 81 81 2400 2475 3000 2475 82 2 3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7 83 3300 5210 3150 4950 2850 4950 2700 5210 2850 5470 3150 5470 84 3300 5210 85 2 3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7 86 4500 5210 4350 4950 4050 4950 3900 5210 4050 5470 4350 5470 87 4500 5210 88 2 3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7 89 5700 5210 5550 4950 5250 4950 5100 5210 5250 5470 5550 5470 90 5700 5210 82 91 2 1 1 1 0 7 50 -1 -1 4.000 0 0 -1 0 0 2 83 3600 5 400 3600 120092 3600 5700 3600 1200 84 93 2 1 1 1 0 7 50 -1 -1 4.000 0 0 -1 0 0 2 85 4800 5 400 4800 120094 4800 5700 4800 1200 86 95 2 1 1 1 0 7 50 -1 -1 4.000 0 0 -1 0 0 2 87 6000 5400 6000 1200 88 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 89 2700 4800 3300 4800 3300 5400 2700 5400 2700 4800 90 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 91 3900 4800 4500 4800 4500 5400 3900 5400 3900 4800 92 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 93 5100 4800 5700 4800 5700 5400 5100 5400 5100 4800 94 4 2 -1 50 -1 0 12 0.0000 2 135 645 2100 3075 Threads\001 95 4 2 -1 50 -1 0 12 0.0000 2 180 525 2100 2850 Ready\001 96 4 1 -1 50 -1 0 11 0.0000 2 120 210 2700 4450 TS\001 97 4 2 -1 50 -1 0 12 0.0000 2 180 660 2100 4200 Array of\001 98 4 2 -1 50 -1 0 12 0.0000 2 165 600 2100 4425 Queues\001 99 4 1 -1 50 -1 0 11 0.0000 2 120 210 
2700 3550 TS\001 100 4 2 -1 50 -1 0 12 0.0000 2 135 840 2100 5175 Processors\001 96 6000 5700 6000 1200 97 4 2 -1 50 -1 0 12 0.0000 2 135 630 2100 3075 Threads\001 98 4 2 -1 50 -1 0 12 0.0000 2 165 450 2100 2850 Ready\001 99 4 1 -1 50 -1 0 11 0.0000 2 135 180 2700 4450 TS\001 100 4 2 -1 50 -1 0 12 0.0000 2 165 720 2100 4200 Array of\001 101 4 2 -1 50 -1 0 12 0.0000 2 150 540 2100 4425 Queues\001 102 4 1 -1 50 -1 0 11 0.0000 2 135 180 2700 3550 TS\001 103 4 1 -1 50 -1 0 11 0.0000 2 135 180 2700 2650 TS\001 104 4 2 -1 50 -1 0 12 0.0000 2 135 900 2100 5175 Processors\001 -
doc/theses/thierry_delisle_PhD/thesis/fig/base_avg.fig
rffec1bf r9e23b446 13 13 1 3 0 1 0 0 50 -1 20 0.000 1 0.0000 6975 4200 20 20 6975 4200 6995 4200 14 14 -6 15 6 6 450 5025 6750 517516 1 3 0 1 0 0 50 -1 20 0.000 1 0.0000 6 525 5100 20 20 6525 5100 6545 510017 1 3 0 1 0 0 50 -1 20 0.000 1 0.0000 6 600 5100 20 20 6600 5100 6620 510018 1 3 0 1 0 0 50 -1 20 0.000 1 0.0000 66 75 5100 20 20 6675 5100 6695 510015 6 6375 5100 6675 5250 16 1 3 0 1 0 0 50 -1 20 0.000 1 0.0000 6450 5175 20 20 6450 5175 6470 5175 17 1 3 0 1 0 0 50 -1 20 0.000 1 0.0000 6525 5175 20 20 6525 5175 6545 5175 18 1 3 0 1 0 0 50 -1 20 0.000 1 0.0000 6600 5175 20 20 6600 5175 6620 5175 19 19 -6 20 20 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 3900 2400 300 300 3900 2400 4200 2400 … … 52 52 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 53 53 1 1 1.00 45.00 90.00 54 3900 42003900 360054 3900 3975 3900 3600 55 55 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 56 56 1 1 1.00 45.00 90.00 … … 61 61 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 62 62 1 1 1.00 45.00 90.00 63 5100 42005100 360063 5100 3975 5100 3600 64 64 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 65 65 1 1 1.00 45.00 90.00 … … 67 67 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 68 68 1 1 1.00 45.00 90.00 69 6300 42006300 360069 6300 3975 6300 3600 70 70 2 1 0 1 -1 7 50 -1 -1 0.000 0 0 -1 1 0 2 71 71 1 1 1.00 45.00 90.00 … … 75 75 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 76 76 1 1 1.00 45.00 90.00 77 4500 42004500 360077 4500 3975 4500 3600 78 78 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 79 79 2400 3375 3000 3375 80 80 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 81 81 2400 2475 3000 2475 82 2 3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7 83 3300 5210 3150 4950 2850 4950 2700 5210 2850 5470 3150 5470 84 3300 5210 85 2 3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7 86 4500 5210 4350 4950 4050 4950 3900 5210 4050 5470 4350 5470 87 4500 5210 88 2 3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7 89 5700 5210 5550 4950 5250 4950 5100 5210 5250 5470 5550 5470 90 5700 5210 82 91 2 1 1 1 0 7 50 -1 -1 4.000 0 0 -1 0 0 2 83 3600 5 400 3600 120092 3600 5700 3600 1200 84 93 2 1 
1 1 0 7 50 -1 -1 4.000 0 0 -1 0 0 2 85 4800 5 400 4800 120094 4800 5700 4800 1200 86 95 2 1 1 1 0 7 50 -1 -1 4.000 0 0 -1 0 0 2 87 6000 5400 6000 1200 88 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 89 2700 4800 3300 4800 3300 5400 2700 5400 2700 4800 90 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 91 3900 4800 4500 4800 4500 5400 3900 5400 3900 4800 92 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 93 5100 4800 5700 4800 5700 5400 5100 5400 5100 4800 96 6000 5700 6000 1200 94 97 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 95 98 2400 4050 3000 4050 96 4 2 -1 50 -1 0 12 0.0000 2 135 645 2100 3075 Threads\001 97 4 2 -1 50 -1 0 12 0.0000 2 180 525 2100 2850 Ready\001 98 4 1 -1 50 -1 0 11 0.0000 2 120 300 2700 4450 MA\001 99 4 2 -1 50 -1 0 12 0.0000 2 180 660 2100 4200 Array of\001 100 4 2 -1 50 -1 0 12 0.0000 2 165 600 2100 4425 Queues\001 101 4 1 -1 50 -1 0 11 0.0000 2 120 210 2700 3550 TS\001 102 4 2 -1 50 -1 0 12 0.0000 2 135 840 2100 5175 Processors\001 103 4 1 -1 50 -1 0 11 0.0000 2 120 210 2700 4225 TS\001 99 4 2 -1 50 -1 0 12 0.0000 2 135 630 2100 3075 Threads\001 100 4 2 -1 50 -1 0 12 0.0000 2 165 450 2100 2850 Ready\001 101 4 1 -1 50 -1 0 11 0.0000 2 135 180 2700 4450 MA\001 102 4 2 -1 50 -1 0 12 0.0000 2 165 720 2100 4200 Array of\001 103 4 2 -1 50 -1 0 12 0.0000 2 150 540 2100 4425 Queues\001 104 4 1 -1 50 -1 0 11 0.0000 2 135 180 2700 3550 TS\001 105 4 1 -1 50 -1 0 11 0.0000 2 135 180 2700 2650 TS\001 106 4 2 -1 50 -1 0 12 0.0000 2 135 900 2100 5175 Processors\001 107 4 1 -1 50 -1 0 11 0.0000 2 135 180 2700 4200 TS\001 -
doc/theses/thierry_delisle_PhD/thesis/fig/cache-noshare.fig
rffec1bf r9e23b446 8 8 -2 9 9 1200 2 10 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 1650 1650 456 456 1650 1650 1200 157511 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 2850 1650 456 456 2850 1650 2400 157512 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 4 050 1650 456 456 4050 1650 3600 157513 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 5250 1650 456 456 5250 1650 4800 157510 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 2550 2550 456 456 2550 2550 2100 2475 11 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 3750 2550 456 456 3750 2550 3300 2475 12 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 4950 2550 456 456 4950 2550 4500 2475 13 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 6150 2550 456 456 6150 2550 5700 2475 14 14 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 15 1200 2400 2100 2400 2100 2700 1200 2700 1200 240015 2100 3300 3000 3300 3000 3600 2100 3600 2100 3300 16 16 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 17 1200 3000 2100 3000 2100 3600 1200 3600 1200 300017 2100 3900 3000 3900 3000 4500 2100 4500 2100 3900 18 18 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 19 2400 2400 3300 2400 3300 2700 2400 2700 2400 240019 3300 3300 4200 3300 4200 3600 3300 3600 3300 3300 20 20 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 21 2400 3000 3300 3000 3300 3600 2400 3600 2400 300021 3300 3900 4200 3900 4200 4500 3300 4500 3300 3900 22 22 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 23 3600 2400 4500 2400 4500 2700 3600 2700 3600 240023 4500 3300 5400 3300 5400 3600 4500 3600 4500 3300 24 24 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 25 3600 3000 4500 3000 4500 3600 3600 3600 3600 300025 4500 3900 5400 3900 5400 4500 4500 4500 4500 3900 26 26 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 27 4800 2400 5700 2400 5700 2700 4800 2700 4800 240027 5700 3300 6600 3300 6600 3600 5700 3600 5700 3300 28 28 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 29 4800 3000 5700 3000 5700 3600 4800 3600 4800 300029 5700 3900 6600 3900 6600 4500 5700 4500 5700 3900 30 30 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 31 1200 3900 3300 3900 3300 4800 1200 4800 1200 390031 2100 4800 4200 4800 4200 5700 2100 5700 
2100 4800 32 32 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 33 3600 3900 5700 3900 5700 4800 3600 4800 3600 390033 4500 4800 6600 4800 6600 5700 4500 5700 4500 4800 34 34 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 35 35 1 1 1.00 60.00 45.00 36 36 1 1 1.00 60.00 45.00 37 1650 2100 1650 240037 2550 3000 2550 3300 38 38 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 39 39 1 1 1.00 60.00 45.00 40 40 1 1 1.00 60.00 45.00 41 5250 2100 5250 240041 6150 3000 6150 3300 42 42 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 43 43 1 1 1.00 60.00 45.00 44 44 1 1 1.00 60.00 45.00 45 5250 2700 5250 300045 6150 3600 6150 3900 46 46 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 47 47 1 1 1.00 60.00 45.00 48 48 1 1 1.00 60.00 45.00 49 2850 2100 2850 240049 3750 3000 3750 3300 50 50 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 51 51 1 1 1.00 60.00 45.00 52 52 1 1 1.00 60.00 45.00 53 4 050 2100 4050 240053 4950 3000 4950 3300 54 54 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 55 55 1 1 1.00 60.00 45.00 56 56 1 1 1.00 60.00 45.00 57 4 050 2700 4050 300057 4950 3600 4950 3900 58 58 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 59 59 1 1 1.00 60.00 45.00 60 60 1 1 1.00 60.00 45.00 61 1650 2700 1650 300061 3750 3600 3750 3900 62 62 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 63 63 1 1 1.00 60.00 45.00 64 64 1 1 1.00 60.00 45.00 65 1650 3600 1650 390065 2550 3600 2550 3900 66 66 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 67 67 1 1 1.00 60.00 45.00 68 68 1 1 1.00 60.00 45.00 69 2 850 3600 2850 390069 2550 4500 2550 4800 70 70 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 71 71 1 1 1.00 60.00 45.00 72 72 1 1 1.00 60.00 45.00 73 4050 3600 4050 390073 3750 4500 3750 4800 74 74 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 75 75 1 1 1.00 60.00 45.00 76 76 1 1 1.00 60.00 45.00 77 5250 3600 5250 390077 4950 4500 4950 4800 78 78 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 79 79 1 1 1.00 60.00 45.00 80 80 1 1 1.00 60.00 45.00 81 3300 4350 3600 435081 6150 4500 6150 4800 82 82 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 83 83 1 1 1.00 60.00 45.00 84 84 1 1 1.00 60.00 
45.00 85 2850 2700 2850 300086 4 1 0 50 -1 0 12 0.0000 2 165 945 1650 1725 CORE$_0$\00187 4 1 0 50 -1 0 12 0.0000 2 135 225 2250 4425 L3\00188 4 1 0 50 -1 0 12 0.0000 2 135 225 4650 4425 L3\00189 4 1 0 50 -1 0 12 0.0000 2 135 225 5250 3375 L2\00190 4 1 0 50 -1 0 12 0.0000 2 135 225 4050 3375 L2\00191 4 1 0 50 -1 0 12 0.0000 2 135 225 2850 3375 L2\00192 4 1 0 50 -1 0 12 0.0000 2 135 225 1650 3375 L2\00193 4 1 0 50 -1 0 12 0.0000 2 135 225 1650 2625 L1\00194 4 1 0 50 -1 0 12 0.0000 2 135 225 2850 2625 L1\00195 4 1 0 50 -1 0 12 0.0000 2 135 225 4050 2625 L1\00196 4 1 0 50 -1 0 12 0.0000 2 135 225 5250 2625 L1\00197 4 1 0 50 -1 0 12 0.0000 2 165 945 2850 1725 CORE$_1$\00198 4 1 0 50 -1 0 12 0.0000 2 165 945 4050 1725 CORE$_2$\00199 4 1 0 50 -1 0 12 0.0000 2 165 945 5250 1725 CORE$_3$\00185 4200 5250 4500 5250 86 4 0 0 50 -1 0 11 0.0000 2 135 360 4725 2625 CPU2\001 87 4 0 0 50 -1 0 11 0.0000 2 135 360 2325 2625 CPU0\001 88 4 0 0 50 -1 0 11 0.0000 2 135 360 5925 2625 CPU3\001 89 4 0 0 50 -1 0 11 0.0000 2 135 360 3525 2625 CPU1\001 90 4 0 0 50 -1 0 11 0.0000 2 135 180 2475 3525 L1\001 91 4 0 0 50 -1 0 11 0.0000 2 135 180 4875 3525 L1\001 92 4 0 0 50 -1 0 11 0.0000 2 135 180 6075 3525 L1\001 93 4 0 0 50 -1 0 11 0.0000 2 135 180 2400 4275 L2\001 94 4 0 0 50 -1 0 11 0.0000 2 135 180 4875 4275 L2\001 95 4 0 0 50 -1 0 11 0.0000 2 135 180 3675 4275 L2\001 96 4 0 0 50 -1 0 11 0.0000 2 135 180 6075 4275 L2\001 97 4 0 0 50 -1 0 11 0.0000 2 135 180 3675 3525 L1\001 98 4 0 0 50 -1 0 11 0.0000 2 135 180 3000 5250 L3\001 99 4 0 0 50 -1 0 11 0.0000 2 135 180 5475 5250 L3\001 -
doc/theses/thierry_delisle_PhD/thesis/fig/cache-share.fig
rffec1bf r9e23b446 8 8 -2 9 9 1200 2 10 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 1650 1650 456 456 1650 1650 1200 157511 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 4050 1650 456 456 4050 1650 3600 157512 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 5250 1650 456 456 5250 1650 4800 157513 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 2850 1650 456 456 2850 1650 2400 157510 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 2550 2550 456 456 2550 2550 2100 2475 11 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 3750 2550 456 456 3750 2550 3300 2475 12 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 4950 2550 456 456 4950 2550 4500 2475 13 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 6150 2550 456 456 6150 2550 5700 2475 14 14 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 15 1200 2400 2100 2400 2100 2700 1200 2700 1200 240015 2100 3300 3000 3300 3000 3600 2100 3600 2100 3300 16 16 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 17 1200 3000 2100 3000 2100 3600 1200 3600 1200 300017 2100 3900 3000 3900 3000 4500 2100 4500 2100 3900 18 18 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 19 2400 2400 3300 2400 3300 2700 2400 2700 2400 240019 3300 3300 4200 3300 4200 3600 3300 3600 3300 3300 20 20 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 21 2400 3000 3300 3000 3300 3600 2400 3600 2400 300021 3300 3900 4200 3900 4200 4500 3300 4500 3300 3900 22 22 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 23 3600 2400 4500 2400 4500 2700 3600 2700 3600 240023 4500 3300 5400 3300 5400 3600 4500 3600 4500 3300 24 24 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 25 3600 3000 4500 3000 4500 3600 3600 3600 3600 300025 4500 3900 5400 3900 5400 4500 4500 4500 4500 3900 26 26 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 27 4800 2400 5700 2400 5700 2700 4800 2700 4800 240027 5700 3300 6600 3300 6600 3600 5700 3600 5700 3300 28 28 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 29 4800 3000 5700 3000 5700 3600 4800 3600 4800 3000 29 5700 3900 6600 3900 6600 4500 5700 4500 5700 3900 30 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 31 2100 4800 6600 4800 6600 5775 2100 5775 2100 4800 30 32 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 
31 33 1 1 1.00 60.00 45.00 32 34 1 1 1.00 60.00 45.00 33 1650 2100 1650 240035 2550 3000 2550 3300 34 36 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 35 37 1 1 1.00 60.00 45.00 36 38 1 1 1.00 60.00 45.00 37 2850 2100 2850 240039 3750 3000 3750 3300 38 40 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 39 41 1 1 1.00 60.00 45.00 40 42 1 1 1.00 60.00 45.00 41 4 050 2100 4050 240043 4950 3000 4950 3300 42 44 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 43 45 1 1 1.00 60.00 45.00 44 46 1 1 1.00 60.00 45.00 45 5250 2100 5250 240047 6150 3000 6150 3300 46 48 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 47 49 1 1 1.00 60.00 45.00 48 50 1 1 1.00 60.00 45.00 49 5250 2700 5250 300051 6150 3600 6150 3900 50 52 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 51 53 1 1 1.00 60.00 45.00 52 54 1 1 1.00 60.00 45.00 53 4 050 2700 4050 300055 4950 3600 4950 3900 54 56 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 55 57 1 1 1.00 60.00 45.00 56 58 1 1 1.00 60.00 45.00 57 2850 2700 2850 300059 3750 3600 3750 3900 58 60 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 59 61 1 1 1.00 60.00 45.00 60 62 1 1 1.00 60.00 45.00 61 1650 2700 1650 300063 2550 3600 2550 3900 62 64 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 63 65 1 1 1.00 60.00 45.00 64 66 1 1 1.00 60.00 45.00 65 1650 3600 1650 390067 2550 4500 2550 4800 66 68 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 67 69 1 1 1.00 60.00 45.00 68 70 1 1 1.00 60.00 45.00 69 2850 3600 2850 390071 3750 4500 3750 4800 70 72 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 71 73 1 1 1.00 60.00 45.00 72 74 1 1 1.00 60.00 45.00 73 4 050 3600 4050 390075 4950 4500 4950 4800 74 76 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 75 77 1 1 1.00 60.00 45.00 76 78 1 1 1.00 60.00 45.00 77 5250 3600 5250 3900 78 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 79 1200 3900 5700 3900 5700 4800 1200 4800 1200 3900 80 4 1 0 50 -1 0 12 0.0000 2 135 225 3450 4425 L3\001 81 4 1 0 50 -1 0 12 0.0000 2 135 225 1650 3375 L2\001 82 4 1 0 50 -1 0 12 0.0000 2 135 225 2850 3375 L2\001 83 4 1 0 50 -1 0 12 0.0000 2 135 225 4050 3375 L2\001 84 4 1 0 50 
-1 0 12 0.0000 2 135 225 5250 3375 L2\001 85 4 1 0 50 -1 0 12 0.0000 2 135 225 5250 2625 L1\001 86 4 1 0 50 -1 0 12 0.0000 2 135 225 4050 2625 L1\001 87 4 1 0 50 -1 0 12 0.0000 2 135 225 2850 2625 L1\001 88 4 1 0 50 -1 0 12 0.0000 2 135 225 1650 2625 L1\001 89 4 1 0 50 -1 0 12 0.0000 2 165 945 1650 1725 CORE$_0$\001 90 4 1 0 50 -1 0 12 0.0000 2 165 945 2850 1725 CORE$_1$\001 91 4 1 0 50 -1 0 12 0.0000 2 165 945 4050 1725 CORE$_2$\001 92 4 1 0 50 -1 0 12 0.0000 2 165 945 5250 1725 CORE$_3$\001 79 6150 4500 6150 4800 80 4 0 0 50 -1 0 11 0.0000 2 135 360 4725 2625 CPU2\001 81 4 0 0 50 -1 0 11 0.0000 2 135 360 2325 2625 CPU0\001 82 4 0 0 50 -1 0 11 0.0000 2 135 360 5925 2625 CPU3\001 83 4 0 0 50 -1 0 11 0.0000 2 135 360 3525 2625 CPU1\001 84 4 0 0 50 -1 0 11 0.0000 2 135 180 2475 3525 L1\001 85 4 0 0 50 -1 0 11 0.0000 2 135 180 4875 3525 L1\001 86 4 0 0 50 -1 0 11 0.0000 2 135 180 6075 3525 L1\001 87 4 0 0 50 -1 0 11 0.0000 2 135 180 2400 4275 L2\001 88 4 0 0 50 -1 0 11 0.0000 2 135 180 4875 4275 L2\001 89 4 0 0 50 -1 0 11 0.0000 2 135 180 3675 4275 L2\001 90 4 0 0 50 -1 0 11 0.0000 2 135 180 6075 4275 L2\001 91 4 0 0 50 -1 0 11 0.0000 2 135 180 3675 3525 L1\001 92 4 0 0 50 -1 0 11 0.0000 2 135 180 4275 5325 L3\001 -
doc/theses/thierry_delisle_PhD/thesis/fig/cycle.fig
rffec1bf r9e23b446 8 8 -2 9 9 1200 2 10 5 1 0 1 0 7 50 -1 -1 0.000 0 1 1 0 31 50.000 4012.500 2850 4575 3150 4650 3450 457511 1 11.00 60.00 120.0012 5 1 0 1 0 7 50 -1 -1 0.000 0 0 0 1 2268.750 3450.000 1950 3825 1800 3600 1800 330013 1 11.00 60.00 120.0014 5 1 0 1 0 7 50 -1 -1 0.000 0 1 1 0 4031.250 3450.000 4350 3825 4500 3600 4500 330015 1 11.00 60.00 120.0016 5 1 0 1 0 7 50 -1 -1 0.000 0 0 0 1 3675.000 2250.000 3750 1725 4050 1875 4200 217517 1 11.00 60.00 120.0018 5 1 0 1 0 7 50 -1 -1 0.000 0 1 1 0 2625.000 2250.000 2550 1725 2250 1875 2100 217519 1 11.00 60.00 120.0020 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 3 150 1800 600 600 3150 1800 3750 180021 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 1875 2700 600 600 1875 2700 2475 270022 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 2400 4200 600 600 2400 4200 3000 420023 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 3900 4200 600 600 3900 4200 4500 420024 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 4425 2700 600 600 4425 2700 5025 270025 4 1 0 50 -1 0 11 0.0000 2 165 855 2400 4275 Thread$_3$\00126 4 1 0 50 -1 0 11 0.0000 2 165 855 3900 4275 Thread$_4$\00127 4 1 0 50 -1 0 11 0.0000 2 165 855 1875 2775 Thread$_2$\00128 4 1 0 50 -1 0 11 0.0000 2 165 855 3150 1875 Thread$_1$\00129 4 1 0 50 -1 0 11 0.0000 2 165 855 4425 2775 Thread$_5$\00130 4 1 0 50 -1 0 11 0.0000 2 180 540 3150 4875 Unpark\00131 4 0 0 50 -1 0 11 0.0000 2 1 80 540 4650 3675 Unpark\00132 4 2 0 50 -1 0 11 0.0000 2 180 540 1650 3600 Unpark\00133 4 2 0 50 -1 0 11 0.0000 2 180 540 2100 1875 Unpark\00134 4 0 0 50 -1 0 11 0.0000 2 1 80 540 4200 1875 Unpark\00110 5 1 0 1 0 7 50 -1 -1 0.000 0 1 1 0 3144.643 2341.072 3525 2250 3375 2025 3150 1950 11 2 0 1.00 60.00 120.00 12 5 1 0 1 0 7 50 -1 -1 0.000 0 1 1 0 1955.357 2341.072 1950 1950 1725 2025 1575 2250 13 2 0 1.00 60.00 120.00 14 5 1 0 1 0 7 50 -1 -1 0.000 0 1 1 0 3637.500 3487.500 3750 3750 3900 3600 3900 3375 15 2 0 1.00 60.00 120.00 16 5 1 0 1 0 7 50 -1 -1 0.000 0 1 1 0 2587.500 4087.500 2325 4500 2550 4575 2850 4500 17 2 0 1.00 60.00 120.00 18 5 
1 0 1 0 7 50 -1 -1 0.000 0 1 1 0 1612.500 3487.500 1200 3375 1200 3600 1350 3825 19 2 0 1.00 60.00 120.00 20 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 3675 2850 586 586 3675 2850 4125 3225 21 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 3300 4125 586 586 3300 4125 3750 4500 22 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 1875 4125 586 586 1875 4125 2325 4500 23 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 1425 2850 586 586 1425 2850 1875 3225 24 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 2550 1950 586 586 2550 1950 3000 2325 25 4 0 0 50 -1 0 11 0.0000 2 135 720 1125 2925 Thread 2\001 26 4 2 0 50 -1 0 11 0.0000 2 165 540 1650 1950 Unpark\001 27 4 0 0 50 -1 0 11 0.0000 2 165 540 4050 3600 Unpark\001 28 4 2 0 50 -1 0 11 0.0000 2 165 540 1125 3750 Unpark\001 29 4 2 0 50 -1 0 11 0.0000 2 165 540 2850 4800 Unpark\001 30 4 0 0 50 -1 0 11 0.0000 2 135 720 2250 2025 Thread 1\001 31 4 0 0 50 -1 0 11 0.0000 2 135 720 3000 4200 Thread 4\001 32 4 0 0 50 -1 0 11 0.0000 2 135 720 1575 4200 Thread 3\001 33 4 0 0 50 -1 0 11 0.0000 2 165 540 3525 2025 Unpark\001 34 4 0 0 50 -1 0 11 0.0000 2 135 720 3375 2925 Thread 5\001 -
doc/theses/thierry_delisle_PhD/thesis/fig/idle.fig
rffec1bf r9e23b446 8 8 -2 9 9 1200 2 10 5 1 0 1 0 7 50 -1 -1 0.000 0 1 1 1 3376.136 2169.318 2250 2625 2775 3225 3525 3375 11 1 1 1.00 60.00 120.00 12 7 1 1.00 60.00 60.00 13 6 3466 2774 3899 3149 10 6 5919 5250 6375 5775 11 5 1 0 1 0 7 50 -1 -1 0.000 0 0 0 0 6147.000 5409.011 6102 5410 6147 5364 6192 5410 12 5 1 0 1 0 7 50 -1 -1 0.000 0 0 0 0 6147.000 5410.000 6010 5410 6147 5273 6284 5410 13 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 8 14 6010 5410 6010 5501 5919 5501 5919 5775 6375 5775 6375 5501 15 6284 5501 6284 5410 16 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 4 17 6102 5410 6102 5501 6192 5501 6192 5410 18 -6 19 6 7442 6525 7875 6900 14 20 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 15 3525 2833 3466 314921 7501 6584 7442 6900 16 22 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 17 3880 2833 3860 295223 7856 6584 7836 6703 18 24 3 2 0 1 0 7 50 -1 -1 0.000 0 0 0 4 19 3505 2952 3623 2912 3761 2971 3860 295225 7481 6703 7599 6663 7737 6722 7836 6703 20 26 0.000 -0.500 -0.500 0.000 21 27 3 2 0 1 0 7 50 -1 -1 0.000 0 0 0 4 22 3527 2828 3645 2789 3783 2848 3881 282828 7503 6579 7621 6540 7759 6599 7857 6579 23 29 0.000 -0.500 -0.500 0.000 24 30 -6 25 6 3599 3074 3974 357431 6 7575 6825 7950 7325 26 32 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 8 27 3599 3199 3724 3074 3974 3074 3974 3574 3599 3574 3599 319928 3724 3199 3724 307433 7575 6950 7700 6825 7950 6825 7950 7325 7575 7325 7575 6950 34 7700 6950 7700 6825 29 35 -6 30 6 5116 2774 5549 314936 6 9092 6525 9525 6900 31 37 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 32 5175 2833 5116 314938 9151 6584 9092 6900 33 39 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 34 5530 2833 5510 295240 9506 6584 9486 6703 35 41 3 2 0 1 0 7 50 -1 -1 0.000 0 0 0 4 36 5155 2952 5273 2912 5411 2971 5510 295242 9131 6703 9249 6663 9387 6722 9486 6703 37 43 0.000 -0.500 -0.500 0.000 38 44 3 2 0 1 0 7 50 -1 -1 0.000 0 0 0 4 39 5177 2828 5295 2789 5433 2848 5531 282845 9153 6579 9271 6540 9409 6599 9507 6579 40 46 0.000 -0.500 -0.500 0.000 41 47 -6 42 6 5249 3074 5625 
357448 6 9225 6825 9600 7325 43 49 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 8 44 5249 3199 5374 3074 5625 3074 5625 3574 5249 3574 5249 319945 5374 3199 5374 307450 9225 6950 9350 6825 9600 6825 9600 7325 9225 7325 9225 6950 51 9350 6950 9350 6825 46 52 -6 47 6 6766 2774 7199 314953 6 10742 6525 11175 6900 48 54 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 49 6825 2833 6766 314955 10801 6584 10742 6900 50 56 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 51 7180 2833 7160 295257 11156 6584 11136 6703 52 58 3 2 0 1 0 7 50 -1 -1 0.000 0 0 0 4 53 6805 2952 6923 2912 7061 2971 7160 295259 10781 6703 10899 6663 11037 6722 11136 6703 54 60 0.000 -0.500 -0.500 0.000 55 61 3 2 0 1 0 7 50 -1 -1 0.000 0 0 0 4 56 6827 2828 6945 2789 7083 2848 7181 282862 10803 6579 10921 6540 11059 6599 11157 6579 57 63 0.000 -0.500 -0.500 0.000 58 64 -6 59 6 6899 3074 7274 357465 6 10875 6825 11250 7325 60 66 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 8 61 6899 3199 7024 3074 7274 3074 7274 3574 6899 3574 6899 3199 62 7024 3199 7024 3074 63 -6 64 6 1875 1500 2331 2025 65 5 1 0 1 0 7 50 -1 -1 0.000 0 0 0 0 2104.000 1660.011 2058 1660 2103 1614 2148 1660 66 5 1 0 1 0 7 50 -1 -1 0.000 0 0 0 0 2104.000 1661.000 1966 1660 2103 1523 2240 1660 67 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 8 68 1966 1660 1966 1751 1875 1751 1875 2025 2331 2025 2331 1751 69 2240 1751 2240 1660 70 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 4 71 2058 1660 2058 1751 2148 1751 2148 1660 67 10875 6950 11000 6825 11250 6825 11250 7325 10875 7325 10875 6950 68 11000 6950 11000 6825 72 69 -6 73 70 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 74 1800 2400 2699 2399 71 5850 6150 6675 6150 72 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 73 5850 5250 6675 5250 6675 6600 5850 6600 5850 5250 74 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 75 1 1 1.00 60.00 120.00 76 7 0 1.00 60.00 60.00 77 7725 6150 7725 6525 78 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 79 1 1 1.00 60.00 120.00 80 7 0 1.00 60.00 60.00 81 9375 6150 9375 6525 82 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 83 1 1 
1.00 60.00 120.00 84 7 0 1.00 60.00 60.00 85 11025 6150 11025 6525 86 2 3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7 87 10500 5854 10763 6308 11288 6308 11550 5854 11288 5400 10763 5400 88 10500 5854 89 2 3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7 90 8850 5854 9113 6308 9638 6308 9900 5854 9638 5400 9113 5400 91 8850 5854 92 2 3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7 93 7200 5854 7463 6308 7988 6308 8250 5854 7988 5400 7463 5400 94 7200 5854 75 95 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 76 96 1 1 1.00 60.00 120.00 77 97 7 1 1.00 60.00 60.00 78 3749 2399 3749 277498 6450 5925 7275 5925 79 99 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 80 100 1 1 1.00 60.00 120.00 81 101 7 1 1.00 60.00 60.00 82 5399 2399 5399 2774102 8025 5925 8925 5925 83 103 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 84 104 1 1 1.00 60.00 120.00 85 105 7 1 1.00 60.00 60.00 86 2550 2175 3299 2174106 9675 5925 10575 5925 87 107 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 88 108 1 1 1.00 60.00 120.00 89 109 7 1 1.00 60.00 60.00 90 4049 2174 4949 2174110 10725 5775 9825 5775 91 111 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 92 112 1 1 1.00 60.00 120.00 93 113 7 1 1.00 60.00 60.00 94 5699 2174 6599 217495 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 114 9075 5775 8175 5775 115 3 2 0 1 0 7 50 -1 -1 0.000 0 1 1 4 96 116 1 1 1.00 60.00 120.00 97 117 7 1 1.00 60.00 60.00 98 6749 2024 5849 2024 99 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 100 1 1 1.00 60.00 120.00 101 7 1 1.00 60.00 60.00 102 5099 2024 4199 2024 103 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 104 1800 1499 2699 1499 2699 2850 1800 2850 1800 1499 105 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 106 4950 1650 5850 1650 5850 2550 4950 2550 4950 1650 107 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 108 3300 1650 4200 1650 4200 2550 3300 2550 3300 1650 109 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 110 6600 1650 7500 1650 7500 2550 6600 2550 6600 1650 111 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 112 1 1 1.00 60.00 120.00 113 7 1 1.00 60.00 60.00 114 7049 2399 7049 2774 115 4 0 0 50 -1 0 11 0.0000 2 120 
525 1799 3149 Atomic\001 116 4 0 0 50 -1 0 11 0.0000 2 120 510 1799 3374 Pointer\001 117 4 0 0 50 -1 0 11 0.0000 2 180 765 3974 2924 Benaphore\001 118 4 0 0 50 -1 0 11 0.0000 2 120 690 4049 3374 Event FD\001 119 4 0 0 50 -1 0 11 0.0000 2 180 765 5625 2924 Benaphore\001 120 4 0 0 50 -1 0 11 0.0000 2 120 690 5699 3374 Event FD\001 121 4 0 0 50 -1 0 11 0.0000 2 180 765 7274 2924 Benaphore\001 122 4 0 0 50 -1 0 11 0.0000 2 120 690 7349 3374 Event FD\001 123 4 2 0 50 -1 0 11 0.0000 2 135 585 1725 1800 Idle List\001 124 4 2 0 50 -1 0 11 0.0000 2 135 360 1725 1950 Lock\001 125 4 1 0 50 -1 0 11 0.0000 2 135 585 2250 1425 Idle List\001 126 4 1 0 50 -1 0 11 0.0000 2 135 1020 3750 1575 Idle Processor\001 127 4 1 0 50 -1 0 11 0.0000 2 135 1020 5400 1575 Idle Processor\001 128 4 1 0 50 -1 0 11 0.0000 2 135 1020 7050 1575 Idle Processor\001 118 6300 6375 6375 6825 6750 7050 7350 6975 119 0.000 -0.500 -0.500 0.000 120 4 0 0 50 -1 0 11 0.0000 2 135 810 5925 5175 Idle List\001 121 4 0 0 50 -1 0 11 0.0000 2 135 810 5175 5550 Idle List\001 122 4 0 0 50 -1 0 11 0.0000 2 135 360 5325 5700 Lock\001 123 4 0 0 50 -1 0 11 0.0000 2 135 540 5775 6900 Atomic\001 124 4 0 0 50 -1 0 11 0.0000 2 135 630 5775 7125 Pointer\001 125 4 0 0 50 -1 0 11 0.0000 2 165 810 7950 6675 Benaphore\001 126 4 0 0 50 -1 0 11 0.0000 2 135 720 8025 7125 Event FD\001 127 4 0 0 50 -1 0 11 0.0000 2 135 1260 7275 5325 Idle Processor\001 128 4 0 0 50 -1 0 11 0.0000 2 165 810 9600 6675 Benaphore\001 129 4 0 0 50 -1 0 11 0.0000 2 135 720 9675 7125 Event FD\001 130 4 0 0 50 -1 0 11 0.0000 2 135 1260 8925 5325 Idle Processor\001 131 4 0 0 50 -1 0 11 0.0000 2 165 810 11250 6675 Benaphore\001 132 4 0 0 50 -1 0 11 0.0000 2 135 720 11325 7125 Event FD\001 133 4 0 0 50 -1 0 11 0.0000 2 135 1260 10575 5325 Idle Processor\001 -
doc/theses/thierry_delisle_PhD/thesis/fig/idle1.fig
rffec1bf r9e23b446 8 8 -2 9 9 1200 2 10 6 1875 1500 2331 202511 5 1 0 1 0 7 50 -1 -1 0.000 0 0 0 0 2104.000 1660.011 2058 1660 2103 1614 2148 166012 5 1 0 1 0 7 50 -1 -1 0.000 0 0 0 0 2104.000 1661.000 1966 1660 2103 1523 2240 166010 6 5919 5250 6375 5775 11 5 1 0 1 0 7 50 -1 -1 0.000 0 0 0 0 6147.000 5409.011 6102 5410 6147 5364 6192 5410 12 5 1 0 1 0 7 50 -1 -1 0.000 0 0 0 0 6147.000 5410.000 6010 5410 6147 5273 6284 5410 13 13 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 8 14 1966 1660 1966 1751 1875 1751 1875 2025 2331 2025 2331 175115 2240 1751 2240 166014 6010 5410 6010 5501 5919 5501 5919 5775 6375 5775 6375 5501 15 6284 5501 6284 5410 16 16 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 4 17 2058 1660 2058 1751 2148 1751 2148 166017 6102 5410 6102 5501 6192 5501 6192 5410 18 18 -6 19 6 3599 2774 3974 327419 6 7575 6525 7950 7025 20 20 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 8 21 3599 2899 3724 2774 3974 2774 3974 3274 3599 3274 3599 289922 3724 2899 3724 277421 7575 6650 7700 6525 7950 6525 7950 7025 7575 7025 7575 6650 22 7700 6650 7700 6525 23 23 -6 24 6 5249 2774 5625 327424 6 9225 6525 9600 7025 25 25 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 8 26 5249 2899 5374 2774 5625 2774 5625 3274 5249 3274 5249 289927 5374 2899 5374 277426 9225 6650 9350 6525 9600 6525 9600 7025 9225 7025 9225 6650 27 9350 6650 9350 6525 28 28 -6 29 6 6899 2774 7274 327429 6 10875 6525 11250 7025 30 30 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 8 31 6899 2899 7024 2774 7274 2774 7274 3274 6899 3274 6899 289932 7024 2899 7024 277431 10875 6650 11000 6525 11250 6525 11250 7025 10875 7025 10875 6650 32 11000 6650 11000 6525 33 33 -6 34 34 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 35 35 1 1 1.00 60.00 120.00 36 7 1 1.00 60.00 60.00 37 3749 2399 3749 2774 36 7 0 1.00 60.00 60.00 37 7725 6150 7725 6525 38 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 39 1 1 1.00 60.00 120.00 40 7 0 1.00 60.00 60.00 41 9375 6150 9375 6525 42 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 43 1 1 1.00 60.00 120.00 44 7 0 1.00 60.00 60.00 45 11025 
6150 11025 6525 46 2 3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7 47 10500 5854 10763 6308 11288 6308 11550 5854 11288 5400 10763 5400 48 10500 5854 49 2 3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7 50 8850 5854 9113 6308 9638 6308 9900 5854 9638 5400 9113 5400 51 8850 5854 52 2 3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7 53 7200 5854 7463 6308 7988 6308 8250 5854 7988 5400 7463 5400 54 7200 5854 38 55 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 39 56 1 1 1.00 60.00 120.00 40 57 7 1 1.00 60.00 60.00 41 5399 2399 5399 277458 6450 5925 7275 5925 42 59 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 43 60 1 1 1.00 60.00 120.00 44 61 7 1 1.00 60.00 60.00 45 7049 2399 7049 277462 8025 5925 8925 5925 46 63 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 47 64 1 1 1.00 60.00 120.00 48 65 7 1 1.00 60.00 60.00 49 2550 2175 3299 217466 9675 5925 10575 5925 50 67 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 51 68 1 1 1.00 60.00 120.00 52 69 7 1 1.00 60.00 60.00 53 4049 2174 4949 217470 10725 5775 9825 5775 54 71 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 55 72 1 1 1.00 60.00 120.00 56 73 7 1 1.00 60.00 60.00 57 5699 2174 6599 2174 58 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 59 1 1 1.00 60.00 120.00 60 7 1 1.00 60.00 60.00 61 6749 2024 5849 2024 62 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 63 1 1 1.00 60.00 120.00 64 7 1 1.00 60.00 60.00 65 5099 2024 4199 2024 74 9075 5775 8175 5775 66 75 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 67 4950 1650 5850 1650 5850 2550 4950 2550 4950 1650 68 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 69 3300 1650 4200 1650 4200 2550 3300 2550 3300 1650 70 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 71 6600 1650 7500 1650 7500 2550 6600 2550 6600 1650 72 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 73 1800 1499 2699 1499 2699 2400 1800 2400 1800 1499 74 4 2 0 50 -1 0 11 0.0000 2 135 585 1725 1800 Idle List\001 75 4 2 0 50 -1 0 11 0.0000 2 135 360 1725 1950 Lock\001 76 4 1 0 50 -1 0 11 0.0000 2 135 585 2250 1425 Idle List\001 77 4 1 0 50 -1 0 11 0.0000 2 135 1020 3750 1575 Idle Processor\001 78 4 1 0 50 -1 0 11 0.0000 
2 135 1020 5400 1575 Idle Processor\001 79 4 1 0 50 -1 0 11 0.0000 2 135 1020 7050 1575 Idle Processor\001 80 4 0 0 50 -1 0 11 0.0000 2 120 690 4049 3074 Event FD\001 81 4 0 0 50 -1 0 11 0.0000 2 120 690 5699 3074 Event FD\001 82 4 0 0 50 -1 0 11 0.0000 2 120 690 7349 3074 Event FD\001 76 5850 5250 6675 5250 6675 6075 5850 6075 5850 5250 77 4 0 0 50 -1 0 11 0.0000 2 135 810 5925 5175 Idle List\001 78 4 0 0 50 -1 0 11 0.0000 2 135 810 5175 5550 Idle List\001 79 4 0 0 50 -1 0 11 0.0000 2 135 360 5325 5700 Lock\001 80 4 0 0 50 -1 0 11 0.0000 2 135 1260 7275 5325 Idle Processor\001 81 4 0 0 50 -1 0 11 0.0000 2 135 1260 8925 5325 Idle Processor\001 82 4 0 0 50 -1 0 11 0.0000 2 135 1260 10575 5325 Idle Processor\001 83 4 0 0 50 -1 0 11 0.0000 2 135 720 8025 6825 Event FD\001 84 4 0 0 50 -1 0 11 0.0000 2 135 720 9675 6825 Event FD\001 85 4 0 0 50 -1 0 11 0.0000 2 135 720 11325 6825 Event FD\001 -
doc/theses/thierry_delisle_PhD/thesis/fig/idle2.fig
rffec1bf r9e23b446 8 8 -2 9 9 1200 2 10 5 1 0 1 0 7 50 -1 -1 0.000 0 1 1 1 3150.000 2106.250 2250 2625 2775 3075 3525 3075 11 1 1 1.00 60.00 120.00 12 7 1 1.00 60.00 60.00 13 6 1875 1500 2331 2025 14 5 1 0 1 0 7 50 -1 -1 0.000 0 0 0 0 2104.000 1660.011 2058 1660 2103 1614 2148 1660 15 5 1 0 1 0 7 50 -1 -1 0.000 0 0 0 0 2104.000 1661.000 1966 1660 2103 1523 2240 1660 10 6 5919 5250 6375 5775 11 5 1 0 1 0 7 50 -1 -1 0.000 0 0 0 0 6147.000 5409.011 6102 5410 6147 5364 6192 5410 12 5 1 0 1 0 7 50 -1 -1 0.000 0 0 0 0 6147.000 5410.000 6010 5410 6147 5273 6284 5410 16 13 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 8 17 1966 1660 1966 1751 1875 1751 1875 2025 2331 2025 2331 175118 2240 1751 2240 166014 6010 5410 6010 5501 5919 5501 5919 5775 6375 5775 6375 5501 15 6284 5501 6284 5410 19 16 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 4 20 2058 1660 2058 1751 2148 1751 2148 166017 6102 5410 6102 5501 6192 5501 6192 5410 21 18 -6 22 6 3599 2774 3974 327419 6 7575 6525 7950 7025 23 20 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 8 24 3599 2899 3724 2774 3974 2774 3974 3274 3599 3274 3599 289925 3724 2899 3724 277421 7575 6650 7700 6525 7950 6525 7950 7025 7575 7025 7575 6650 22 7700 6650 7700 6525 26 23 -6 27 6 5249 2774 5625 327424 6 9225 6525 9600 7025 28 25 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 8 29 5249 2899 5374 2774 5625 2774 5625 3274 5249 3274 5249 289930 5374 2899 5374 277426 9225 6650 9350 6525 9600 6525 9600 7025 9225 7025 9225 6650 27 9350 6650 9350 6525 31 28 -6 32 6 6899 2774 7274 327429 6 10875 6525 11250 7025 33 30 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 8 34 6899 2899 7024 2774 7274 2774 7274 3274 6899 3274 6899 289935 7024 2899 7024 277431 10875 6650 11000 6525 11250 6525 11250 7025 10875 7025 10875 6650 32 11000 6650 11000 6525 36 33 -6 37 34 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 38 1800 2400 2699 2399 35 5850 6150 6675 6150 36 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 37 5850 5250 6675 5250 6675 6600 5850 6600 5850 5250 38 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 39 1 1 1.00 
60.00 120.00 40 7 0 1.00 60.00 60.00 41 7725 6150 7725 6525 42 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 43 1 1 1.00 60.00 120.00 44 7 0 1.00 60.00 60.00 45 9375 6150 9375 6525 46 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 47 1 1 1.00 60.00 120.00 48 7 0 1.00 60.00 60.00 49 11025 6150 11025 6525 50 2 3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7 51 10500 5854 10763 6308 11288 6308 11550 5854 11288 5400 10763 5400 52 10500 5854 53 2 3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7 54 8850 5854 9113 6308 9638 6308 9900 5854 9638 5400 9113 5400 55 8850 5854 56 2 3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7 57 7200 5854 7463 6308 7988 6308 8250 5854 7988 5400 7463 5400 58 7200 5854 39 59 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 40 60 1 1 1.00 60.00 120.00 41 61 7 1 1.00 60.00 60.00 42 3749 2399 3749 277462 6450 5925 7275 5925 43 63 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 44 64 1 1 1.00 60.00 120.00 45 65 7 1 1.00 60.00 60.00 46 5399 2399 5399 277466 8025 5925 8925 5925 47 67 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 48 68 1 1 1.00 60.00 120.00 49 69 7 1 1.00 60.00 60.00 50 7049 2399 7049 277470 9675 5925 10575 5925 51 71 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 52 72 1 1 1.00 60.00 120.00 53 73 7 1 1.00 60.00 60.00 54 2550 2175 3299 217474 10725 5775 9825 5775 55 75 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 56 76 1 1 1.00 60.00 120.00 57 77 7 1 1.00 60.00 60.00 58 4049 2174 4949 217459 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 78 9075 5775 8175 5775 79 3 2 0 1 0 7 50 -1 -1 0.000 0 1 1 4 60 80 1 1 1.00 60.00 120.00 61 81 7 1 1.00 60.00 60.00 62 5699 2174 6599 2174 63 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 64 1 1 1.00 60.00 120.00 65 7 1 1.00 60.00 60.00 66 6749 2024 5849 2024 67 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 68 1 1 1.00 60.00 120.00 69 7 1 1.00 60.00 60.00 70 5099 2024 4199 2024 71 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 72 1800 1499 2699 1499 2699 2850 1800 2850 1800 1499 73 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 74 4950 1650 5850 1650 5850 2550 4950 2550 4950 1650 75 2 2 0 1 0 7 50 -1 -1 0.000 0 0 
-1 0 0 5 76 3300 1650 4200 1650 4200 2550 3300 2550 3300 1650 77 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 78 6600 1650 7500 1650 7500 2550 6600 2550 6600 1650 79 4 0 0 50 -1 0 11 0.0000 2 120 525 1799 3149 Atomic\001 80 4 0 0 50 -1 0 11 0.0000 2 120 510 1799 3374 Pointer\001 81 4 2 0 50 -1 0 11 0.0000 2 135 585 1725 1800 Idle List\001 82 4 2 0 50 -1 0 11 0.0000 2 135 360 1725 1950 Lock\001 83 4 1 0 50 -1 0 11 0.0000 2 135 585 2250 1425 Idle List\001 84 4 1 0 50 -1 0 11 0.0000 2 135 1020 3750 1575 Idle Processor\001 85 4 1 0 50 -1 0 11 0.0000 2 135 1020 5400 1575 Idle Processor\001 86 4 1 0 50 -1 0 11 0.0000 2 135 1020 7050 1575 Idle Processor\001 87 4 0 0 50 -1 0 11 0.0000 2 120 690 4049 3074 Event FD\001 88 4 0 0 50 -1 0 11 0.0000 2 120 690 5699 3074 Event FD\001 89 4 0 0 50 -1 0 11 0.0000 2 120 690 7349 3074 Event FD\001 82 6300 6375 6375 6825 6900 6975 7500 6750 83 0.000 -0.500 -0.500 0.000 84 4 0 0 50 -1 0 11 0.0000 2 135 810 5925 5175 Idle List\001 85 4 0 0 50 -1 0 11 0.0000 2 135 810 5175 5550 Idle List\001 86 4 0 0 50 -1 0 11 0.0000 2 135 360 5325 5700 Lock\001 87 4 0 0 50 -1 0 11 0.0000 2 135 540 5775 6900 Atomic\001 88 4 0 0 50 -1 0 11 0.0000 2 135 630 5775 7125 Pointer\001 89 4 0 0 50 -1 0 11 0.0000 2 135 1260 7275 5325 Idle Processor\001 90 4 0 0 50 -1 0 11 0.0000 2 135 1260 8925 5325 Idle Processor\001 91 4 0 0 50 -1 0 11 0.0000 2 135 1260 10575 5325 Idle Processor\001 92 4 0 0 50 -1 0 11 0.0000 2 135 720 8025 6825 Event FD\001 93 4 0 0 50 -1 0 11 0.0000 2 135 720 9675 6825 Event FD\001 94 4 0 0 50 -1 0 11 0.0000 2 135 720 11325 6825 Event FD\001 -
doc/theses/thierry_delisle_PhD/thesis/fig/idle_state.fig
rffec1bf r9e23b446 8 8 -2 9 9 1200 2 10 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 3 000 3600 600 600 3000 3600 2400 360011 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 1800 1800 600 600 1800 1800 1200 180012 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 4205 1800 600 600 4205 1800 3605 180010 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 3900 3600 571 571 3900 3600 3375 3375 11 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 6300 3600 605 605 6300 3600 5775 3300 12 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 5100 5400 600 600 5100 5400 4500 5400 13 13 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 14 1 11.00 60.00 120.0015 2100 2325 2625 315014 0 0 1.00 60.00 120.00 15 4200 4125 4725 4950 16 16 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 17 1 11.00 60.00 120.0018 2400 1800 3600 180017 0 0 1.00 60.00 120.00 18 4500 3600 5700 3600 19 19 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 20 1 11.00 60.00 120.0021 3900 2325 3375 315022 4 1 0 50 -1 0 11 0.0000 2 1 20 675 3000 3675 AWAKE\00123 4 1 0 50 -1 0 11 0.0000 2 1 20 525 4200 1875 SLEEP\00124 4 1 0 50 -1 0 11 0.0000 2 1 20 720 1800 1875 SEARCH\00125 4 2 0 50 -1 0 11 0.0000 2 120 720 2250 2850 CANCEL\00126 4 1 0 50 -1 0 11 0.0000 2 120 840 2925 1650 CONFIRM\00127 4 0 0 50 -1 0 11 0.0000 2 120 540 3750 2850 WAKE\00120 0 0 1.00 60.00 120.00 21 5923 4125 5475 4875 22 4 1 0 50 -1 0 11 0.0000 2 135 450 5100 5475 AWAKE\001 23 4 1 0 50 -1 0 11 0.0000 2 135 450 6300 3675 SLEEP\001 24 4 1 0 50 -1 0 11 0.0000 2 135 540 3900 3675 SEARCH\001 25 4 0 0 50 -1 0 11 0.0000 2 135 360 5775 4650 WAKE\001 26 4 2 0 50 -1 0 11 0.0000 2 135 540 4350 4650 CANCEL\001 27 4 1 0 50 -1 0 11 0.0000 2 135 630 5025 3450 CONFIRM\001 -
doc/theses/thierry_delisle_PhD/thesis/fig/io_uring.fig
rffec1bf r9e23b446 8 8 -2 9 9 1200 2 10 6 675 3105 2520 337510 6 180 3240 2025 3510 11 11 2 1 0 1 0 7 40 -1 -1 0.000 0 0 -1 0 0 2 12 1215 3105 1215 337512 720 3240 720 3510 13 13 2 1 0 1 0 7 40 -1 -1 0.000 0 0 -1 0 0 2 14 945 3105 945 337514 450 3240 450 3510 15 15 2 2 0 1 0 7 45 -1 20 0.000 0 0 -1 0 0 5 16 675 3105 1755 3105 1755 3375 675 3375 675 310516 180 3240 1260 3240 1260 3510 180 3510 180 3240 17 17 2 1 0 1 0 7 40 -1 -1 0.000 0 0 -1 0 0 2 18 1485 3105 1485 337519 4 0 0 40 -1 0 12 0.0000 2 165 9 30 1530 3285{\\small S3}\00120 4 0 0 40 -1 0 12 0.0000 2 165 9 30 1260 3285{\\small S2}\00121 4 0 0 40 -1 0 12 0.0000 2 165 9 30 720 3285{\\small S0}\00122 4 0 0 40 -1 0 12 0.0000 2 165 9 30 990 3285{\\small S1}\00118 990 3240 990 3510 19 4 0 0 40 -1 0 12 0.0000 2 165 990 1035 3420 {\\small S3}\001 20 4 0 0 40 -1 0 12 0.0000 2 165 990 765 3420 {\\small S2}\001 21 4 0 0 40 -1 0 12 0.0000 2 165 990 225 3420 {\\small S0}\001 22 4 0 0 40 -1 0 12 0.0000 2 165 990 495 3420 {\\small S1}\001 23 23 -6 24 6 2025 2475 3735 400525 5 1 0 1 0 7 35 -1 -1 0.000 0 1 1 0 2 950.714 3240.000 2385 2565 2070 3240 2385 391524 6 1530 2610 3240 4140 25 5 1 0 1 0 7 35 -1 -1 0.000 0 1 1 0 2455.714 3375.000 1890 2700 1575 3375 1890 4050 26 26 1 1 1.00 60.00 120.00 27 1 3 0 1 0 7 40 -1 20 0.000 1 0.0000 2 970 3240 315 315 2970 3240 3285 324028 1 3 0 1 0 7 50 -1 20 0.000 1 0.0000 2 970 3240 765 765 2970 3240 3735 324027 1 3 0 1 0 7 40 -1 20 0.000 1 0.0000 2475 3375 315 315 2475 3375 2790 3375 28 1 3 0 1 0 7 50 -1 20 0.000 1 0.0000 2475 3375 765 765 2475 3375 3240 3375 29 29 2 1 0 1 0 7 45 -1 -1 0.000 0 0 -1 0 0 2 30 2 970 3240 2628 255530 2475 3375 2133 2690 31 31 2 1 0 1 0 7 45 -1 -1 4.000 0 0 -1 0 0 2 32 2 970 3240 2264 295832 2475 3375 1769 3093 33 33 2 1 0 1 0 7 45 -1 -1 4.000 0 0 -1 0 0 2 34 2 970 3240 2264 352634 2475 3375 1769 3661 35 35 2 1 0 1 0 7 45 -1 -1 4.000 0 0 -1 0 0 2 36 2 970 3240 2628 392236 2475 3375 2133 4057 37 37 2 1 1 1 0 7 35 -1 0 4.000 0 0 -1 0 0 2 38 2 700 3240 3240 
324038 2205 3375 2745 3375 39 39 -6 40 6 1080 2115 1980 247541 4 2 0 50 -1 0 12 0.0000 2 135 9 45 1980 2250Submission\00142 4 2 0 50 -1 0 12 0.0000 2 1 80 405 1980 2445Ring\00140 6 585 2250 1485 2610 41 4 2 0 50 -1 0 12 0.0000 2 135 900 1485 2385 Submission\001 42 4 2 0 50 -1 0 12 0.0000 2 165 360 1485 2580 Ring\001 43 43 -6 44 6 4095 2475 5760 400545 5 1 0 1 0 7 35 -1 -1 0.000 0 1 1 0 4 879.000 3240.000 5445 3915 5760 3240 5445 256544 6 3600 2610 5265 4140 45 5 1 0 1 0 7 35 -1 -1 0.000 0 1 1 0 4384.000 3375.000 4950 4050 5265 3375 4950 2700 46 46 1 1 1.00 60.00 120.00 47 1 3 0 1 0 7 40 -1 20 0.000 1 3.1416 4 860 3240 315 315 4860 3240 4545 324048 1 3 0 1 0 7 50 -1 20 0.000 1 3.1416 4 860 3240 765 765 4860 3240 4095 324047 1 3 0 1 0 7 40 -1 20 0.000 1 3.1416 4365 3375 315 315 4365 3375 4050 3375 48 1 3 0 1 0 7 50 -1 20 0.000 1 3.1416 4365 3375 765 765 4365 3375 3600 3375 49 49 2 1 0 1 0 7 45 -1 -1 0.000 0 0 -1 0 0 2 50 4 860 3240 5202 392550 4365 3375 4707 4060 51 51 2 1 0 1 0 7 45 -1 -1 4.000 0 0 -1 0 0 2 52 4 860 3240 5566 352252 4365 3375 5071 3657 53 53 2 1 0 1 0 7 45 -1 -1 4.000 0 0 -1 0 0 2 54 4 860 3240 5566 295454 4365 3375 5071 3089 55 55 2 1 0 1 0 7 45 -1 -1 4.000 0 0 -1 0 0 2 56 4 860 3240 5202 255856 4365 3375 4707 2693 57 57 2 1 1 1 0 7 35 -1 0 4.000 0 0 -1 0 0 2 58 5130 3240 4590 324058 4635 3375 4095 3375 59 59 -6 60 6 5 850 2115 6750 247561 4 0 0 50 -1 0 12 0.0000 2 1 80 405 5850 2445Ring\00162 4 0 0 50 -1 0 12 0.0000 2 1 80 975 5850 2250Completion\00160 6 5355 2250 6255 2610 61 4 0 0 50 -1 0 12 0.0000 2 165 360 5355 2580 Ring\001 62 4 0 0 50 -1 0 12 0.0000 2 165 900 5355 2385 Completion\001 63 63 -6 64 64 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 65 65 1 1 1.00 60.00 120.00 66 3420 1890 3045 235166 2925 2025 2550 2486 67 67 2 1 0 1 0 7 50 -1 -1 4.000 0 0 -1 1 0 2 68 68 1 1 1.00 60.00 120.00 69 4 770 2340 4320 189069 4275 2475 3825 2025 70 70 2 1 0 1 0 7 50 -1 -1 4.000 0 0 -1 1 0 2 71 71 1 1 1.00 60.00 120.00 72 3060 4095 3600 441072 2751 4268 3066 
4538 73 73 2 1 0 1 0 7 50 -1 -1 4.000 0 0 -1 1 0 2 74 74 1 1 1.00 60.00 120.00 75 4275 4410 4770 409575 3780 4545 4275 4230 76 76 2 1 1 1 0 7 55 -1 -1 4.000 0 0 -1 0 0 2 77 495 3240 6750 324078 4 0 0 35 -1 0 12 0.0000 2 165 11 40 2340 2925{\\small \\&S2}\00179 4 0 0 50 -1 0 12 0.0000 6 135 390 3285 2430 Push\00180 4 0 0 50 -1 0 12 0.0000 6 135 330 2520 2430 Tail\00181 4 0 0 35 -1 0 12 0.0000 2 165 960 5130 2925 {\\small C0}\00182 4 0 0 35 -1 0 12 0.0000 2 165 960 5310 3285 {\\small C1}\00183 4 0 0 35 -1 0 12 0.0000 2 165 960 5130 3645 {\\small C2}\00184 4 0 0 50 -1 0 12 0.0000 4 135 330 5220 4140Tail\00185 4 0 0 50 -1 0 12 0.0000 6 135 420 5085 2430 Head\00186 4 0 0 50 -1 0 12 0.0000 2 135 960 6030 3150 Kernel Line\00187 4 0 0 50 -1 0 12 0.0000 2 135 105 495 3150 S\00188 4 0 0 35 -1 0 12 0.0000 2 165 1140 2385 3645 {\\small \\&S0}\00189 4 0 0 50 -1 0 12 0.0000 6 135 420 2340 4140Head\00190 4 0 0 35 -1 0 12 0.0000 2 165 1140 2250 3285 {\\small \\&S3}\00191 4 2 0 50 -1 0 12 0.0000 4 135 390 4500 4140 Push\00192 4 1 0 50 -1 0 12 0.0000 2 180 1 290 3915 4680 {\\Large Kernel}\00193 4 0 0 50 -1 0 12 0.0000 6 1 80 315 3285 4140Pop\00194 4 1 0 50 -1 0 12 0.0000 2 180 1725 3915 1755 {\\Large Application}\00195 4 2 0 50 -1 0 12 0.0000 6 180 315 4545 2430 Pop\00177 0 3375 6255 3375 78 4 0 0 35 -1 0 12 0.0000 2 165 1170 1845 3060 {\\small \\&S2}\001 79 4 0 0 35 -1 0 12 0.0000 2 165 1170 1755 3420 {\\small \\&S3}\001 80 4 0 0 35 -1 0 12 0.0000 2 165 1170 1890 3735 {\\small \\&S0}\001 81 4 0 0 50 -1 0 12 0.0000 6 135 360 2790 2565 Push\001 82 4 0 0 50 -1 0 12 0.0000 6 165 270 2880 4230 Pop\001 83 4 0 0 50 -1 0 12 0.0000 6 135 360 2025 4275 Head\001 84 4 0 0 50 -1 0 12 0.0000 6 135 360 2025 2565 Tail\001 85 4 0 0 35 -1 0 12 0.0000 2 165 990 4635 3060 {\\small C0}\001 86 4 0 0 35 -1 0 12 0.0000 2 165 990 4815 3420 {\\small C1}\001 87 4 0 0 35 -1 0 12 0.0000 2 165 990 4635 3780 {\\small C2}\001 88 4 0 0 50 -1 0 12 0.0000 4 135 360 4725 4275 Tail\001 89 4 0 0 50 -1 0 12 0.0000 6 135 
360 4590 2565 Head\001 90 4 0 0 50 -1 0 12 0.0000 2 135 990 5535 3285 Kernel Line\001 91 4 1 0 50 -1 0 12 0.0000 2 180 1350 3375 4815 {\\Large Kernel}\001 92 4 1 0 50 -1 0 12 0.0000 2 180 1800 3375 1845 {\\Large Application}\001 93 4 0 0 50 -1 0 12 0.0000 6 165 270 3690 2565 Pop\001 94 4 0 0 50 -1 0 12 0.0000 4 135 360 3465 4230 Push\001 95 4 0 0 50 -1 0 12 0.0000 2 135 90 0 3285 S\001 -
doc/theses/thierry_delisle_PhD/thesis/fig/system.fig
rffec1bf r9e23b446 49 49 7800 3750 8025 3750 50 50 -6 51 6 4125 4725 4950 495052 1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 4250 4838 100 100 4250 4838 4350 483853 4 0 -1 0 0 0 12 0.0000 2 135 510 4425 4875 thread\00154 -655 6 5175 4725 6300 495056 2 2 0 1 -1 -1 0 0 -1 0.000 0 0 0 0 0 557 5400 4950 5400 4725 5175 4725 5175 4950 5400 495058 4 0 -1 0 0 0 12 0.0000 2 135 765 5475 4875 processor\00159 -660 6 6600 4725 7500 495061 2 2 1 1 -1 -1 0 0 -1 3.000 0 0 0 0 0 562 6825 4950 6600 4950 6600 4725 6825 4725 6825 495063 4 0 -1 0 0 0 12 0.0000 2 135 540 6900 4875 cluster\00164 -665 6 2175 4725 3975 495066 1 3 0 1 0 0 0 0 0 0.000 1 0.0000 2250 4830 30 30 2250 4830 2280 483067 4 0 -1 0 0 0 12 0.0000 2 180 1605 2325 4875 generator/coroutine\00168 -669 6 1575 2550 2775 390070 2 2 0 1 -1 -1 0 0 -1 0.000 0 0 0 0 0 571 2400 3450 2400 3000 1950 3000 1950 3450 2400 345072 4 1 -1 0 0 0 12 0.0000 2 135 1170 2175 2700 Discrete-event\00173 4 1 -1 0 0 0 12 0.0000 2 180 720 2175 2925 Manager\00174 4 1 -1 0 0 0 12 0.0000 2 180 930 2175 3675 preemption\00175 4 1 -1 0 0 0 12 0.0000 2 135 630 2175 3900 timeout\00176 -677 51 1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 5550 2625 150 150 5550 2625 5700 2625 78 52 1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 5550 3225 150 150 5550 3225 5700 3225 … … 88 62 1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 3975 2850 150 150 3975 2850 4125 2850 89 63 1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 7200 2775 150 150 7200 2775 7350 2775 64 1 3 0 1 0 0 0 0 0 0.000 1 0.0000 2250 4830 30 30 2250 4830 2280 4830 90 65 1 3 0 1 0 0 0 0 0 0.000 1 0.0000 7200 2775 30 30 7200 2775 7230 2805 91 66 1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 3525 3600 150 150 3525 3600 3675 3600 67 1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 4625 4838 100 100 4625 4838 4725 4838 68 2 2 0 1 -1 -1 0 0 -1 0.000 0 0 0 0 0 5 69 2400 4200 2400 3750 1950 3750 1950 4200 2400 4200 92 70 2 2 1 1 -1 -1 0 0 -1 4.000 0 0 0 0 0 5 93 71 6300 4500 6300 1800 3000 1800 3000 4500 6300 4500 … … 157 135 1 1 1.00 45.00 90.00 158 136 7875 3750 7875 2325 7200 
2325 7200 2550 159 4 1 -1 0 0 0 12 0.0000 2 135 840 5550 4425 Processors\001 160 4 1 -1 0 0 0 12 0.0000 2 180 1215 4200 3975 Ready Threads\001 161 4 1 -1 0 0 0 12 0.0000 2 165 1275 7350 1725 Other Cluster(s)\001 162 4 1 -1 0 0 0 12 0.0000 2 135 990 4650 1725 User Cluster\001 163 4 1 -1 0 0 0 12 0.0000 2 135 1380 4200 3225 Blocked Threads\001 137 2 2 1 1 -1 -1 0 0 -1 3.000 0 0 0 0 0 5 138 6975 4950 6750 4950 6750 4725 6975 4725 6975 4950 139 2 2 0 1 -1 -1 0 0 -1 0.000 0 0 0 0 0 5 140 5850 4950 5850 4725 5625 4725 5625 4950 5850 4950 141 4 1 -1 0 0 0 10 0.0000 2 135 900 5550 4425 Processors\001 142 4 1 -1 0 0 0 10 0.0000 2 165 1170 4200 3975 Ready Threads\001 143 4 1 -1 0 0 0 10 0.0000 2 165 1440 7350 1725 Other Cluster(s)\001 144 4 1 -1 0 0 0 10 0.0000 2 135 1080 4650 1725 User Cluster\001 145 4 1 -1 0 0 0 10 0.0000 2 165 630 2175 3675 Manager\001 146 4 1 -1 0 0 0 10 0.0000 2 135 1260 2175 3525 Discrete-event\001 147 4 1 -1 0 0 0 10 0.0000 2 150 900 2175 4350 preemption\001 148 4 0 -1 0 0 0 10 0.0000 2 135 630 7050 4875 cluster\001 149 4 1 -1 0 0 0 10 0.0000 2 135 1350 4200 3225 Blocked Threads\001 150 4 0 -1 0 0 0 10 0.0000 2 135 540 4800 4875 thread\001 151 4 0 -1 0 0 0 10 0.0000 2 120 810 5925 4875 processor\001 152 4 0 -1 0 0 0 10 0.0000 2 165 1710 2325 4875 generator/coroutine\001 -
doc/theses/thierry_delisle_PhD/thesis/local.bib
rffec1bf r9e23b446 2 2 % Cforall 3 3 @misc{cfa:frontpage, 4 howpublished = {\href{https://cforall.uwaterloo.ca}{https://\-cforall.uwaterloo.ca}}4 url = {https://cforall.uwaterloo.ca/} 5 5 } 6 6 @article{cfa:typesystem, … … 481 481 @misc{MAN:linux/cfs, 482 482 title = {{CFS} Scheduler - The Linux Kernel documentation}, 483 howpublished = {\href{https://www.kernel.org/doc/html/latest/scheduler/sched-design-CFS.html}{https://\-www.kernel.org/\-doc/\-html/\-latest/\-scheduler/\-sched-design-CFS.html}}483 url = {https://www.kernel.org/doc/html/latest/scheduler/sched-design-CFS.html} 484 484 } 485 485 … … 489 489 year = {2019}, 490 490 month = {February}, 491 howpublished = {\href{https://opensource.com/article/19/2/fair-scheduling-linux}{https://\-opensource.com/\-article/\-19/2\-/\-fair-scheduling-linux}}491 url = {https://opensource.com/article/19/2/fair-scheduling-linux} 492 492 } 493 493 … … 499 499 } 500 500 501 @ misc{MAN:linux/cfs/balancing,501 @article{MAN:linux/cfs/balancing, 502 502 title={Reworking {CFS} load balancing}, 503 journal={LWN article}, 504 year={2019}, 505 howpublished = {\href{https://lwn.net/Articles/793427}{https://\-lwn.net/\-Articles/\-793427}}, 503 journal={LWN article, available at: https://lwn.net/Articles/793427/}, 504 year={2013} 506 505 } 507 506 … … 524 523 title = {Mach Scheduling and Thread Interfaces - Kernel Programming Guide}, 525 524 organization = {Apple Inc.}, 526 howPublish = {\href{https://developer.apple.com/library/archive/documentation/Darwin/Conceptual/KernelProgramming/scheduler/scheduler.html}{https://developer.apple.com/library/archive/documentation/Darwin/Conceptual/KernelProgramming/scheduler/scheduler.html}}525 url = {https://developer.apple.com/library/archive/documentation/Darwin/Conceptual/KernelProgramming/scheduler/scheduler.html} 527 526 } 528 527 … … 537 536 month = {June}, 538 537 series = {Developer Reference}, 539 howpublished = 
{\href{https://www.microsoftpressstore.com/articles/article.aspx?p=2233328&seqNum=7#:~:text=Overview\%20of\%20Windows\%20Scheduling,a\%20phenomenon\%20called\%20processor\%20affinity}{https://\-www.microsoftpressstore.com/\-articles/\-article.aspx?p=2233328&seqNum=7#:~:text=Overview\%20of\%20Windows\%20Scheduling,a\%20phenomenon\%20called\%20processor\%20affinity}}540 } 541 542 @ misc{GITHUB:go,538 url = {https://www.microsoftpressstore.com/articles/article.aspx?p=2233328&seqNum=7#:~:text=Overview\%20of\%20Windows\%20Scheduling,a\%20phenomenon\%20called\%20processor\%20affinity} 539 } 540 541 @online{GITHUB:go, 543 542 title = {GitHub - The Go Programming Language}, 544 543 author = {The Go Programming Language}, 545 howpublished = {\href{https://github.com/golang/go}{https://\-github.com/\-golang/\-go}},544 url = {https://github.com/golang/go}, 546 545 version = {Change-Id: If07f40b1d73b8f276ee28ffb8b7214175e56c24d} 547 546 } … … 552 551 year = {2019}, 553 552 booktitle = {Hydra}, 554 howpublished = {\href{https://www.youtube.com/watch?v=-K11rY57K7k&ab_channel=Hydra}{https://\-www.youtube.com/\-watch?v=-K11rY57K7k&ab_channel=Hydra}}553 url = {https://www.youtube.com/watch?v=-K11rY57K7k&ab_channel=Hydra} 555 554 } 556 555 … … 560 559 year = {2008}, 561 560 booktitle = {Erlang User Conference}, 562 howpublished = {\href{http://www.erlang.se/euc/08/euc_smp.pdf}{http://\-www.erlang.se/\-euc/\-08/\-euc_smp.pdf}} 563 } 561 url = {http://www.erlang.se/euc/08/euc_smp.pdf} 562 } 563 564 564 565 565 566 @manual{MAN:tbb/scheduler, 566 567 title = {Scheduling Algorithm - Intel{\textregistered} Threading Building Blocks Developer Reference}, 567 568 organization = {Intel{\textregistered}}, 568 howpublished = {\href{https://www.threadingbuildingblocks.org/docs/help/reference/task_scheduler/scheduling_algorithm.html}{https://\-www.threadingbuildingblocks.org/\-docs/\-help/\-reference/\-task\_scheduler/\-scheduling\_algorithm.html}}569 url = 
{https://www.threadingbuildingblocks.org/docs/help/reference/task_scheduler/scheduling_algorithm.html} 569 570 } 570 571 … … 572 573 title = {Quasar Core - Quasar User Manual}, 573 574 organization = {Parallel Universe}, 574 howpublished = {\href{https://docs.paralleluniverse.co/quasar}{https://\-docs.paralleluniverse.co/\-quasar}}575 url = {https://docs.paralleluniverse.co/quasar/} 575 576 } 576 577 @misc{MAN:project-loom, 577 howpublished = {\href{https://www.baeldung.com/openjdk-project-loom}{https://\-www.baeldung.com/\-openjdk-project-loom}}578 url = {https://www.baeldung.com/openjdk-project-loom} 578 579 } 579 580 580 581 @misc{MAN:java/fork-join, 581 howpublished = {\href{https://www.baeldung.com/java-fork-join}{https://\-www.baeldung.com/\-java-fork-join}}582 url = {https://www.baeldung.com/java-fork-join} 582 583 } 583 584 … … 632 633 month = "March", 633 634 version = {0,4}, 634 howpublished = {\ href{https://kernel.dk/io_uring.pdf}{https://\-kernel.dk/\-io\_uring.pdf}}635 howpublished = {\url{https://kernel.dk/io_uring.pdf}} 635 636 } 636 637 … … 641 642 title = "Control theory --- {W}ikipedia{,} The Free Encyclopedia", 642 643 year = "2020", 643 howpublished = {\href{https://en.wikipedia.org/wiki/Task_parallelism}{https://\-en.wikipedia.org/\-wiki/\-Task\_parallelism}},644 url = "https://en.wikipedia.org/wiki/Task_parallelism", 644 645 note = "[Online; accessed 22-October-2020]" 645 646 } … … 649 650 title = "Task parallelism --- {W}ikipedia{,} The Free Encyclopedia", 650 651 year = "2020", 651 howpublished = "\href{https://en.wikipedia.org/wiki/Control_theory}{https://\-en.wikipedia.org/\-wiki/\-Control\_theory}",652 url = "https://en.wikipedia.org/wiki/Control_theory", 652 653 note = "[Online; accessed 22-October-2020]" 653 654 } … … 657 658 title = "Implicit parallelism --- {W}ikipedia{,} The Free Encyclopedia", 658 659 year = "2020", 659 howpublished = 
"\href{https://en.wikipedia.org/wiki/Implicit_parallelism}{https://\-en.wikipedia.org/\-wiki/\-Implicit\_parallelism}",660 url = "https://en.wikipedia.org/wiki/Implicit_parallelism", 660 661 note = "[Online; accessed 23-October-2020]" 661 662 } … … 665 666 title = "Explicit parallelism --- {W}ikipedia{,} The Free Encyclopedia", 666 667 year = "2017", 667 howpublished = "\href{https://en.wikipedia.org/wiki/Explicit_parallelism}{https://\-en.wikipedia.org/\-wiki/\-Explicit\_parallelism}",668 url = "https://en.wikipedia.org/wiki/Explicit_parallelism", 668 669 note = "[Online; accessed 23-October-2020]" 669 670 } … … 673 674 title = "Linear congruential generator --- {W}ikipedia{,} The Free Encyclopedia", 674 675 year = "2020", 675 howpublished = "\href{https://en.wikipedia.org/wiki/Linear_congruential_generator}{https://en.wikipedia.org/wiki/Linear\_congruential\_generator}",676 url = "https://en.wikipedia.org/wiki/Linear_congruential_generator", 676 677 note = "[Online; accessed 2-January-2021]" 677 678 } … … 681 682 title = "Futures and promises --- {W}ikipedia{,} The Free Encyclopedia", 682 683 year = "2020", 683 howpublished = "\href{https://en.wikipedia.org/wiki/Futures_and_promises}{https://\-en.wikipedia.org/\-wiki/Futures\_and\_promises}",684 url = "https://en.wikipedia.org/wiki/Futures_and_promises", 684 685 note = "[Online; accessed 9-February-2021]" 685 686 } … … 689 690 title = "Read-copy-update --- {W}ikipedia{,} The Free Encyclopedia", 690 691 year = "2022", 691 howpublished = "\href{https://en.wikipedia.org/wiki/Read-copy-update}{https://\-en.wikipedia.org/\-wiki/\-Read-copy-update}",692 url = "https://en.wikipedia.org/wiki/Read-copy-update", 692 693 note = "[Online; accessed 12-April-2022]" 693 694 } … … 697 698 title = "Readers-writer lock --- {W}ikipedia{,} The Free Encyclopedia", 698 699 year = "2021", 699 howpublished = 
"\href{https://en.wikipedia.org/wiki/Readers-writer_lock}{https://\-en.wikipedia.org/\-wiki/\-Readers-writer\_lock}",700 url = "https://en.wikipedia.org/wiki/Readers%E2%80%93writer_lock", 700 701 note = "[Online; accessed 12-April-2022]" 701 }702 703 @misc{wiki:binpak,704 author = "{Wikipedia contributors}",705 title = "Bin packing problem --- {W}ikipedia{,} The Free Encyclopedia",706 year = "2022",707 howpublished = "\href{https://en.wikipedia.org/wiki/Bin_packing_problem}{https://\-en.wikipedia.org/\-wiki/\-Bin\_packing\_problem}",708 note = "[Online; accessed 29-June-2022]"709 702 } 710 703 … … 712 705 % [05/04, 12:36] Trevor Brown 713 706 % i don't know where rmr complexity was first introduced, but there are many many many papers that use the term and define it 714 % [05/04, 12:37] Trevor Brown707 % [05/04, 12:37] Trevor Brown 715 708 % here's one paper that uses the term a lot and links to many others that use it... might trace it to something useful there https://drops.dagstuhl.de/opus/volltexte/2021/14832/pdf/LIPIcs-DISC-2021-30.pdf 716 % [05/04, 12:37] Trevor Brown709 % [05/04, 12:37] Trevor Brown 717 710 % another option might be to cite a textbook 718 % [05/04, 12:42] Trevor Brown711 % [05/04, 12:42] Trevor Brown 719 712 % but i checked two textbooks in the area i'm aware of and i don't see a definition of rmr complexity in either 720 % [05/04, 12:42] Trevor Brown713 % [05/04, 12:42] Trevor Brown 721 714 % this one has a nice statement about the prevelance of rmr complexity, as well as some rough definition 722 % [05/04, 12:42] Trevor Brown715 % [05/04, 12:42] Trevor Brown 723 716 % https://dl.acm.org/doi/pdf/10.1145/3465084.3467938 724 717 … … 728 721 % 729 722 % https://doi.org/10.1137/1.9781611973099.100 730 731 732 @misc{AIORant,733 author = "Linus Torvalds",734 title = "Re: [PATCH 09/13] aio: add support for async openat()",735 year = "2016",736 month = jan,737 howpublished = 
"\href{https://lwn.net/Articles/671657}{https://\-lwn.net/\-Articles/671657}",738 note = "[Online; accessed 6-June-2022]"739 }740 741 @misc{apache,742 key = {Apache Software Foundation},743 title = {{T}he {A}pache Web Server},744 howpublished = {\href{http://httpd.apache.org}{http://\-httpd.apache.org}},745 note = "[Online; accessed 6-June-2022]"746 }747 748 @misc{SeriallyReusable,749 author = {IBM},750 title = {Serially reusable programs},751 month = mar,752 howpublished= {\href{https://www.ibm.com/docs/en/ztpf/1.1.0.15?topic=structures-serially-reusable-programs}{https://www.ibm.com/\-docs/\-en/\-ztpf/\-1.1.0.15?\-topic=structures\--serially\--reusable-programs}},753 year = 2021,754 }755 756 @inproceedings{Albers12,757 author = {Susanne Albers and Antonios Antoniadis},758 title = {Race to Idle: New Algorithms for Speed Scaling with a Sleep State},759 booktitle = {Proceedings of the 2012 Annual ACM-SIAM Symposium on Discrete Algorithms (SODA)},760 doi = {10.1137/1.9781611973099.100},761 URL = {https://epubs.siam.org/doi/abs/10.1137/1.9781611973099.100},762 eprint = {https://epubs.siam.org/doi/pdf/10.1137/1.9781611973099.100},763 year = 2012,764 month = jan,765 pages = {1266-1285},766 } -
doc/theses/thierry_delisle_PhD/thesis/text/core.tex
rffec1bf r9e23b446 1 1 \chapter{Scheduling Core}\label{core} 2 2 3 Before discussing scheduling in general, where it is important to address systems that are changing states, this document discusses scheduling in a somewhat ideal scenario, where the system has reached a steady state. 4 For this purpose, a steady state is loosely defined as a state where there are always \glspl{thrd} ready to run and the system has the resources necessary to accomplish the work, \eg, enough workers. 5 In short, the system is neither overloaded nor underloaded. 6 7 It is important to discuss the steady state first because it is the easiest case to handle and, relatedly, the case in which the best performance is to be expected. 8 As such, when the system is either overloaded or underloaded, a common approach is to try to adapt the system to this new load and return to the steady state, \eg, by adding or removing workers. 9 Therefore, flaws in scheduling the steady state tend to be pervasive in all states. 3 Before discussing scheduling in general, where it is important to address systems that are changing states, this document discusses scheduling in a somewhat ideal scenario, where the system has reached a steady state. For this purpose, a steady state is loosely defined as a state where there are always \glspl{thrd} ready to run and the system has the resources necessary to accomplish the work, \eg, enough workers. In short, the system is neither overloaded nor underloaded. 4 5 It is important to discuss the steady state first because it is the easiest case to handle and, relatedly, the case in which the best performance is to be expected. As such, when the system is either overloaded or underloaded, a common approach is to try to adapt the system to this new load and return to the steady state, \eg, by adding or removing workers. Therefore, flaws in scheduling the steady state tend to be pervasive in all states. 
10 6 11 7 \section{Design Goals} 12 As with most of the design decisions behind \CFA, an important goal is to match the expectation of the programmer according to their execution mental-model. 13 To match expectations, the design must offer the programmer sufficient guarantees so that, as long as they respect the execution mental-model, the system also respects this model. 8 As with most of the design decisions behind \CFA, an important goal is to match the expectation of the programmer according to their execution mental-model. To match expectations, the design must offer the programmer sufficient guarantees so that, as long as they respect the execution mental-model, the system also respects this model. 14 9 15 10 For threading, a simple and common execution mental-model is the ``Ideal multi-tasking CPU'' : … … 22 17 Applied to threads, this model states that every ready \gls{thrd} immediately runs in parallel with all other ready \glspl{thrd}. While a strict implementation of this model is not feasible, programmers still have expectations about scheduling that come from this model. 23 18 24 In general, the expectation at the center of this model is that ready \glspl{thrd} do not interfere with each other but simply share the hardware. 25 This assumption makes it easier to reason about threading because ready \glspl{thrd} can be thought of in isolation and the effect of the scheduler can be virtually ignored. 26 This expectation of \gls{thrd} independence means the scheduler is expected to offer two guarantees: 19 In general, the expectation at the center of this model is that ready \glspl{thrd} do not interfere with each other but simply share the hardware. This assumption makes it easier to reason about threading because ready \glspl{thrd} can be thought of in isolation and the effect of the scheduler can be virtually ignored. 
This expectation of \gls{thrd} independence means the scheduler is expected to offer two guarantees: 27 20 \begin{enumerate} 28 21 \item A fairness guarantee: a \gls{thrd} that is ready to run is not prevented by another thread. … … 30 23 \end{enumerate} 31 24 32 It is important to note that these guarantees are expected only up to a point. 33 \Glspl{thrd} that are ready to run should not be prevented to do so, but they still share the limited hardware resources. 34 Therefore, the guarantee is considered respected if a \gls{thrd} gets access to a \emph{fair share} of the hardware resources, even if that share is very small. 35 36 Similar to the performance guarantee, the lack of interference among threads is only relevant up to a point. 37 Ideally, the cost of running and blocking should be constant regardless of contention, but the guarantee is considered satisfied if the cost is not \emph{too high} with or without contention. 38 How much is an acceptable cost is obviously highly variable. 39 For this document, the performance experimentation attempts to show the cost of scheduling is at worst equivalent to existing algorithms used in popular languages. 40 This demonstration can be made by comparing applications built in \CFA to applications built with other languages or other models. 41 Recall programmer expectation is that the impact of the scheduler can be ignored. 42 Therefore, if the cost of scheduling is competitive to other popular languages, the guarantee is consider achieved. 25 It is important to note that these guarantees are expected only up to a point. \Glspl{thrd} that are ready to run should not be prevented to do so, but they still share the limited hardware resources. Therefore, the guarantee is considered respected if a \gls{thrd} gets access to a \emph{fair share} of the hardware resources, even if that share is very small. 26 27 Similarly the performance guarantee, the lack of interference among threads, is only relevant up to a point. 
Ideally, the cost of running and blocking should be constant regardless of contention, but the guarantee is considered satisfied if the cost is not \emph{too high} with or without contention. How much is an acceptable cost is obviously highly variable. For this document, the performance experimentation attempts to show the cost of scheduling is at worst equivalent to existing algorithms used in popular languages. This demonstration can be made by comparing applications built in \CFA to applications built with other languages or other models. Recall programmer expectation is that the impact of the scheduler can be ignored. Therefore, if the cost of scheduling is competitive with other popular languages, the guarantee will be considered achieved. 28 43 29 More precisely the scheduler should be: 44 30 \begin{itemize} … … 48 34 49 35 \subsection{Fairness Goals} 50 For this work, fairness is considered to have two strongly related requirements: true starvation freedom and ``fast'' load balancing.51 52 \paragraph{True starvation freedom} means as long as at least one \proc continues to dequeue \ats, all ready \ats should be able to run eventually, \ie, eventual progress.53 In any running system, a \proc can stop dequeuing \ats if it starts running a \at that never blocks.54 Without preemption, traditional work-stealing schedulers do not have starvation freedom in this case.36 For this work fairness will be considered as having two strongly related requirements: true starvation freedom and ``fast'' load balancing. 37 38 \paragraph{True starvation freedom} is more easily defined: As long as at least one \proc continues to dequeue \ats, all ready \ats should be able to run eventually. 39 In any running system, \procs can stop dequeuing \ats if they start running a \at that will simply never park. 40 Traditional work-stealing schedulers do not have starvation freedom in these cases. 41 Now this requirement begs the question, what about preemption? 
56 42 Generally speaking preemption happens on the timescale of several milliseconds, which brings us to the next requirement: ``fast'' load balancing. 57 43 58 44 \paragraph{Fast load balancing} means that load balancing should happen faster than preemption would normally allow. 59 For interactive applications that need to run at 60, 90, 120 frames per second, \ats having to wait for several mill iseconds to run are effectively starved.45 For interactive applications that need to run at 60, 90, 120 frames per second, \ats having to wait for several millseconds to run are effectively starved. 60 46 Therefore load-balancing should be done at a faster pace, one that can detect starvation at the microsecond scale. 61 47 With that said, this is a much fuzzier requirement since it depends on the number of \procs, the number of \ats and the general load of the system. 62 48 63 49 \subsection{Fairness vs Scheduler Locality} \label{fairnessvlocal} 64 An important performance factor in modern architectures is cache locality. 65 Waiting for data at lower levels or not present in the cache can have a major impact on performance. 66 Having multiple \glspl{hthrd} writing to the same cache lines also leads to cache lines that must be waited on. 67 It is therefore preferable to divide data among each \gls{hthrd}\footnote{This partitioning can be an explicit division up front or using data structures where different \glspl{hthrd} are naturally routed to different cache lines.}. 68 69 For a scheduler, having good locality, \ie, having the data local to each \gls{hthrd}, generally conflicts with fairness. 70 Indeed, good locality often requires avoiding the movement of cache lines, while fairness requires dynamically moving a \gls{thrd}, and as consequence cache lines, to a \gls{hthrd} that is currently available. 
71 Note that this section discusses \emph{internal locality}, \ie, the locality of the data used by the scheduler versus \emph{external locality}, \ie, how the data used by the application is affected by scheduling. 72 External locality is a much more complicated subject and is discussed in the next section. 73 74 However, I claim that in practice it is possible to strike a balance between fairness and performance because these goals do not necessarily overlap temporally. 75 Figure~\ref{fig:fair} shows a visual representation of this behaviour. 76 As mentioned, some unfairness is acceptable; therefore it is desirable to have an algorithm that prioritizes cache locality as long as thread delay does not exceed the execution mental-model. 50 An important performance factor in modern architectures is cache locality. Waiting for data at lower levels or not present in the cache can have a major impact on performance. Having multiple \glspl{hthrd} writing to the same cache lines also leads to cache lines that must be waited on. It is therefore preferable to divide data among each \gls{hthrd}\footnote{This partitioning can be an explicit division up front or using data structures where different \glspl{hthrd} are naturally routed to different cache lines.}. 51 52 For a scheduler, having good locality\footnote{This section discusses \emph{internal locality}, \ie, the locality of the data used by the scheduler versus \emph{external locality}, \ie, how the data used by the application is affected by scheduling. External locality is a much more complicated subject and is discussed in the next section.}, \ie, having the data local to each \gls{hthrd}, generally conflicts with fairness. Indeed, good locality often requires avoiding the movement of cache lines, while fairness requires dynamically moving a \gls{thrd}, and as consequence cache lines, to a \gls{hthrd} that is currently available. 
53 54 However, I claim that in practice it is possible to strike a balance between fairness and performance because these goals do not necessarily overlap temporally, where Figure~\ref{fig:fair} shows a visual representation of this behaviour. As mentioned, some unfairness is acceptable; therefore it is desirable to have an algorithm that prioritizes cache locality as long as thread delay does not exceed the execution mental-model. 77 55 78 56 \begin{figure} … … 80 58 \input{fairness.pstex_t} 81 59 \vspace*{-10pt} 82 \caption[Fairness vs Locality graph]{Rule of thumb Fairness vs Locality graph \smallskip\newline The importance of Fairness and Locality while a ready \gls{thrd} awaits running is shown as the time the ready \gls{thrd} waits increases, Ready Time, the chances that its data is still in cache decreases, Locality. 83 At the same time, the need for fairness increases since other \glspl{thrd} may have the chance to run many times, breaking the fairness model. 84 Since the actual values and curves of this graph can be highly variable, the graph is an idealized representation of the two opposing goals.} 60 \caption[Fairness vs Locality graph]{Rule of thumb Fairness vs Locality graph \smallskip\newline The importance of Fairness and Locality while a ready \gls{thrd} awaits running is shown as the time the ready \gls{thrd} waits increases, Ready Time, the chances that its data is still in cache, Locality, decreases. At the same time, the need for fairness increases since other \glspl{thrd} may have the chance to run many times, breaking the fairness model. Since the actual values and curves of this graph can be highly variable, the graph is an idealized representation of the two opposing goals.} 85 61 \label{fig:fair} 86 62 \end{figure} 87 63 88 64 \subsection{Performance Challenges}\label{pref:challenge} 89 While there exists a multitude of potential scheduling algorithms, they generally always have to contend with the same performance challenges. 
90 Since these challenges are recurring themes in the design of a scheduler it is relevant to describe the central ones here before looking at the design. 65 While there exists a multitude of potential scheduling algorithms, they generally always have to contend with the same performance challenges. Since these challenges are recurring themes in the design of a scheduler it is relevant to describe the central ones here before looking at the design. 91 66 92 67 \subsubsection{Scalability} … … 94 69 Given a large number of \procs and an even larger number of \ats, scalability measures how fast \procs can enqueue and dequeue \ats. 95 70 One could expect that doubling the number of \procs would double the rate at which \ats are dequeued, but contention on the internal data structure of the scheduler can lead to worse improvements. 96 While the ready-queue itself can be sharded to alleviate the main source of contention, auxiliary scheduling features, \eg counting ready \ats, can also be sources of contention.71 While the ready-queue itself can be sharded to alleviate the main source of contention, auxiliary scheduling features, \eg counting ready \ats, can also be sources of contention. 97 72 98 73 \subsubsection{Migration Cost} 99 Another important source of scheduling latency is migration.100 A \at migrates if it executes on two different \procs consecutively, which is the process discussed in \ref{fairnessvlocal}.101 Migrations can have many different causes, but in certain programs, it can be impossible to limit migration.102 Chapter~\ref{microbench} has a benchmark where any \at can potentially unblock any other \at, which can lead to \ats migrating frequently.103 Hence, it is important to design the internal data structures of the scheduler to limit any latency penalty from migrations.74 Another important source of latency in scheduling is migration. 
75 A \at is said to have migrated if it is executed by two different \procs consecutively, which is the process discussed in \ref{fairnessvlocal}. 76 Migrations can have many different causes, but in certain programs it can be all but impossible to limit migrations. 77 Chapter~\ref{microbench} for example, has a benchmark where any \at can potentially unblock any other \at, which can lead to \ats migrating more often than not. 78 Because of this it is important to design the internal data structures of the scheduler to limit the latency penalty from migrations. 104 79 105 80 106 81 \section{Inspirations} 107 In general, a na\"{i}ve \glsxtrshort{fifo} ready-queue does not scale with increased parallelism from \glspl{hthrd}, resulting in decreased performance. 108 The problem is a single point of contention when adding/removing \ats. 109 As shown in the evaluation sections, most production schedulers do scale when adding \glspl{hthrd}. 110 The solution to this problem is to shard the ready-queue: create multiple \emph{subqueues} forming the logical ready-queue and the subqueues are accessed by multiple \glspl{hthrd} without interfering. 111 112 Before going into the design of \CFA's scheduler, it is relevant to discuss two sharding solutions that served as the inspiration for the scheduler in this thesis. 82 In general, a na\"{i}ve \glsxtrshort{fifo} ready-queue does not scale with increased parallelism from \glspl{hthrd}, resulting in decreased performance. The problem is that adding/removing \glspl{thrd} is a single point of contention. As shown in the evaluation sections, most production schedulers do scale when adding \glspl{hthrd}. The solution to this problem is to shard the ready-queue: create multiple sub-ready-queues that multiple \glspl{hthrd} can access and modify without interfering. 83 84 Before going into the design of \CFA's scheduler proper, it is relevant to discuss two sharding solutions which served as the inspiration for the scheduler in this thesis. 
113 85 114 86 \subsection{Work-Stealing} 115 87 116 As mentioned in \ref{existing:workstealing}, a popular sharding approach forthe ready-queue is work-stealing.117 In this approach, each \gls{proc} has its own local subqueue and \glspl{proc} only access each other's subqueue if they run out of work on their local ready-queue.118 The interesting aspect of work stealing happens in the steady-state scheduling case, \ie all \glspl{proc} have work and no load balancing isneeded.119 In th is case, workstealing is close to optimal scheduling: it can achieve perfect locality and have no contention.88 As mentioned in \ref{existing:workstealing}, a popular pattern shard the ready-queue is work-stealing. 89 In this pattern each \gls{proc} has its own local ready-queue and \glspl{proc} only access each other's ready-queue if they run out of work on their local ready-queue. 90 The interesting aspect of workstealing happen in easier scheduling cases, \ie enough work for everyone but no more and no load balancing needed. 91 In these cases, work-stealing is close to optimal scheduling: it can achieve perfect locality and have no contention. 120 92 On the other hand, work-stealing schedulers only attempt to do load-balancing when a \gls{proc} runs out of work. 121 93 This means that the scheduler never balances unfair loads unless they result in a \gls{proc} running out of work. 122 Chapter~\ref{microbench} shows that pathological cases work stealing can lead to indefinite starvation. 123 124 Based on these observation, the conclusion is that a \emph{perfect} scheduler should behave similar to work-stealing in the steady-state case, but load balance proactively when the need arises. 125 126 \subsection{Relaxed-FIFO} 127 A different scheduling approach is to create a ``relaxed-FIFO'' queue, as in \todo{cite Trevor's paper}. 128 This approach forgoes any ownership between \gls{proc} and subqueue, and simply creates a pool of ready-queues from which \glspl{proc} pick. 
129 Scheduling is performed as follows: 130 \begin{itemize} 131 \item 132 All subqueues are protected by TryLocks. 133 \item 134 Timestamps are added to each element of a subqueue. 135 \item 136 A \gls{proc} randomly tests ready queues until it has acquired one or two queues. 137 \item 138 If two queues are acquired, the older of the two \ats at the front of the acquired queues is dequeued. 139 \item 140 Otherwise the \at from the single queue is dequeued. 141 \end{itemize} 142 The result is a queue that has both good scalability and sufficient fairness. 143 The lack of ownership ensures that as long as one \gls{proc} is still able to repeatedly dequeue elements, it is unlikely any element will delay longer than any other element. 144 This guarantee contrasts with work-stealing, where a \gls{proc} with a long subqueue results in unfairness for its \ats in comparison to a \gls{proc} with a short subqueue. 145 This unfairness persists until a \gls{proc} runs out of work and steals. 94 Chapter~\ref{microbench} shows that in pathological cases this problem can lead to indefinite starvation. 95 96 97 Based on these observations, the conclusion is that a \emph{perfect} scheduler should behave very similarly to work-stealing in the easy cases, but should have more proactive load-balancing if the need arises. 98 99 \subsection{Relaxed-Fifo} 100 An entirely different scheme is to create a ``relaxed-FIFO'' queue as in \todo{cite Trevor's paper}. This approach forgoes any ownership between \gls{proc} and ready-queue, and simply creates a pool of ready-queues from which the \glspl{proc} can pick. 101 \Glspl{proc} choose ready-queues at random, but timestamps are added to all elements of the queue and dequeues are done by picking two queues and dequeuing the oldest element. 102 All subqueues are protected by TryLocks and \procs simply pick a different subqueue if they fail to acquire the TryLock. 103 The result is a queue that has both decent scalability and sufficient fairness. 
104 The lack of ownership means that as long as one \gls{proc} is still able to repeatedly dequeue elements, it is unlikely that any element will stay on the queue for much longer than any other element. 105 This contrasts with work-stealing, where \emph{any} \gls{proc} busy for an extended period of time results in all the elements on its local queue to have to wait. Unless another \gls{proc} runs out of work. 146 106 147 107 An important aspects of this scheme's fairness approach is that the timestamps make it possible to evaluate how long elements have been on the queue. 148 However, \glspl{proc} eagerly search for these older elements instead of focusing on specific queues, which negatively affects locality. 149 150 While this scheme has good fairness, its performance suffers. 151 It requires wide sharding, \eg at least 4 queues per \gls{hthrd}, and finding non-empty queues is difficult when there are few ready \ats. 108 However, another major aspect is that \glspl{proc} will eagerly search for these older elements instead of focusing on specific queues. 109 110 While the fairness, of this scheme is good, it does suffer in terms of performance. 111 It requires very wide sharding, \eg at least 4 queues per \gls{hthrd}, and finding non-empty queues can be difficult if there are too few ready \ats. 152 112 153 113 \section{Relaxed-FIFO++} 154 The inherent fairness and good performance with many \ats, makes the relaxed-FIFO queue a good candidate to form the basis of a new scheduler. 155 The problem case is workloads where the number of \ats is barely greater than the number of \procs. 156 In these situations, the wide sharding of the ready queue means most of its subqueues are empty. 157 Furthermore, the non-empty subqueues are unlikely to hold more than one item. 158 The consequence is that a random dequeue operation is likely to pick an empty subqueue, resulting in an unbounded number of selections. 
159 This state is generally unstable: each subqueue is likely to frequently toggle between being empty and nonempty. 160 Indeed, when the number of \ats is \emph{equal} to the number of \procs, every pop operation is expected to empty a subqueue and every push is expected to add to an empty subqueue. 161 In the worst case, a check of the subqueues sees all are empty or full. 114 Since it has inherent fairness quelities and decent performance in the presence of many \ats, the relaxed-FIFO queue appears as a good candidate to form the basis of a scheduler. 115 The most obvious problems is for workloads where the number of \ats is barely greater than the number of \procs. 116 In these situations, the wide sharding means most of the sub-queues from which the relaxed queue is formed will be empty. 117 The consequence is that when a dequeue operations attempts to pick a sub-queue at random, it is likely that it picks an empty sub-queue and will have to pick again. 118 This problem can repeat an unbounded number of times. 162 119 163 120 As this is the most obvious challenge, it is worth addressing first. 164 The obvious solution is to supplement each sharded subqueue with data that indicates if the queue is empty/nonempty to simplify finding nonempty queues, \ie ready \glspl{at}. 165 This sharded data can be organized in different forms, \eg a bitmask or a binary tree that tracks the nonempty subqueues. 166 Specifically, many modern architectures have powerful bitmask manipulation instructions or searching a binary tree has good Big-O complexity. 167 However, precisely tracking nonempty subqueues is problematic. 168 The reason is that the subqueues are initially sharded with a width presumably chosen to avoid contention. 169 However, tracking which ready queue is nonempty is only useful if the tracking data is dense, \ie denser than the sharded subqueues. 
170 Otherwise, it does not provide useful information because reading this new data structure risks being as costly as simply picking a subqueue at random. 171 But if the tracking mechanism \emph{is} denser than the shared subqueues, than constant updates invariably create a new source of contention. 172 Early experiments with this approach showed that randomly picking, even with low success rates, is often faster than bit manipulations or tree walks. 121 The obvious solution is to supplement each subqueue with some sharded data structure that keeps track of which subqueues are empty. 122 This data structure can take many forms, for example simple bitmask or a binary tree that tracks which branch are empty. 123 Following a binary tree on each pick has fairly good Big O complexity and many modern architectures have powerful bitmask manipulation instructions. 124 However, precisely tracking which sub-queues are empty is actually fundamentally problematic. 125 The reason is that each subqueues are already a form of sharding and the sharding width has presumably already chosen to avoid contention. 126 However, tracking which ready queue is empty is only useful if the tracking mechanism uses denser sharding than the sub queues, then it will invariably create a new source of contention. 127 But if the tracking mechanism is not denser than the sub-queues, then it will generally not provide useful because reading this new data structure risks being as costly as simply picking a sub-queue at random. 128 Early experiments with this approach have shown that even with low success rates, randomly picking a sub-queue can be faster than a simple tree walk. 173 129 174 130 The exception to this rule is using local tracking. 
175 If each \proc locally keeps track of empty subqueues, than this can be done with a very dense data structure without introducing a new source of contention. 176 However, the consequence of local tracking is that the information is incomplete. 177 Each \proc is only aware of the last state it saw about each subqueue so this information quickly becomes stale. 178 Even on systems with low \gls{hthrd} count, \eg 4 or 8, this approach can quickly lead to the local information being no better than the random pick. 179 This result is due in part to the cost of maintaining information and its poor quality. 180 181 However, using a very low cost but inaccurate approach for local tracking can actually be beneficial. 182 If the local tracking is no more costly than a random pick, than \emph{any} improvement to the success rate, however low it is, leads to a performance benefits. 183 This suggests to the following approach: 131 If each \proc keeps track locally of which sub-queue is empty, then this can be done with a very dense data structure without introducing a new source of contention. 132 The consequence of local tracking however, is that the information is not complete. 133 Each \proc is only aware of the last state it saw each subqueues but does not have any information about freshness. 134 Even on systems with low \gls{hthrd} count, \eg 4 or 8, this can quickly lead to the local information being no better than the random pick. 135 This is due in part to the cost of this maintaining this information and its poor quality. 136 137 However, using a very low cost approach to local tracking may actually be beneficial. 138 If the local tracking is no more costly than the random pick, than \emph{any} improvement to the succes rate, however low it is, would lead to a performance benefits. 
139 This leads to the following approach: 184 140 185 141 \subsection{Dynamic Entropy}\cit{https://xkcd.com/2318/} 186 The Relaxed-FIFO approach can be made to handle the case of mostly empty subqueues by tweaking the \glsxtrlong{prng}. 187 The \glsxtrshort{prng} state can be seen as containing a list of all the future subqueues that will be accessed. 188 While this concept is not particularly useful on its own, the consequence is that if the \glsxtrshort{prng} algorithm can be run \emph{backwards}, then the state also contains a list of all the subqueues that were accessed. 189 Luckily, bidirectional \glsxtrshort{prng} algorithms do exist, \eg some Linear Congruential Generators\cit{https://en.wikipedia.org/wiki/Linear\_congruential\_generator} support running the algorithm backwards while offering good quality and performance. 142 The Relaxed-FIFO approach can be made to handle the case of mostly empty sub-queues by tweaking the \glsxtrlong{prng}. 143 The \glsxtrshort{prng} state can be seen as containing a list of all the future sub-queues that will be accessed. 144 While this is not particularly useful on its own, the consequence is that if the \glsxtrshort{prng} algorithm can be run \emph{backwards}, then the state also contains a list of all the subqueues that were accessed. 145 Luckily, bidirectional \glsxtrshort{prng} algorithms do exist, for example some Linear Congruential Generators\cit{https://en.wikipedia.org/wiki/Linear\_congruential\_generator} support running the algorithm backwards while offering good quality and performance. 190 146 This particular \glsxtrshort{prng} can be used as follows: 191 \begin{itemize} 192 \item 193 Each \proc maintains two \glsxtrshort{prng} states, refereed to as $F$ and $B$. 194 \item 195 When a \proc attempts to dequeue a \at, it picks a subqueue by running $B$ backwards. 196 \item 197 When a \proc attempts to enqueue a \at, it runs $F$ forward picking a subqueue to enqueue to. 
198 If the enqueue is successful, the state $B$ is overwritten with the content of $F$. 199 \end{itemize} 200 The result is that each \proc tends to dequeue \ats that it has itself enqueued. 201 When most subqueues are empty, this technique increases the odds of finding \ats at very low cost, while also offering an improvement on locality in many cases. 202 203 Tests showed this approach performs better than relaxed-FIFO in many cases. 204 However, it is still not competitive with work-stealing algorithms. 147 148 Each \proc maintains two \glsxtrshort{prng} states, which whill be refered to as \texttt{F} and \texttt{B}. 149 150 When a \proc attempts to dequeue a \at, it picks the subqueues by running the \texttt{B} backwards. 151 When a \proc attempts to enqueue a \at, it runs \texttt{F} forward to pick to subqueue to enqueue to. 152 If the enqueue is successful, the state \texttt{B} is overwritten with the content of \texttt{F}. 153 154 The result is that each \proc will tend to dequeue \ats that it has itself enqueued. 155 When most sub-queues are empty, this technique increases the odds of finding \ats at very low cost, while also offering an improvement on locality in many cases. 156 157 However, while this approach does notably improve performance in many cases, this algorithm is still not competitive with work-stealing algorithms. 205 158 The fundamental problem is that the constant randomness limits how much locality the scheduler offers. 206 This becomes problematic both because the scheduler is likely to get cache misses on internal data-structures and because migrations become frequent. 207 Therefore, the attempt to modify the relaxed-FIFO algorithm to behave more like work stealing did not pan out. 208 The alternative is to do it the other way around. 159 This becomes problematic both because the scheduler is likely to get cache misses on internal data-structures and because migration become very frequent. 
160 Therefore since the approach of modifying to relaxed-FIFO algorithm to behave more like work stealing does not seem to pan out, the alternative is to do it the other way around. 209 161 210 162 \section{Work Stealing++} 211 To add stronger fairness guarantees to work stealing a few changes are needed.163 To add stronger fairness guarantees to workstealing a few changes. 212 164 First, the relaxed-FIFO algorithm has fundamentally better fairness because each \proc always monitors all subqueues. 213 Therefore, the work-stealing algorithm must be prepended with some monitoring. 214 Before attempting to dequeue from a \proc's subqueue, the \proc must make some effort to ensure other subqueues are not being neglected. 215 To make this possible, \procs must be able to determine which \at has been on the ready queue the longest. 216 Second, the relaxed-FIFO approach needs timestamps for each \at to make this possible. 165 Therefore the workstealing algorithm must be prepended with some monitoring. 166 Before attempting to dequeue from a \proc's local queue, the \proc must make some effort to make sure remote queues are not being neglected. 167 To make this possible, \procs must be able to determie which \at has been on the ready-queue the longest. 168 Which is the second aspect that much be added. 169 The relaxed-FIFO approach uses timestamps for each \at and this is also what is done here. 217 170 218 171 \begin{figure} 219 172 \centering 220 173 \input{base.pstex_t} 221 \caption[Base \CFA design]{Base \CFA design \smallskip\newline A pool of subqueues offers the sharding, two per \glspl{proc}. 222 Each \gls{proc} can access all of the subqueues. 223 Each \at is timestamped when enqueued.} 174 \caption[Base \CFA design]{Base \CFA design \smallskip\newline A Pool of sub-ready queues offers the sharding, two per \glspl{proc}. Each \gls{proc} have local subqueues, however \glspl{proc} can access any of the sub-queues. 
Each \at is timestamped when enqueued.} 224 175 \label{fig:base} 225 176 \end{figure} 226 227 Figure~\ref{fig:base} shows the algorithm structure. 228 This structure is similar to classic work-stealing except the subqueues are placed in an array so \procs can access them in constant time. 229 Sharding width can be adjusted based on contention. 230 Note, as an optimization, the TS of a \at is stored in the \at in front of it, so the first TS is in the array and the last \at has no TS. 231 This organization keeps the highly accessed front TSs directly in the array. 232 When a \proc attempts to dequeue a \at, it first picks a random remote subqueue and compares its timestamp to the timestamps of its local subqueue(s). 233 The oldest waiting \at is dequeued to provide global fairness. 234 235 However, this na\"ive implemented has performance problems. 177 The algorithm is structure as shown in Figure~\ref{fig:base}. 178 This is very similar to classic workstealing except the local queues are placed in an array so \procs can access eachother's queue in constant time. 179 Sharding width can be adjusted based on need. 180 When a \proc attempts to dequeue a \at, it first picks a random remote queue and compares its timestamp to the timestamps of the local queue(s), dequeue from the remote queue if needed. 181 182 Implemented as as naively state above, this approach has some obvious performance problems. 236 183 First, it is necessary to have some damping effect on helping. 237 Random effects like cache misses and preemption can add spurious but short bursts of latency negating the attempt to help. 238 These bursts can cause increased migrations and make this work stealing approach slowdown to the level of relaxed-FIFO. 184 Random effects like cache misses and preemption can add spurious but short bursts of latency for which helping is not helpful, pun intended. 
185 The effect of these bursts would be to cause more migrations than needed and make this workstealing approach slowdown to the match the relaxed-FIFO approach. 239 186 240 187 \begin{figure} … … 245 192 \end{figure} 246 193 247 A simple solution to this problem is to use an exponential moving average\cit{https://en.wikipedia.org/wiki/Moving\_average\#Exponential\_moving\_average} (MA) instead of a raw timestamps, shown in Figure~\ref{fig:base-ma}. 248 Note, this is more complex because the \at at the head of a subqueue is still waiting, so its wait time has not ended. 249 Therefore, the exponential moving average is actually an exponential moving average of how long each dequeued \at has waited. 250 To compare subqueues, the timestamp at the head must be compared to the current time, yielding the best-case wait-time for the \at at the head of the queue. 194 A simple solution to this problem is to compare an exponential moving average\cit{https://en.wikipedia.org/wiki/Moving\_average\#Exponential\_moving\_average} instead if the raw timestamps, shown in Figure~\ref{fig:base-ma}. 195 Note that this is slightly more complex than it sounds because since the \at at the head of a subqueue is still waiting, its wait time has not ended. 196 Therefore the exponential moving average is actually an exponential moving average of how long each already dequeued \at have waited. 197 To compare subqueues, the timestamp at the head must be compared to the current time, yielding the bestcase wait time for the \at at the head of the queue. 251 198 This new waiting is averaged with the stored average. 252 To further limit migration, a bias can be added to a local subqueue, where a remote subqueue is helped only if its moving average is more than $X$ times the local subqueue's average. 253 Tests for this approach indicate the choice of the weight for the moving average or the bias is not important, \ie weights and biases of similar \emph{magnitudes} have similar effects. 
254 255 With these additions to work stealing, scheduling can be made as fair as the relaxed-FIFO approach, avoiding the majority of unnecessary migrations. 256 Unfortunately, the work to achieve fairness has a performance cost, especially when the workload is inherently fair, and hence, there is only short-term or no starvation. 257 The problem is that the constant polling, \ie reads, of remote subqueues generally entail a cache miss because the TSs are constantly being updated, \ie, writes. 258 To make things worst, remote subqueues that are very active, \ie \ats are frequently enqueued and dequeued from them, lead to higher chances that polling will incur a cache-miss. 259 Conversely, the active subqueues do not benefit much from helping since starvation is already a non-issue. 260 This puts this algorithm in the awkward situation of paying for a cost that is largely unnecessary. 199 To limit even more the amount of unnecessary migration, a bias can be added to the local queue, where a remote queue is helped only if its moving average is more than \emph{X} times the local queue's average. 200 None of the experimentation that I have run with these scheduler seem to indicate that the choice of the weight for the moving average or the choice of bis is particularly important. 201 Weigths and biases of similar \emph{magnitudes} have similar effects. 202 203 With these additions to workstealing, scheduling can be made as fair as the relaxed-FIFO approach, well avoiding the majority of unnecessary migrations. 204 Unfortunately, the performance of this approach does suffer in the cases with no risks of starvation. 205 The problem is that the constant polling of remote subqueues generally entail a cache miss. 206 To make things worst, remote subqueues that are very active, \ie \ats are frequently enqueued and dequeued from them, the higher the chances are that polling will incurr a cache-miss. 
207 Conversly, the active subqueues do not benefit much from helping since starvation is already a non-issue. 208 This puts this algorithm in an akward situation where it is paying for a cost, but the cost itself suggests the operation was unnecessary. 261 209 The good news is that this problem can be mitigated 262 210 263 211 \subsection{Redundant Timestamps} 264 The problem with polling remote subqueues is that correctness is critical. 265 There must be a consensus among \procs on which subqueues hold which \ats, as the \ats are in constant motion. 266 Furthermore, since timestamps are use for fairness, it is critical to have consensus on which \at is the oldest. 267 However, when deciding if a remote subqueue is worth polling, correctness is less of a problem. 268 Since the only requirement is that a subqueue is eventually polled, some data staleness is acceptable. 269 This leads to a situation where stale timestamps are only problematic in some cases. 270 Furthermore, stale timestamps can be desirable since lower freshness requirements mean less cache invalidations. 271 272 Figure~\ref{fig:base-ts2} shows a solution with a second array containing a copy of the timestamps and average. 212 The problem with polling remote queues is due to a tension between the consistency requirement on the subqueue. 213 For the subqueues, correctness is critical. There must be a consensus among \procs on which subqueues hold which \ats. 214 Since the timestamps are use for fairness, it is alco important to have consensus and which \at is the oldest. 215 However, when deciding if a remote subqueue is worth polling, correctness is much less of a problem. 216 Since the only need is that a subqueue will eventually be polled, some data staleness can be acceptable. 217 This leads to a tension where stale timestamps are only problematic in some cases. 
218 Furthermore, stale timestamps can be somewhat desirable since lower freshness requirements means less tension on the cache coherence protocol. 219 220 221 \begin{figure} 222 \centering 223 % \input{base_ts2.pstex_t} 224 \caption[\CFA design with Redundant Timestamps]{\CFA design with Redundant Timestamps \smallskip\newline A array is added containing a copy of the timestamps. These timestamps are written to with relaxed atomics, without fencing, leading to fewer cache invalidations.} 225 \label{fig:base-ts2} 226 \end{figure} 227 A solution to this is to create a second array containing a copy of the timestamps and average. 273 228 This copy is updated \emph{after} the subqueue's critical sections using relaxed atomics. 274 229 \Glspl{proc} now check if polling is needed by comparing the copy of the remote timestamp instead of the actual timestamp. 275 The result is that since there is no fencing, the writes can be buffered in the hardware and cause fewer cache invalidations. 276 277 \begin{figure} 278 \centering 279 \input{base_ts2.pstex_t} 280 \caption[\CFA design with Redundant Timestamps]{\CFA design with Redundant Timestamps \smallskip\newline An array is added containing a copy of the timestamps. 281 These timestamps are written to with relaxed atomics, so there is no order among concurrent memory accesses, leading to fewer cache invalidations.} 282 \label{fig:base-ts2} 283 \end{figure} 284 285 The correctness argument is somewhat subtle. 230 The result is that since there is no fencing, the writes can be buffered and cause fewer cache invalidations. 231 232 The correctness argument here is somewhat subtle. 286 233 The data used for deciding whether or not to poll a queue can be stale as long as it does not cause starvation. 287 Therefore, it is acceptable if stale data makes queues appear older than they really are but appearing fresher can be a problem. 
288 For the timestamps, this means missing writes to the timestamp is acceptable since they make the head \at look older. 289 For the moving average, as long as the operations are just atomic reads/writes, the average is guaranteed to yield a value that is between the oldest and newest values written. 290 Therefore, this unprotected read of the timestamp and average satisfy the limited correctness that is required. 291 292 With redundant timestamps, this scheduling algorithm achieves both the fairness and performance requirements on most machines. 234 Therefore, it is acceptable if stale data make queues appear older than they really are but not fresher. 235 For the timestamps, this means that missing writes to the timestamp is acceptable since they will make the head \at look older. 236 For the moving average, as long as the operation are RW-safe, the average is guaranteed to yield a value that is between the oldest and newest values written. 237 Therefore this unprotected read of the timestamp and average satisfy the limited correctness that is required. 238 239 \begin{figure} 240 \centering 241 \input{cache-share.pstex_t} 242 \caption[CPU design with wide L3 sharing]{CPU design with wide L3 sharing \smallskip\newline A very simple CPU with 4 \glspl{hthrd}. L1 and L2 are private to each \gls{hthrd} but the L3 is shared across to entire core.} 243 \label{fig:cache-share} 244 \end{figure} 245 246 \begin{figure} 247 \centering 248 \input{cache-noshare.pstex_t} 249 \caption[CPU design with a narrower L3 sharing]{CPU design with a narrower L3 sharing \smallskip\newline A different CPU design, still with 4 \glspl{hthrd}. L1 and L2 are still private to each \gls{hthrd} but the L3 is shared some of the CPU but there is still two distinct L3 instances.} 250 \label{fig:cache-noshare} 251 \end{figure} 252 253 With redundant tiemstamps this scheduling algorithm achieves both the fairness and performance requirements, on some machines. 
293 254 The problem is that the cost of polling and helping is not necessarily consistent across each \gls{hthrd}. 294 For example, on machines with a CPU containing multiple hyperthreads and cores and multiple CPU sockets, cache misses can be satisfied from the caches on same (local) CPU, or by a CPU on a different (remote) socket. 295 Cache misses satisfied by a remote CPU have significantly higher latency than from the local CPU. 296 However, these delays are not specific to systems with multiple CPUs. 297 Depending on the cache structure, cache misses can have different latency on the same CPU, \eg the AMD EPYC 7662 CPUs used in Chapter~\ref{microbench}. 298 299 \begin{figure} 300 \centering 301 \input{cache-share.pstex_t} 302 \caption[CPU design with wide L3 sharing]{CPU design with wide L3 sharing \smallskip\newline A CPU with 4 cores, where caches L1 and L2 are private to each core, and the L3 cache is shared across all cores.} 303 \label{fig:cache-share} 304 305 \vspace{25pt} 306 307 \input{cache-noshare.pstex_t} 308 \caption[CPU design with a narrower L3 sharing]{CPU design with a narrow L3 sharing \smallskip\newline A CPU with 4 cores, where caches L1 and L2 are private to each core, and the L3 cache is shared across a pair of cores.} 309 \label{fig:cache-noshare} 310 \end{figure} 311 312 Figures~\ref{fig:cache-share} and~\ref{fig:cache-noshare} show two different cache topologies that highlight this difference. 313 In Figure~\ref{fig:cache-share}, all cache misses are either private to a CPU or shared with another CPU. 314 This means latency due to cache misses is fairly consistent. 315 In contrast, in Figure~\ref{fig:cache-noshare} misses in the L2 cache can be satisfied by either instance of L3 cache. 316 However, the memory-access latency to the remote L3 is higher than the memory-access latency to the local L3. 
317 The impact of these different designs on this algorithm is that scheduling only scales well on architectures with a wide L3 cache, similar to Figure~\ref{fig:cache-share}, and less well on architectures with many narrower L3 cache instances, similar to Figure~\ref{fig:cache-noshare}. 318 Hence, as the number of L3 instances grow, so too does the chance that the random helping causes significant cache latency. 319 The solution is for the scheduler be aware of the cache topology. 255 For example, on machines where the motherboard holds multiple CPU, cache misses can be satisfied from a cache that belongs to the CPU that missed, the \emph{local} CPU, or by a different CPU, a \emph{remote} one. 256 Cache misses that are satisfied by a remote CPU will have higher latency than if it is satisfied by the local CPU. 257 However, this is not specific to systems with multiple CPUs. 258 Depending on the cache structure, cache-misses can have different latency for the same CPU. 259 The AMD EPYC 7662 CPUs that is described in Chapter~\ref{microbench} is an example of that. 260 Figure~\ref{fig:cache-share} and Figure~\ref{fig:cache-noshare} show two different cache topologies with highlight this difference. 261 In Figure~\ref{fig:cache-share}, all cache instances are either private to a \gls{hthrd} or shared to the entire system, this means latency due to cache-misses are likely fairly consistent. 262 By comparison, in Figure~\ref{fig:cache-noshare} misses in the L2 cache can be satisfied by a hit in either instance of the L3. 263 However, the memory access latency to the remote L3 instance will be notably higher than the memory access latency to the local L3. 264 The impact of these different design on this algorithm is that scheduling will scale very well on architectures similar to Figure~\ref{fig:cache-share}, both will have notably worst scalling with many narrower L3 instances. 
265 This is simply because as the number of L3 instances grow, so two does the chances that the random helping will cause significant latency. 266 The solution is to have the scheduler be aware of the cache topology. 320 267 321 268 \subsection{Per CPU Sharding} 322 Building a scheduler that is cache aware poses two main challenges: discovering the cache topology and matching \procs to this cache structure. 323 Unfortunately, there is no portable way to discover cache topology, and it is outside the scope of this thesis to solve this problem. 324 This work uses the cache topology information from Linux's @/sys/devices/system/cpu@ directory. 325 This leaves the challenge of matching \procs to cache structure, or more precisely identifying which subqueues of the ready queue are local to which subcomponents of the cache structure. 326 Once a matching is generated, the helping algorithm is changed to add bias so that \procs more often help subqueues local to the same cache substructure.\footnote{ 327 Note that like other biases mentioned in this section, the actual bias value does not appear to need precise tuning.} 328 329 The simplest approach for mapping subqueues to cache structure is to statically tie subqueues to CPUs. 330 Instead of having each subqueue local to a specific \proc, the system is initialized with subqueues for each hardware hyperthread/core up front. 331 Then \procs dequeue and enqueue by first asking which CPU id they are executing on, in order to identify which subqueues are the local ones. 332 \Glspl{proc} can get the CPU id from @sched_getcpu@ or @librseq@. 333 334 This approach solves the performance problems on systems with topologies with narrow L3 caches, similar to Figure \ref{fig:cache-noshare}. 335 However, it can still cause some subtle fairness problems in systems with few \procs and many \glspl{hthrd}. 
336 In this case, the large number of subqueues and the bias against subqueues tied to different cache substructures make it unlikely that every subqueue is picked. 337 To make things worst, the small number of \procs mean that few helping attempts are made. 338 This combination of low selection and few helping attempts allow a \at to become stranded on a subqueue for a long time until it gets randomly helped. 269 Building a scheduler that is aware of cache topology poses two main challenges: discovering cache topology and matching \procs to cache instance. 270 Sadly, there is no standard portable way to discover cache topology in C. 271 Therefore, while this is a significant portability challenge, it is outside the scope of this thesis to design a cross-platform cache discovery mechanisms. 272 The rest of this work assumes discovering the cache topology based on Linux's \texttt{/sys/devices/system/cpu} directory. 273 This leaves the challenge of matching \procs to cache instance, or more precisely identifying which subqueues of the ready queue are local to which cache instance. 274 Once this matching is available, the helping algorithm can be changed to add bias so that \procs more often help subqueues local to the same cache instance 275 \footnote{Note that like other biases mentioned in this section, the actual bias value does not appear to need precise tuinng.}. 276 277 The obvious approach to mapping cache instances to subqueues is to statically tie subqueues to CPUs. 278 Instead of having each subqueue local to a specific \proc, the system is initialized with subqueues for each \glspl{hthrd} up front. 279 Then \procs dequeue and enqueue by first asking which CPU id they are local to, in order to identify which subqueues are the local ones. 280 \Glspl{proc} can get the CPU id from \texttt{sched\_getcpu} or \texttt{librseq}. 281 282 This approach solves the performance problems on systems with topologies similar to Figure~\ref{fig:cache-noshare}. 
283 However, it actually causes some subtle fairness problems in some systems, specifically systems with few \procs and many \glspl{hthrd}. 284 In these cases, the large number of subqueues and the bias against subqueues tied to different cache instances make it so it is very unlikely any single subqueue is picked. 285 To make things worse, the small number of \procs means that few helping attempts will be made. 286 This combination of few attempts and low chances makes it so a \at stranded on a subqueue that is not actively dequeued from may wait very long before it gets randomly helped. 339 287 On a system with 2 \procs, 256 \glspl{hthrd} with narrow cache sharing, and a 100:1 bias, it can actually take multiple seconds for a \at to get dequeued from a remote queue. 340 288 Therefore, a more dynamic matching of subqueues to cache instance is needed. 341 289 342 290 \subsection{Topological Work Stealing} 343 \label{s:TopologicalWorkStealing} 344 Therefore, the approach used in the \CFA scheduler is to have per-\proc subqueues, but have an explicit data-structure track which cache substructure each subqueue is tied to. 345 This tracking requires some finesse because reading this data structure must lead to fewer cache misses than not having the data structure in the first place. 291 The approach that is used in the \CFA scheduler is to have per-\proc subqueues, but have an explicit data-structure track which cache instance each subqueue is tied to. 292 This requires some finesse because reading this data structure must lead to fewer cache misses than not having the data structure in the first place. 346 293 A key element however is that, like the timestamps for helping, reading the cache instance mapping only needs to give the correct result \emph{often enough}. 
347 Therefore the algorithm can be built as follows: before enqueueing or dequeuing a \at, each \proc queries the CPU id and the corresponding cache instance. 294 Therefore the algorithm can be built as follows: Before enqueuing or dequeuing a \at, each \proc queries the CPU id and the corresponding cache instance. 348 295 Since subqueues are tied to \procs, each \proc can then update the cache instance mapped to the local subqueue(s). 349 296 To avoid unnecessary cache line invalidation, the map is only written to if the mapping changes. 350 297 351 This scheduler is used in the remainder of the thesis for managing CPU execution, but additional scheduling is needed to handle long-term blocking and unblocking, such as I/O. 352 -
doc/theses/thierry_delisle_PhD/thesis/text/eval_micro.tex
rffec1bf r9e23b446 1 1 \chapter{Micro-Benchmarks}\label{microbench} 2 2 3 The first step in evaluating this work is to test-out small controlled cases to ensure the basics workproperly.4 This chapterpresents five different experimental setup, evaluating some of the basic features of \CFA's scheduler.3 The first step of evaluation is always to test-out small controlled cases, to ensure that the basics are working properly. 4 This sections presents five different experimental setup, evaluating some of the basic features of \CFA's scheduler. 5 5 6 6 \section{Benchmark Environment} 7 All benchmarks are run on two distinct hardware platforms. 8 \begin{description} 9 \item[AMD] is a server with two AMD EPYC 7662 CPUs and 256GB of DDR4 RAM. 10 The EPYC CPU has 64 cores with 2 \glspl{hthrd} per core, for 128 \glspl{hthrd} per socket with 2 sockets for a total of 256 \glspl{hthrd}. 11 Each CPU has 4 MB, 64 MB and 512 MB of L1, L2 and L3 caches, respectively. 7 All of these benchmarks are run on two distinct hardware environment, an AMD and an INTEL machine. 8 9 For all benchmarks, \texttt{taskset} is used to limit the experiment to 1 NUMA Node with no hyper threading. 10 If more \glspl{hthrd} are needed, then 1 NUMA Node with hyperthreading is used. 11 If still more \glspl{hthrd} are needed then the experiment is limited to as few NUMA Nodes as needed. 12 13 14 \paragraph{AMD} The AMD machine is a server with two AMD EPYC 7662 CPUs and 256GB of DDR4 RAM. 15 The server runs Ubuntu 20.04.2 LTS on top of Linux Kernel 5.8.0-55. 16 These EPYCs have 64 cores per CPUs and 2 \glspl{hthrd} per core, for a total of 256 \glspl{hthrd}. 17 The cpus each have 4 MB, 64 MB and 512 MB of L1, L2 and L3 caches respectively. 12 18 Each L1 and L2 instance are only shared by \glspl{hthrd} on a given core, but each L3 instance is shared by 4 cores, therefore 8 \glspl{hthrd}. 19 20 \paragraph{Intel} The Intel machine is a server with four Intel Xeon Platinum 8160 CPUs and 384GB of DDR4 RAM. 
13 21 The server runs Ubuntu 20.04.2 LTS on top of Linux Kernel 5.8.0-55. 14 15 \item[Intel] is a server with four Intel Xeon Platinum 8160 CPUs and 384GB of DDR4 RAM. 16 The Xeon CPU has 24 cores with 2 \glspl{hthrd} per core, for 48 \glspl{hthrd} per socket with 4 sockets for a total of 192 \glspl{hthrd}. 17 Each CPU has 3 MB, 96 MB and 132 MB of L1, L2 and L3 caches respectively. 22 These Xeon Platinums have 24 cores per CPU and 2 \glspl{hthrd} per core, for a total of 192 \glspl{hthrd}. 23 The CPUs each have 3 MB, 96 MB and 132 MB of L1, L2 and L3 caches respectively. 18 24 Each L1 and L2 instance is only shared by \glspl{hthrd} on a given core, but each L3 instance is shared across the entire CPU, therefore 48 \glspl{hthrd}. 19 The server runs Ubuntu 20.04.2 LTS on top of Linux Kernel 5.8.0-55. 20 \end{description} 21 22 For all benchmarks, @taskset@ is used to limit the experiment to 1 NUMA Node with no hyper threading. 23 If more \glspl{hthrd} are needed, then 1 NUMA Node with hyperthreading is used. 24 If still more \glspl{hthrd} are needed, then the experiment is limited to as few NUMA Nodes as needed. 25 26 The limited sharing of the last-level cache on the AMD machine is markedly different than the Intel machine. 27 Indeed, while on both architectures L2 cache misses that are served by L3 caches on a different CPU incur a significant latency, on the AMD it is also the case that cache misses served by a different L3 instance on the same CPU still incur high latency. 25 26 This limited sharing of the last level cache on the AMD machine is markedly different than the Intel machine. Indeed, while on both architectures L2 cache misses that are served by L3 caches on a different CPU incur a significant latency, on AMD it is also the case that cache misses served by a different L3 instance on the same CPU still incur high latency. 
28 27 29 28 … … 35 34 \label{fig:cycle} 36 35 \end{figure} 37 The most basic evaluation of any ready queue is to evaluate the latency needed to push and pop one element from the ready queue. 38 Since these two operation also describe a @yield@ operation, many systems use this operation as the most basic benchmark. 39 However, yielding can be treated as a special case by optimizing it away (dead code) since the number of ready \glspl{at} does not change. 40 Not all systems perform this optimization, but those that do have an artificial performance benefit because the yield becomes a \emph{nop}. 41 For this reason, I chose a different first benchmark, called \newterm{Cycle Benchmark}. 42 This benchmark arranges a number of \glspl{at} into a ring, as seen in Figure~\ref{fig:cycle}, where the ring is a circular singly-linked list. 36 The most basic evaluation of any ready queue is to evaluate the latency needed to push and pop one element from the ready-queue. 37 Since these two operation also describe a \texttt{yield} operation, many systems use this as the most basic benchmark. 38 However, yielding can be treated as a special case, since it also carries the information that the number of the ready \glspl{at} will not change. 39 Not all systems use this information, but those which do may appear to have better performance than they would for disconnected push/pop pairs. 40 For this reason, I chose a different first benchmark, which I call the Cycle Benchmark. 41 This benchmark arranges many \glspl{at} into multiple rings of \glspl{at}. 42 Each ring is effectively a circular singly-linked list. 43 43 At runtime, each \gls{at} unparks the next \gls{at} before parking itself. 44 Unparking the next \gls{at} pushes that \gls{at} onto the ready queue as does the ensuing park. 45 46 Hence, the underlying runtime cannot rely on the number of ready \glspl{at} staying constant over the duration of the experiment. 
44 This corresponds to the desired pair of ready queue operations. 45 Unparking the next \gls{at} requires pushing that \gls{at} onto the ready queue and the ensuing park will cause the runtime to pop a \gls{at} from the ready-queue. 46 Figure~\ref{fig:cycle} shows a visual representation of this arrangement. 47 48 The goal of this ring is that the underlying runtime cannot rely on the guarantee that the number of ready \glspl{at} will stay constant over the duration of the experiment. 47 49 In fact, the total number of \glspl{at} waiting on the ready queue is expected to vary because of the race between the next \gls{at} unparking and the current \gls{at} parking. 48 That is, the runtime cannot anticipate that the current task will immediately park. 49 As well, the size of the cycle is also decided based on this race, \eg a small cycle may see the chain of unparks go full circle before the first \gls{at} parks because of time-slicing or multiple \procs. 50 Every runtime system must handle this race and cannot optimized away the ready-queue pushes and pops. 51 To prevent any attempt of silently omitting ready-queue operations, the ring of \glspl{at} is made big enough so the \glspl{at} have time to fully park before being unparked again. 52 (Note, an unpark is like a V on a semaphore, so the subsequent park (P) may not block.) 53 Finally, to further mitigate any underlying push/pop optimizations, especially on SMP machines, multiple rings are created in the experiment. 54 55 To avoid this benchmark being affected by idle-sleep handling, the number of rings is multiple times greater than the number of \glspl{proc}. 56 This design avoids the case where one of the \glspl{proc} runs out of work because of the variation on the number of ready \glspl{at} mentioned above. 57 58 Figure~\ref{fig:cycle:code} shows the pseudo code for this benchmark. 
59 There is additional complexity to handle termination (not shown), which requires a binary semaphore or a channel instead of raw @park@/@unpark@ and carefully picking the order of the @P@ and @V@ with respect to the loop condition. 60 61 \begin{figure} 62 \begin{cfa} 63 Thread.main() { 64 count := 0 65 for { 66 @wait()@ 67 @this.next.wake()@ 68 count ++ 69 if must_stop() { break } 70 } 71 global.count += count 72 } 73 \end{cfa} 74 \caption[Cycle Benchmark : Pseudo Code]{Cycle Benchmark : Pseudo Code} 75 \label{fig:cycle:code} 76 \end{figure} 50 The size of the cycle is also decided based on this race: cycles that are too small may see the chain of unparks go full circle before the first \gls{at} can park. 51 While this would not be a correctness problem, every runtime system must handle that race, it could lead to pushes and pops being optimized away. 52 Since silently omitting ready-queue operations would throw off the measuring of these operations, the ring of \glspl{at} must be big enough so the \glspl{at} have the time to fully park before they are unparked. 53 Note that this problem is only present on SMP machines and is significantly mitigated by the fact that there are multiple rings in the system. 54 55 To avoid this benchmark from being dominated by the idle sleep handling, the number of rings is kept at least as high as the number of \glspl{proc} available. 56 Beyond this point, adding more rings serves to mitigate even more the idle sleep handling. 57 This is to avoid the case where one of the \glspl{proc} runs out of work because of the variation on the number of ready \glspl{at} mentionned above. 58 59 The actual benchmark is more complicated to handle termination, but that simply requires using a binary semphore or a channel instead of raw \texttt{park}/\texttt{unpark} and carefully picking the order of the \texttt{P} and \texttt{V} with respect to the loop condition. 60 Figure~\ref{fig:cycle:code} shows pseudo code for this benchmark. 
61 62 \begin{figure} 63 \begin{lstlisting} 64 Thread.main() { 65 count := 0 66 for { 67 wait() 68 this.next.wake() 69 count ++ 70 if must_stop() { break } 71 } 72 global.count += count 73 } 74 \end{lstlisting} 75 \caption[Cycle Benchmark : Pseudo Code]{Cycle Benchmark : Pseudo Code} 76 \label{fig:cycle:code} 77 \end{figure} 78 79 77 80 78 81 \subsection{Results} 79 Figure~\ref{fig:cycle:jax} shows the throughput as a function of \proc count, where each run uses 100 cycles per \proc and 5 \ats per cycle.80 81 82 \begin{figure} 82 83 \subfloat[][Throughput, 100 \ats per \proc]{ … … 105 106 \label{fig:cycle:jax:low:ns} 106 107 } 107 \caption[Cycle Benchmark on Intel]{Cycle Benchmark on Intel\smallskip\newline Throughput as a function of \proc count with 100 cycles per \proc and5 \ats per cycle.}108 \caption[Cycle Benchmark on Intel]{Cycle Benchmark on Intel\smallskip\newline Throughput as a function of \proc count, using 100 cycles per \proc, 5 \ats per cycle.} 108 109 \label{fig:cycle:jax} 109 110 \end{figure} 111 Figure~\ref{fig:cycle:jax} shows the throughput as a function of \proc count, with the following constants: 112 Each run uses 100 cycles per \proc, 5 \ats per cycle. 
110 113 111 114 \todo{results discussion} 112 115 113 116 \section{Yield} 114 For completion, the classic yield benchmark is included.115 This benchmark is simpler than the cycle test: it creates many \glspl{at} that call @yield@.116 As mention ed, this benchmark may not be representative because of optimization shortcuts in @yield@.117 The only interesting variable in this benchmark is the number of \glspl{at} per \glspl{proc}, where ratios close to 1 means the ready queue(s) canbe empty.118 This s cenario can put a strain on the idle-sleep handling compared to scenarios where there is plenty of work.119 Figure~\ref{fig:yield:code} shows pseudo code for this benchmark, where the @wait/next.wake@ is replaced by @yield@.120 121 \begin{figure} 122 \begin{cfa}123 Thread.main() {124 count := 0125 for {126 @yield()@127 count ++128 if must_stop() { break }129 }130 global.count += count131 }132 \end{cfa}133 \caption[Yield Benchmark : Pseudo Code]{Yield Benchmark : Pseudo Code}134 \label{fig:yield:code}117 For completion, I also include the yield benchmark. 118 This benchmark is much simpler than the cycle tests, it simply creates many \glspl{at} that call \texttt{yield}. 119 As mentionned in the previous section, this benchmark may be less representative of usages that only make limited use of \texttt{yield}, due to potential shortcuts in the routine. 120 Its only interesting variable is the number of \glspl{at} per \glspl{proc}, where ratios close to 1 means the ready queue(s) could be empty. 121 This sometimes puts more strain on the idle sleep handling, compared to scenarios where there is clearly plenty of work to be done. 122 Figure~\ref{fig:yield:code} shows pseudo code for this benchmark, the ``wait/wake-next'' is simply replaced by a yield. 
123 124 \begin{figure} 125 \begin{lstlisting} 126 Thread.main() { 127 count := 0 128 for { 129 yield() 130 count ++ 131 if must_stop() { break } 132 } 133 global.count += count 134 } 135 \end{lstlisting} 136 \caption[Yield Benchmark : Pseudo Code]{Yield Benchmark : Pseudo Code} 137 \label{fig:yield:code} 135 138 \end{figure} 136 139 137 140 \subsection{Results} 138 139 Figure~\ref{fig:yield:jax} shows the throughput as a function of \proc count, where each run uses 100 \ats per \proc.140 141 141 \begin{figure} 142 142 \subfloat[][Throughput, 100 \ats per \proc]{ … … 168 168 \label{fig:yield:jax} 169 169 \end{figure} 170 Figure~\ref{fig:yield:ops:jax} shows the throughput as a function of \proc count, with the following constants: 171 Each run uses 100 \ats per \proc. 170 172 171 173 \todo{results discussion} 172 174 175 173 176 \section{Churn} 174 The Cycle and Yield benchmark represent an \emph{easy} scenario for a scheduler, \egan embarrassingly parallel application.175 In these benchmarks, \glspl{at} can be easily partitioned over the different \glspl{proc} up front and none of the \glspl{at} communicate with each other.176 177 The Churn benchmark represents more chaotic execution, where there is no relation between the last \gls{proc} on which a \gls{at} ran and blocked and the \gls{proc} that subsequently unblocksit.178 W ith processor-specific ready-queues, when a \gls{at} is unblocked by a different \gls{proc} that means the unblocking \gls{proc} must either ``steal'' the \gls{at} from another processor or find it on a globalqueue.179 This dequeuing results in either contention on the remote queue and/or \glspl{rmr} on \gls{at} data structure.180 In either case, this benchmark aims to highlight how each scheduler handles these cases, since both cases can lead to performance degradation if not handled correctly.181 182 T his benchmark uses a fixed-size array of countingsemaphores.183 Each \gls{at} picks a random semaphore, @V@s it to unblock any \at waiting, 
and then @P@s on the semaphore.177 The Cycle and Yield benchmark represents an ``easy'' scenario for a scheduler, \eg, an embarrassingly parallel application. 178 In these benchmarks, \glspl{at} can be easily partitioned over the different \glspl{proc} up-front and none of the \glspl{at} communicate with each other. 179 180 The Churn benchmark represents more chaotic usages, where there is no relation between the last \gls{proc} on which a \gls{at} ran and the \gls{proc} that unblocked it. 181 When a \gls{at} is unblocked from a different \gls{proc} than the one on which it last ran, the unblocking \gls{proc} must either ``steal'' the \gls{at} or place it on a remote queue. 182 This results can result in either contention on the remote queue or \glspl{rmr} on \gls{at} data structure. 183 In either case, this benchmark aims to highlight how each scheduler handles these cases, since both cases can lead to performance degradation if they are not handled correctly. 184 185 To achieve this the benchmark uses a fixed size array of semaphores. 186 Each \gls{at} picks a random semaphore, \texttt{V}s it to unblock a \at waiting and then \texttt{P}s on the semaphore. 184 187 This creates a flow where \glspl{at} push each other out of the semaphores before being pushed out themselves. 185 For this benchmark to work, the number of \glspl{at} must be equal or greater than the number of semaphores plus the number of \glspl{proc}. 186 Note, the nature of these semaphores mean the counter can go beyond 1, which can lead to nonblocking calls to @P@. 187 Figure~\ref{fig:churn:code} shows pseudo code for this benchmark, where the @yield@ is replaced by @V@ and @P@. 
188 189 \begin{figure} 190 \begin{cfa} 191 Thread.main() { 192 count := 0 193 for { 194 r := random() % len(spots) 195 @spots[r].V()@ 196 @spots[r].P()@ 197 count ++ 198 if must_stop() { break } 199 } 200 global.count += count 201 } 202 \end{cfa} 203 \caption[Churn Benchmark : Pseudo Code]{Churn Benchmark : Pseudo Code} 204 \label{fig:churn:code} 205 \end{figure} 206 207 \subsection{Results} 208 Figure~\ref{fig:churn:jax} shows the throughput as a function of \proc count, where each run uses 100 cycles per \proc and 5 \ats per cycle. 188 For this benchmark to work however, the number of \glspl{at} must be equal or greater to the number of semaphores plus the number of \glspl{proc}. 189 Note that the nature of these semaphores mean the counter can go beyond 1, which could lead to calls to \texttt{P} not blocking. 190 191 \todo{code, setup, results} 192 \begin{lstlisting} 193 Thread.main() { 194 count := 0 195 for { 196 r := random() % len(spots) 197 spots[r].V() 198 spots[r].P() 199 count ++ 200 if must_stop() { break } 201 } 202 global.count += count 203 } 204 \end{lstlisting} 209 205 210 206 \begin{figure} … … 234 230 \label{fig:churn:jax:low:ns} 235 231 } 236 \caption[Churn Benchmark on Intel]{\centering Churn Benchmark on Intel\smallskip\newline Throughput and latency of the Churn on the benchmark on the Intel machine. 237 Throughput is the total operation per second across all cores. Latency is the duration of each operation.} 232 \caption[Churn Benchmark on Intel]{\centering Churn Benchmark on Intel\smallskip\newline Throughput and latency of the Churn on the benchmark on the Intel machine. Throughput is the total operation per second across all cores. 
Latency is the duration of each operation.} 238 233 \label{fig:churn:jax} 239 234 \end{figure} 240 235 241 \todo{results discussion} 242 243 236 \section{Locality} 244 237 … … 246 239 247 240 \section{Transfer} 248 The last benchmark is more of an experiment than a benchmark. 249 It tests the behaviour of the schedulers for a misbehaved workload. 241 The last benchmark is more exactly characterized as an experiment than a benchmark. 242 It tests the behavior of the schedulers for a particularly misbehaved workload. 250 243 In this workload, one of the \glspl{at} is selected at random to be the leader. 251 244 The leader then spins in a tight loop until it has observed that all other \glspl{at} have acknowledged its leadership. 252 245 The leader \gls{at} then picks a new \gls{at} to be the ``spinner'' and the cycle repeats. 253 The benchmark comes in two flavours for the non-leader \glspl{at}: 254 once they acknowledged the leader, they either block on a semaphore or spin yielding. 255 256 The experiment is designed to evaluate the short-term load-balancing of a scheduler. 257 Indeed, schedulers where the runnable \glspl{at} are partitioned on the \glspl{proc} may need to balance the \glspl{at} for this experiment to terminate. 258 This problem occurs because the spinning \gls{at} is effectively preventing the \gls{proc} from running any other \glspl{thrd}. 259 In the semaphore flavour, the number of runnable \glspl{at} eventually dwindles down to only the leader. 260 This scenario is a simpler case to handle for schedulers since \glspl{proc} eventually run out of work. 246 247 The benchmark comes in two flavours for the behavior of the non-leader \glspl{at}: 248 once they acknowledged the leader, they either block on a semaphore or yield repeatedly. 249 250 This experiment is designed to evaluate the short term load balancing of the scheduler. 
251 Indeed, schedulers where the runnable \glspl{at} are partitioned on the \glspl{proc} may need to balance the \glspl{at} for this experiment to terminate. 252 This is because the spinning \gls{at} is effectively preventing the \gls{proc} from running any other \glspl{thrd}. 253 In the semaphore flavour, the number of runnable \glspl{at} will eventually dwindle down to only the leader. 254 This is a simpler case to handle for schedulers since \glspl{proc} eventually run out of work. 261 255 In the yielding flavour, the number of runnable \glspl{at} stays constant. 262 This scenario is a harder case to handle because corrective measures must be taken even when work is available. 263 Note, runtime systems with preemption circumvent this problem by forcing the spinner to yield. 256 This is a harder case to handle because corrective measures must be taken even if work is still available. 257 Note that languages that have mandatory preemption do circumvent this problem by forcing the spinner to yield. 264 258 265 259 \todo{code, setup, results} 266 267 \begin{figure} 268 \begin{cfa} 269 Thread.lead() { 270 this.idx_seen = ++lead_idx 271 if lead_idx > stop_idx { 272 done := true 273 return 274 } 275 276 // Wait for everyone to acknowledge my leadership 277 start: = timeNow() 278 for t in threads { 279 while t.idx_seen != lead_idx { 280 asm pause 281 if (timeNow() - start) > 5 seconds { error() } 282 } 283 } 284 285 // pick next leader 286 leader := threads[ prng() % len(threads) ] 287 288 // wake every one 289 if ! 
exhaust { 260 \begin{lstlisting} 261 Thread.lead() { 262 this.idx_seen = ++lead_idx 263 if lead_idx > stop_idx { 264 done := true 265 return 266 } 267 268 // Wait for everyone to acknowledge my leadership 269 start: = timeNow() 290 270 for t in threads { 291 if t != me { t.wake() } 292 } 293 } 294 } 295 296 Thread.wait() { 297 this.idx_seen := lead_idx 298 if exhaust { wait() } 299 else { yield() } 300 } 301 302 Thread.main() { 303 while !done { 304 if leader == me { this.lead() } 305 else { this.wait() } 306 } 307 } 308 \end{cfa} 309 \caption[Transfer Benchmark : Pseudo Code]{Transfer Benchmark : Pseudo Code} 310 \label{fig:transfer:code} 311 \end{figure} 312 313 \subsection{Results} 314 Figure~\ref{fig:transfer:jax} shows the throughput as a function of \proc count, where each run uses 100 cycles per \proc and 5 \ats per cycle. 315 316 \todo{results discussion} 271 while t.idx_seen != lead_idx { 272 asm pause 273 if (timeNow() - start) > 5 seconds { error() } 274 } 275 } 276 277 // pick next leader 278 leader := threads[ prng() % len(threads) ] 279 280 // wake every one 281 if !exhaust { 282 for t in threads { 283 if t != me { t.wake() } 284 } 285 } 286 } 287 288 Thread.wait() { 289 this.idx_seen := lead_idx 290 if exhaust { wait() } 291 else { yield() } 292 } 293 294 Thread.main() { 295 while !done { 296 if leader == me { this.lead() } 297 else { this.wait() } 298 } 299 } 300 \end{lstlisting} -
doc/theses/thierry_delisle_PhD/thesis/text/existing.tex
rffec1bf r9e23b446 1 1 \chapter{Previous Work}\label{existing} 2 As stated, scheduling is the process of assigning resources to incoming requests, where the common example is assigning available workers to work requests or vice versa. 3 Common scheduling examples in Computer Science are: operating systems and hypervisors schedule available CPUs, NICs schedule available bandwidth, virtual memory and memory allocator schedule available storage, \etc. 4 Scheduling is also common in most other fields, \eg in assembly lines, assigning parts to line workers is a form of scheduling. 2 Scheduling is the process of assigning resources to incomming requests. 3 A very common form of this is assigning available workers to work-requests. 4 The need for scheduling is very common in Computer Science, \eg Operating Systems and Hypervisors schedule available CPUs, NICs schedule available bamdwith, but scheduling is also common in other fields. 5 For example, in assmebly lines assigning parts in need of assembly to line workers is a form of scheduling. 
5 6 6 In general, \emph{selecting} a scheduling algorithm depends on how much information is available to the scheduler. 7 Workloads that are well-known, consistent, and homogeneous can benefit from a scheduler that is optimized to use this information, while ill-defined, inconsistent, heterogeneous workloads require general non-optimal algorithms. 8 A secondary aspect is how much information can be gathered versus how much information must be given as part of the scheduler input. 9 This information adds to the spectrum of scheduling algorithms, going from static schedulers that are well informed from the start, to schedulers that gather most of the information needed, to schedulers that can only rely on very limited information. 10 Note, this description includes both information about each request, \eg time to complete or resources needed, and information about the relationships among requests, \eg whether or not some request must be completed before another request starts. 7 In all these cases, the choice of a scheduling algorithm generally depends first and foremost on how much information is available to the scheduler. 8 Workloads that are well-known, consistent and homogeneous can benefit from a scheduler that is optimized to use this information while ill-defined inconsistent heterogeneous workloads will require general algorithms. 9 A secondary aspect to that is how much information can be gathered versus how much information must be given as part of the input. 10 There is therefore a spectrum of scheduling algorithms, going from static schedulers that are well informed from the start, to schedulers that gather most of the information needed, to schedulers that can only rely on very limited information. 11 Note that this description includes both information about each request, \eg time to complete or resources needed, and information about the relationships between requests, \eg whether or not some request must be completed before another request starts. 
11 12 12 Scheduling physical resources, \eg in an assembly line, is generally amenable to using well-informed scheduling, since information can be gathered much faster than the physical resources can be assigned and workloads are likely to stay stable for long periods of time. 13 Scheduling physical resources, for example in assembly lines, is generally amenable to using very well informed scheduling since information can be gathered much faster than the physical resources can be assigned and workloads are likely to stay stable for long periods of time. 13 14 When a faster pace is needed and changes are much more frequent, gathering information on workloads, up-front or live, can become much more limiting and more general schedulers are needed. 14 15 15 16 \section{Naming Convention} 16 Scheduling has been studied by various communities concentrating on different incarnations of the same problems. 17 As a result, there are no standard naming conventions for scheduling that are respected across these communities. 18 This document uses the term \newterm{\Gls{at}} to refer to the abstract objects being scheduled and the term \newterm{\Gls{proc}} to refer to the concrete objects executing these \ats. 17 Scheduling has been studied by various different communities concentrating on different incarnations of the same problems. As a result, there is no real naming convention for scheduling that is respected across these communities. For this document, I will use the term \newterm{\Gls{at}} to refer to the abstract objects being scheduled and the term \newterm{\Gls{proc}} to refer to the objects which will execute these \glspl{at}. 
19 18 20 19 \section{Static Scheduling} 21 \newterm{Static schedulers} require \ats dependencies and costs be explicitly and exhaustively specified prior to scheduling. 22 The scheduler then processes this input ahead of time and produces a \newterm{schedule} the system follows during execution. 23 This approach is popular in real-time systems since the need for strong guarantees justifies the cost of determining and supplying this information. 24 In general, static schedulers are less relevant to this project because they require input from the programmers that the programming language does not have as part of its concurrency semantics. 25 Specifying this information explicitly adds a significant burden to the programmer and reduces flexibility. 26 For this reason, the \CFA scheduler does not require this information. 20 Static schedulers require that \glspl{at} have their dependencies and costs explicitly and exhaustively specified prior to scheduling. 21 The scheduler then processes this input ahead of time and produces a \newterm{schedule} to which the system can later adhere. 22 This approach is generally popular in real-time systems since the need for strong guarantees justifies the cost of supplying this information. 23 In general, static schedulers are less relevant to this project since they require input from the programmers that \CFA does not have as part of its concurrency semantics. 24 Specifying this information explicitly can add a significant burden on the programmers and reduces flexibility; for this reason, the \CFA scheduler does not require this information. 25 27 26 28 27 \section{Dynamic Scheduling} 29 \newterm{Dynamic schedulers} determine \ats dependencies and costs during scheduling, if at all. 30 Hence, unlike static scheduling, \ats dependencies are conditional and detected at runtime. 
31 This detection takes the form of observing new \ats(s) in the system and determining dependencies from their behaviour, including suspending or halting a \ats that dynamically detects unfulfilled dependencies. 32 Furthermore, each \ats has the responsibility of adding dependent \ats back into the system once dependencies are fulfilled. 33 As a consequence, the scheduler often has an incomplete view of the system, seeing only \ats with no pending dependencies. 28 It may be difficult to fulfill the requirements of a static scheduler if dependencies are conditional. In this case, it may be preferable to detect dependencies at runtime. This detection effectively takes the form of adding one or more new \gls{at}(s) to the system as their dependencies are resolved, as well as potentially halting or suspending a \gls{at} that dynamically detects unfulfilled dependencies. Each \gls{at} has the responsibility of adding the dependent \glspl{at} back in the system once completed. As a consequence, the scheduler may have an incomplete view of the system, seeing only \glspl{at} with no pending dependencies. Schedulers that support this detection at runtime are referred to as \newterm{Dynamic Schedulers}. 34 29 35 30 \subsection{Explicitly Informed Dynamic Schedulers} 36 While dynamic schedulers may not have an exhaustive list of dependencies for a \ats, some information may be available about each \ats, \eg expected duration, required resources, relative importance, \etc. 37 When available, a scheduler can then use this information to direct the scheduling decisions. \cit{Examples of schedulers with more information} 38 However, most programmers do not determine or even \emph{predict} this information; 39 at best, the scheduler has only some imprecise information provided by the programmer, \eg, indicating a \ats takes approximately 3--7 seconds to complete, rather than exactly 5 seconds. 
40 Providing this kind of information is a significant programmer burden especially if the information does not scale with the number of \ats and their complexity. 41 For example, providing an exhaustive list of files read by 5 \ats is an easier requirement then providing an exhaustive list of memory addresses accessed by 10,000 independent \ats. 31 While dynamic schedulers do not have access to an exhaustive list of dependencies for a \gls{at}, they may require to provide more or less information about each \gls{at}, including for example: expected duration, required ressources, relative importance, etc. The scheduler can then use this information to direct the scheduling decisions. \cit{Examples of schedulers with more information} Precisely providing this information can be difficult for programmers, especially \emph{predicted} behaviour, and the scheduler may need to support some amount of imprecision in the provided information. For example, specifying that a \glspl{at} takes approximately 5 seconds to complete, rather than exactly 5 seconds. User provided information can also become a significant burden depending how the effort to provide the information scales with the number of \glspl{at} and there complexity. For example, providing an exhaustive list of files read by 5 \glspl{at} is an easier requirement the providing an exhaustive list of memory addresses accessed by 10'000 distinct \glspl{at}. 42 32 43 Since the goal of this thesis is to provide a scheduler as a replacement for \CFA's existing \emph{uninformed} scheduler, explicitly informed schedulers are less relevant to this project. Nevertheless, some strategies are worth mentioning.33 Since the goal of this thesis is to provide a scheduler as a replacement for \CFA's existing \emph{uninformed} scheduler, Explicitly Informed schedulers are less relevant to this project. Nevertheless, some strategies are worth mentionnding. 
44 34 45 \subsubsection{Priority Scheduling} 46 Common information used by schedulers to direct their algorithm is priorities. 47 Each \ats is given a priority and higher-priority \ats are preferred to lower-priority ones. 48 The simplest priority scheduling algorithm is to require that every \ats have a distinct pre-established priority and always run the available \ats with the highest priority. 49 Asking programmers to provide an exhaustive set of unique priorities can be prohibitive when the system has a large number of \ats. 50 It can therefore be desirable for schedulers to support \ats with identical priorities and/or automatically setting and adjusting priorities for \ats. 51 Most common operating systems use some variant on priorities with overlaps and dynamic priority adjustments. 52 For example, Microsoft Windows uses a pair of priorities 35 \subsubsection{Prority Scheduling} 36 A commonly used information that schedulers used to direct the algorithm is priorities. Each Task is given a priority and higher-priority \glspl{at} are preferred to lower-priority ones. The simplest priority scheduling algorithm is to simply require that every \gls{at} have a distinct pre-established priority and always run the available \gls{at} with the highest priority. Asking programmers to provide an exhaustive set of unique priorities can be prohibitive when the system has a large number of \glspl{at}. It can therefore be diserable for schedulers to support \glspl{at} with identical priorities and/or automatically setting and adjusting priorites for \glspl{at}. The most common operating some variation on priorities with overlaps and dynamic priority adjustments. 
For example, Microsoft Windows uses a pair of priorities 53 37 \cit{https://docs.microsoft.com/en-us/windows/win32/procthread/scheduling-priorities,https://docs.microsoft.com/en-us/windows/win32/taskschd/taskschedulerschema-priority-settingstype-element}, one specified by users out of ten possible options and one adjusted by the system. 54 38 55 39 \subsection{Uninformed and Self-Informed Dynamic Schedulers} 56 Several scheduling algorithms do not require programmers to provide addition al information on each \ats, and instead make scheduling decisions based solely on internal state and/or information implicitly gathered by the scheduler.40 Several scheduling algorithms do not require programmers to provide additionnal information on each \gls{at}, and instead make scheduling decisions based solely on internal state and/or information implicitly gathered by the scheduler. 57 41 58 42 59 43 \subsubsection{Feedback Scheduling} 60 As mentioned, schedulers may also gather information about each \ats to direct their decisions. 61 This design effectively moves the scheduler into the realm of \newterm{Control Theory}~\cite{wiki:controltheory}. 62 This information gathering does not generally involve programmers, and as such, does not increase programmer burden the same way explicitly provided information may. 63 However, some feedback schedulers do allow programmers to offer additional information on certain \ats, in order to direct scheduling decisions. 64 The important distinction being whether or not the scheduler can function without this additional information. 44 As mentionned, Schedulers may also gather information about each \glspl{at} to direct their decisions. This design effectively moves the scheduler to some extent into the realm of \newterm{Control Theory}\cite{wiki:controltheory}. This gathering does not generally involve programmers and as such does not increase programmer burden the same way explicitly provided information may. 
However, some feedback schedulers do offer the option to programmers to offer additionnal information on certain \glspl{at}, in order to direct scheduling decision. The important distinction being whether or not the scheduler can function without this additionnal information. 65 45 66 46 67 47 \section{Work Stealing}\label{existing:workstealing} 68 One of the most popular scheduling algorithm in practice (see~\ref{existing:prod}) is work stealing. 69 This idea, introduce by \cite{DBLP:conf/fpca/BurtonS81}, effectively has each worker process its local \ats first, but allows the possibility for other workers to steal local \ats if they run out of \ats. 70 \cite{DBLP:conf/focs/Blumofe94} introduced the more familiar incarnation of this, where each workers has a queue of \ats and workers without \ats steal \ats from random workers\footnote{The Burton and Sleep algorithm had trees of \ats and steal only among neighbours.}. 71 Blumofe and Leiserson also prove worst case space and time requirements for well-structured computations. 48 One of the most popular scheduling algorithm in practice (see~\ref{existing:prod}) is work-stealing. This idea, introduce by \cite{DBLP:conf/fpca/BurtonS81}, effectively has each worker work on its local \glspl{at} first, but allows the possibility for other workers to steal local \glspl{at} if they run out of \glspl{at}. \cite{DBLP:conf/focs/Blumofe94} introduced the more familiar incarnation of this, where each workers has queue of \glspl{at} to accomplish and workers without \glspl{at} steal \glspl{at} from random workers. (The Burton and Sleep algorithm had trees of \glspl{at} and stole only among neighbours). Blumofe and Leiserson also prove worst case space and time requirements for well-structured computations. 
72 49 73 Many variations of this algorithm have been proposed over the years ~\cite{DBLP:journals/ijpp/YangH18}, both optimizations of existing implementations and approaches that account for new metrics.50 Many variations of this algorithm have been proposed over the years\cite{DBLP:journals/ijpp/YangH18}, both optmizations of existing implementations and approaches that account for new metrics. 74 51 75 \paragraph{Granularity} A significant portion of early work-stealing research concentrated on \newterm{Implicit Parallelism}~\cite{wiki:implicitpar}. 76 Since the system is responsible for splitting the work, granularity is a challenge that cannot be left to programmers, as opposed to \newterm{Explicit Parallelism}\cite{wiki:explicitpar} where the burden can be left to programmers. 77 In general, fine granularity is better for load balancing and coarse granularity reduces communication overhead. 78 The best performance generally means finding a middle ground between the two. 79 Several methods can be employed, but I believe these are less relevant for threads, which are generally explicit and more coarse grained. 52 \paragraph{Granularity} A significant portion of early Work Stealing research was concentrating on \newterm{Implicit Parellelism}\cite{wiki:implicitpar}. Since the system was responsible to split the work, granularity is a challenge that cannot be left to the programmers (as opposed to \newterm{Explicit Parellelism}\cite{wiki:explicitpar} where the burden can be left to programmers). In general, fine granularity is better for load balancing and coarse granularity reduces communication overhead. The best performance generally means finding a middle ground between the two. Several methods can be employed, but I believe these are less relevant for threads, which are generally explicit and more coarse grained. 
80 53 81 \paragraph{Task Placement} Since modern computers rely heavily on cache hierarchies\cit{Do I need a citation for this}, migrating \ atsfrom one core to another can be . \cite{DBLP:journals/tpds/SquillanteL93}54 \paragraph{Task Placement} Since modern computers rely heavily on cache hierarchies\cit{Do I need a citation for this}, migrating \glspl{at} from one core to another can be . \cite{DBLP:journals/tpds/SquillanteL93} 82 55 83 56 \todo{The survey is not great on this subject} 84 57 85 \paragraph{Complex Machine Architecture} Another aspect that has been examined is how well work stealing is applicable to different machine architectures.58 \paragraph{Complex Machine Architecture} Another aspect that has been looked at is how well Work Stealing is applicable to different machine architectures. 86 59 87 60 \subsection{Theoretical Results} 88 There is also a large body of research on the theoretical aspects of work stealing. These evaluate, for example, the cost of migration~\cite{DBLP:conf/sigmetrics/SquillanteN91,DBLP:journals/pe/EagerLZ86}, how affinity affects performance~\cite{DBLP:journals/tpds/SquillanteL93,DBLP:journals/mst/AcarBB02,DBLP:journals/ipl/SuksompongLS16} and theoretical models for heterogeneous systems~\cite{DBLP:journals/jpdc/MirchandaneyTS90,DBLP:journals/mst/BenderR02,DBLP:conf/sigmetrics/GastG10}. 89 \cite{DBLP:journals/jacm/BlellochGM99} examines the space bounds of work stealing and \cite{DBLP:journals/siamcomp/BerenbrinkFG03} shows that for under-loaded systems, the scheduler completes its computations in finite time, \ie is \newterm{stable}. 90 Others show that work stealing is applicable to various scheduling contexts~\cite{DBLP:journals/mst/AroraBP01,DBLP:journals/anor/TchiboukdjianGT13,DBLP:conf/isaac/TchiboukdjianGTRB10,DBLP:conf/ppopp/AgrawalLS10,DBLP:conf/spaa/AgrawalFLSSU14}. 91 \cite{DBLP:conf/ipps/ColeR13} also studied how randomized work-stealing affects false sharing among \ats. 
61 There is also a large body of research on the theoretical aspects of work stealing. These evaluate, for example, the cost of migration\cite{DBLP:conf/sigmetrics/SquillanteN91,DBLP:journals/pe/EagerLZ86}, how affinity affects performance\cite{DBLP:journals/tpds/SquillanteL93,DBLP:journals/mst/AcarBB02,DBLP:journals/ipl/SuksompongLS16} and theoretical models for heterogenous systems\cite{DBLP:journals/jpdc/MirchandaneyTS90,DBLP:journals/mst/BenderR02,DBLP:conf/sigmetrics/GastG10}. \cite{DBLP:journals/jacm/BlellochGM99} examine the space bounds of Work Stealing and \cite{DBLP:journals/siamcomp/BerenbrinkFG03} show that for underloaded systems, the scheduler will complete computations in finite time, \ie is \newterm{stable}. Others show that Work-Stealing is applicable to various scheduling contexts\cite{DBLP:journals/mst/AroraBP01,DBLP:journals/anor/TchiboukdjianGT13,DBLP:conf/isaac/TchiboukdjianGTRB10,DBLP:conf/ppopp/AgrawalLS10,DBLP:conf/spaa/AgrawalFLSSU14}. \cite{DBLP:conf/ipps/ColeR13} also studied how Randomized Work Stealing affects false sharing among \glspl{at}. 92 62 93 However, as \cite{DBLP:journals/ijpp/YangH18} highlights, it is worth mentioning that this theoretical research has mainly focused on ``fully-strict'' computations, \ie workloads that can be fully represented with a direct acyclic graph. 94 It is unclear how well these distributions represent workloads in real world scenarios. 63 However, as \cite{DBLP:journals/ijpp/YangH18} highlights, it is worth mentionning that this theoretical research has mainly focused on ``fully-strict'' computations, \ie workloads that can be fully represented with a Direct Acyclic Graph. It is unclear how well these distributions represent workloads in real world scenarios. 95 64 96 65 \section{Preemption} 97 One last aspect of scheduling is preemption since many schedulers rely on it for some of their guarantees. 
98 Preemption is the idea of interrupting \ats that have been running too long, effectively injecting suspend points into the application. 99 There are multiple techniques to achieve this effect but they all aim to guarantee that the suspend points in a \ats are never further apart than some fixed duration. 100 While this helps schedulers guarantee that no \ats unfairly monopolizes a worker, preemption can effectively be added to any scheduler. 101 Therefore, the only interesting aspect of preemption for the design of scheduling is whether or not to require it. 66 One last aspect of scheduling worth mentionning is preemption since many schedulers rely on it for some of their guarantees. Preemption is the idea of interrupting \glspl{at} that have been running for too long, effectively injecting suspend points in the applications. There are multiple techniques to achieve this but they all aim to have the effect of guaranteeing that suspend points in a \gls{at} are never further apart than some fixed duration. While this helps schedulers guarantee that no \glspl{at} will unfairly monopolize a worker, preemption can effectively added to any scheduler. Therefore, the only interesting aspect of preemption for the design of scheduling is whether or not to require it. 102 67 103 \section{Production Schedulers}\label{existing:prod} 104 This section presents a quick overview of several current schedulers. 105 While these schedulers do not necessarily represent the most recent advances in scheduling, they are what is generally accessible to programmers. 106 As such, I believe these schedulers are at least as relevant as those presented in published work. 107 Schedulers that operate in kernel space and in user space are considered, as both can offer relevant insight for this project. 108 However, real-time schedulers are not considered, as these have constraints that are much stricter than what is needed for this project. 
68 \section{Schedulers in Production}\label{existing:prod} 69 This section will show a quick overview of several schedulers which are generally available a the time of writing. While these schedulers don't necessarily represent to most recent advances in scheduling, they are what is generally accessible to programmers. As such, I believe that these schedulers are at least as relevant as those presented in published work. I chose both schedulers that operating in kernel space and in user space, as both can offer relevant insight for this project. However, I did not list any schedulers aimed for real-time applications, as these have constraints that are much stricter than what is needed for this project. 109 70 110 71 \subsection{Operating System Schedulers} 111 Operating System Schedulers tend to be fairly complex as they generally support some amount of real-time, aim to balance interactive and non-interactive \ats and support multiple users sharing hardware without requiring these users to cooperate. 112 Here are more details on a few schedulers used in the common operating systems: Linux, FreeBSD, Microsoft Windows and Apple's OS X. 113 The information is less complete for operating systems with closed source. 72 Operating System Schedulers tend to be fairly complex schedulers, they generally support some amount of real-time, aim to balance interactive and non-interactive \glspl{at} and support for multiple users sharing hardware without requiring these users to cooperate. Here are more details on a few schedulers used in the common operating systems: Linux, FreeBsd, Microsoft Windows and Apple's OS X. The information is less complete for operating systems behind closed source. 114 73 115 74 \paragraph{Linux's CFS} 116 The default scheduler used by Linux, the Completely Fair Scheduler~\cite{MAN:linux/cfs,MAN:linux/cfs2}, is a feedback scheduler based on CPU time. 
117 For each processor, it constructs a Red-Black tree of \ats waiting to run, ordering them by the amount of CPU time used. 118 The \ats that has used the least CPU time is scheduled. 119 It also supports the concept of \newterm{Nice values}, which are effectively multiplicative factors on the CPU time used. 120 The ordering of \ats is also affected by a group based notion of fairness, where \ats belonging to groups having used less CPU time are preferred to \ats belonging to groups having used more CPU time. 121 Linux achieves load-balancing by regularly monitoring the system state~\cite{MAN:linux/cfs/balancing} and using some heuristic on the load, currently CPU time used in the last millisecond plus a decayed version of the previous time slots~\cite{MAN:linux/cfs/pelt}. 75 The default scheduler used by Linux (the Completely Fair Scheduler)\cite{MAN:linux/cfs,MAN:linux/cfs2} is a feedback scheduler based on CPU time. For each processor, it constructs a Red-Black tree of \glspl{at} waiting to run, ordering them by amount of CPU time spent. The scheduler schedules the \gls{at} that has spent the least CPU time. It also supports the concept of \newterm{Nice values}, which are effectively multiplicative factors on the CPU time spent. The ordering of \glspl{at} is also impacted by a group based notion of fairness, where \glspl{at} belonging to groups having spent less CPU time are preferred to \glspl{at} beloning to groups having spent more CPU time. Linux achieves load-balancing by regularly monitoring the system state\cite{MAN:linux/cfs/balancing} and using some heuristic on the load (currently CPU time spent in the last millisecond plus decayed version of the previous time slots\cite{MAN:linux/cfs/pelt}.). 122 76 123 \cite{DBLP:conf/eurosys/LoziLFGQF16} shows that Linux's CFS also does work stealing to balance the workload of each processors, but the paper argues this aspect can be improved significantly. 
124 The issues highlighted stem from Linux's need to support fairness across \ats \emph{and} across users\footnote{Enforcing fairness across users means that given two users, one with a single \ats and the other with one thousand \ats, the user with a single \ats does not receive one thousandth of the CPU time.}, increasing the complexity. 77 \cite{DBLP:conf/eurosys/LoziLFGQF16} shows that Linux's CFS also does work-stealing to balance the workload of each processors, but the paper argues this aspect can be improved significantly. The issues highlighted sem to stem from Linux's need to support fairness across \glspl{at} \emph{and} across users\footnote{Enforcing fairness across users means, for example, that given two users: one with a single \gls{at} and the other with one thousand \glspl{at}, the user with a single \gls{at} does not receive one one thousandth of the CPU time.}, increasing the complexity. 125 78 126 Linux also offers a FIFO scheduler, a real-time scheduler , which runs the highest-priority \ats, and a round-robin scheduler, which is an extension of the FIFO-scheduler that adds fixed time slices. \cite{MAN:linux/sched}79 Linux also offers a FIFO scheduler, a real-time schedulerwhich runs the highest-priority \gls{at}, and a round-robin scheduler, which is an extension of the fifo-scheduler that adds fixed time slices. \cite{MAN:linux/sched} 127 80 128 81 \paragraph{FreeBSD} 129 The ULE scheduler used in FreeBSD\cite{DBLP:conf/bsdcon/Roberson03} is a feedback scheduler similar to Linux's CFS. 130 It uses different data structures and heuristics but also schedules according to some combination of CPU time used and niceness values. 131 It also periodically balances the load of the system (according to a different heuristic), but uses a simpler work stealing approach. 82 The ULE scheduler used in FreeBSD\cite{DBLP:conf/bsdcon/Roberson03} is a feedback scheduler similar to Linux's CFS. 
It uses different data structures and heuristics but also schedules according to some combination of CPU time spent and niceness values. It also periodically balances the load of the system(according to a different heuristic), but uses a simpler Work Stealing approach. 132 83 133 84 \paragraph{Windows(OS)} 134 Microsoft's Operating System's Scheduler~\cite{MAN:windows/scheduler} is a feedback scheduler with priorities. 135 It supports 32 levels of priorities, some of which are reserved for real-time and privileged applications. 136 It schedules \ats based on the highest priorities (lowest number) and how much CPU time each \ats has used. 137 The scheduler may also temporarily adjust priorities after certain effects like the completion of I/O requests. 85 Microsoft's Operating System's Scheduler\cite{MAN:windows/scheduler} is a feedback scheduler with priorities. It supports 32 levels of priorities, some of which are reserved for real-time and prviliged applications. It schedules \glspl{at} based on the highest priorities (lowest number) and how much cpu time each \glspl{at} have used. The scheduler may also temporarily adjust priorities after certain effects like the completion of I/O requests. 138 86 139 87 \todo{load balancing} … … 152 100 153 101 \subsection{User-Level Schedulers} 154 By comparison, user level schedulers tend to be simpler, gathering fewer metrics and avoid complex notions of fairness. Part of the simplicity is due to the fact that all \ats have the same user, and therefore cooperation is both feasible and probable. 155 156 \paragraph{Go}\label{GoSafePoint} 157 Go's scheduler uses a randomized work-stealing algorithm that has a global run-queue (\emph{GRQ}) and each processor (\emph{P}) has both a fixed-size run-queue (\emph{LRQ}) and a high-priority next ``chair'' holding a single element~\cite{GITHUB:go,YTUBE:go}. 
158 Preemption is present, but only at safe-points,~\cit{https://go.dev/src/runtime/preempt.go} which are inserted detection code at various frequent access boundaries. 102 By comparison, user level schedulers tend to be simpler, gathering fewer metrics and avoid complex notions of fairness. Part of the simplicity is due to the fact that all \glspl{at} have the same user, and therefore cooperation is both feasible and probable. 103 \paragraph{Go} 104 Go's scheduler uses a Randomized Work Stealing algorithm that has a global runqueue(\emph{GRQ}) and each processor(\emph{P}) has both a fixed-size runqueue(\emph{LRQ}) and a high-priority next ``chair'' holding a single element.\cite{GITHUB:go,YTUBE:go} Preemption is present, but only at function call boundaries. 159 105 160 106 The algorithm is as follows : 161 107 \begin{enumerate} 162 \item Once out of 61 times, pick 1 element from the \emph{GRQ}.108 \item Once out of 61 times, directly pick 1 element from the \emph{GRQ}. 163 109 \item If there is an item in the ``chair'' pick it. 164 110 \item Else pick an item from the \emph{LRQ}. 165 \begin{itemize} 166 \item If it is empty steal (len(\emph{GRQ}) / \#of\emph{P}) + 1 items (max 256) from the \emph{GRQ} 167 \item and steal \emph{half} the \emph{LRQ} of another \emph{P} chosen randomly. 168 \end{itemize} 111 \item If it was empty steal (len(\emph{GRQ}) / \#of\emph{P}) + 1 items (max 256) from the \emph{GRQ}. 112 \item If it was empty steal \emph{half} the \emph{LRQ} of another \emph{P} chosen randomly. 169 113 \end{enumerate} 170 114 171 115 \paragraph{Erlang} 172 Erlang is a functional language that supports concurrency in the form of processes: threads that share no data. 173 It uses a kind of round-robin scheduler, with a mix of work sharing and stealing to achieve load balancing~\cite{:erlang}, where under-loaded workers steal from other workers, but overloaded workers also push work to other workers. 
174 This migration logic is directed by monitoring logic that evaluates the load a few times per seconds. 116 Erlang is a functionnal language that supports concurrency in the form of processes, threads that share no data. It seems to be some kind of Round-Robin Scheduler. It currently uses some mix of Work Sharing and Work Stealing to achieve load balancing\cite{:erlang}, where underloaded workers steal from other workers, but overloaded workers also push work to other workers. This migration logic seems to be directed by monitoring logic that evaluates the load a few times per seconds. 175 117 176 118 \paragraph{Intel\textregistered ~Threading Building Blocks} 177 \newterm{Thread Building Blocks} (TBB) is Intel's task parallelism \cite{wiki:taskparallel} framework. 178 It runs \newterm{jobs}, which are uninterruptable \ats that must always run to completion, on a pool of worker threads. 179 TBB's scheduler is a variation of randomized work-stealing that also supports higher-priority graph-like dependencies~\cite{MAN:tbb/scheduler}. 180 It schedules \ats as follows (where \textit{t} is the last \ats completed): 119 \newterm{Thread Building Blocks}(TBB) is Intel's task parellelism\cite{wiki:taskparallel} framework. It runs \newterm{jobs}, uninterruptable \glspl{at}, schedulable objects that must always run to completion, on a pool of worker threads. TBB's scheduler is a variation of Randomized Work Stealing that also supports higher-priority graph-like dependencies\cite{MAN:tbb/scheduler}. It schedules \glspl{at} as follows (where \textit{t} is the last \gls{at} completed): 181 120 \begin{displayquote} 182 121 \begin{enumerate} 183 \item The task returned by \textit{t} @.execute()@122 \item The task returned by \textit{t}\texttt{.execute()} 184 123 \item The successor of t if \textit{t} was its last completed predecessor. 185 \item A task popped from the end of the thread 's own deque.124 \item A task popped from the end of the thread’s own deque. 
186 125 \item A task with affinity for the thread. 187 126 \item A task popped from approximately the beginning of the shared queue. 188 \item A task popped from the beginning of another randomly chosen thread 's deque.127 \item A task popped from the beginning of another randomly chosen thread’s deque. 189 128 \end{enumerate} 190 129 … … 195 134 196 135 \paragraph{Quasar/Project Loom} 197 Java has two projects, Quasar~\cite{MAN:quasar} and Project Loom~\cite{MAN:project-loom}\footnote{It is unclear if these are distinct projects.}, that are attempting to introduce lightweight thread\-ing in the form of Fibers. 198 Both projects seem to be based on the @ForkJoinPool@ in Java, which appears to be a simple incarnation of randomized work-stealing~\cite{MAN:java/fork-join}. 136 Java has two projects that are attempting to introduce lightweight threading into java in the form of Fibers, Quasar\cite{MAN:quasar} and Project Loom\cite{MAN:project-loom}\footnote{It is unclear to me if these are distinct projects or not}. Both projects seem to be based on the \texttt{ForkJoinPool} in Java which appears to be a simple incarnation of Randomized Work Stealing\cite{MAN:java/fork-join}. 199 137 200 138 \paragraph{Grand Central Dispatch} 201 An Apple\cit{Official GCD source} API that offers task parallelism~\cite{wiki:taskparallel}. 202 Its distinctive aspect is multiple ``Dispatch Queues'', some of which are created by programmers. 203 Each queue has its own local ordering guarantees, \eg \ats on queue $A$ are executed in \emph{FIFO} order. 139 This is an API produce by Apple\cit{Official GCD source} that offers task parellelism\cite{wiki:taskparallel}. Its distinctive aspect is that it uses multiple ``Dispatch Queues'', some of which are created by programmers. These queues each have their own local ordering guarantees, \eg \glspl{at} on queue $A$ are executed in \emph{FIFO} order. 
204 140 205 141 \todo{load balancing and scheduling} … … 207 143 % http://web.archive.org/web/20090920043909/http://images.apple.com/macosx/technology/docs/GrandCentral_TB_brief_20090903.pdf 208 144 209 In terms of semantics, the Dispatch Queues seem to be very similar to Intel\textregistered ~TBB @execute()@ and predecessor semantics.145 In terms of semantics, the Dispatch Queues seem to be very similar in semantics to Intel\textregistered ~TBB \texttt{execute()} and predecessor semantics. Where it would be possible to convert from one to the other. 210 146 211 147 \paragraph{LibFibre} 212 LibFibre~\cite{DBLP:journals/pomacs/KarstenB20} is a light-weight user-level threading framework developed at the University of Waterloo. 213 Similarly to Go, it uses a variation of work stealing with a global queue that is higher priority than stealing. 214 Unlike Go, it does not have the high-priority next ``chair'' and does not use randomized work-stealing. 148 LibFibre\cite{DBLP:journals/pomacs/KarstenB20} is a light-weight user-level threading framework developt at the University of Waterloo. Similarly to Go, it uses a variation of Work Stealing with a global queue that is higher priority than stealing. Unlock Go it does not have the high-priority next ``chair'' and does not use Randomized Work Stealing. 149 -
doc/theses/thierry_delisle_PhD/thesis/text/intro.tex
rffec1bf r9e23b446 1 \chapter {Introduction}\label{intro}2 \section{\CFA programming language}1 \chapter*{Introduction}\label{intro} 2 \todo{A proper intro} 3 3 4 The \CFA programming language~\cite{cfa:frontpage,cfa:typesystem} extends the C programming language by adding modern safety and productivity features, while maintaining backwards compatibility. 5 Among its productivity features, \CFA supports user-level threading~\cite{Delisle21} allowing programmers to write modern concurrent and parallel programs. 4 The C programming language~\cite{C11} 5 6 The \CFA programming language~\cite{cfa:frontpage,cfa:typesystem} extends the C programming language by adding modern safety and productivity features, while maintaining backwards compatibility. Among its productivity features, \CFA supports user-level threading~\cite{Delisle21} allowing programmers to write modern concurrent and parallel programs. 6 7 My previous master's thesis on concurrency in \CFA focused on features and interfaces. 7 This Ph.D.\ thesis focuses on performance, introducing \glsxtrshort{api} changes only when required by performance considerations. 8 Specifically, this work concentrates on scheduling and \glsxtrshort{io}. 9 Prior to this work, the \CFA runtime used a strict \glsxtrshort{fifo} \gls{rQ} and no \glsxtrshort{io} capabilities at the user-thread level\footnote{C supports \glsxtrshort{io} capabilities at the kernel level, which means blocking operations block kernel threads where blocking user-level threads would be more appropriate for \CFA.}. 8 This Ph.D.\ thesis focuses on performance, introducing \glsxtrshort{api} changes only when required by performance considerations. Specifically, this work concentrates on scheduling and \glsxtrshort{io}. Prior to this work, the \CFA runtime used a strict \glsxtrshort{fifo} \gls{rQ} and no non-blocking I/O capabilities at the user-thread level. 
10 9 11 As a research project, this work builds exclusively on newer versions of the Linux operating-system and gcc/clang compilers. 12 While \CFA is released, supporting older versions of Linux ($<$~Ubuntu 16.04) and gcc/clang compilers ($<$~gcc 6.0) is not a goal of this work. 13 14 \section{Scheduling} 15 Computer systems share multiple resources across many threads of execution, even on single user computers like laptops or smartphones. 16 On a computer system with multiple processors and work units, there exists the problem of mapping work onto processors in an efficient manner, called \newterm{scheduling}. 17 These systems are normally \newterm{open}, meaning new work arrives from an external source or is spawned from an existing work unit. 18 On a computer system, the scheduler takes a sequence of work requests in the form of threads and attempts to complete the work, subject to performance objectives, such as resource utilization. 19 A general-purpose dynamic-scheduler for an open system cannot anticipate future work requests, so its performance is rarely optimal. 20 With complete knowledge of arrival order and work, creating an optimal solution still effectively needs solving the bin packing problem\cite{wiki:binpak}. 21 However, optimal solutions are often not required. 22 Schedulers do produce excellent solutions, without needing optimality, by taking advantage of regularities in work patterns. 23 24 Scheduling occurs at discrete points when there are transitions in a system. 25 For example, a thread cycles through the following transitions during its execution. 
26 \begin{center} 27 \input{executionStates.pstex_t} 28 \end{center} 29 These \newterm{state transition}s are initiated in response to events (\Index{interrupt}s): 30 \begin{itemize} 31 \item 32 entering the system (new $\rightarrow$ ready) 33 \item 34 timer alarm for preemption (running $\rightarrow$ ready) 35 \item 36 long term delay versus spinning (running $\rightarrow$ blocked) 37 \item 38 blocking ends, \ie network or I/O completion (blocked $\rightarrow$ ready) 39 \item 40 normal completion or error, \ie segment fault (running $\rightarrow$ halted) 41 \item 42 scheduler assigns a thread to a resource (ready $\rightarrow$ running) 43 \end{itemize} 44 Key to scheduling is that a thread cannot bypass the ``ready'' state during a transition so the scheduler maintains complete control of the system. 45 46 When the workload exceeds the capacity of the processors, \ie work cannot be executed immediately, it is placed on a queue for subsequent service, called a \newterm{ready queue}. 47 Ready queues organize threads for scheduling, which indirectly organizes the work to be performed. 48 The structure of ready queues can take many different forms. 49 Where simple examples include single-queue multi-server (SQMS) and the multi-queue multi-server (MQMS). 50 \begin{center} 51 \begin{tabular}{l|l} 52 \multicolumn{1}{c|}{\textbf{SQMS}} & \multicolumn{1}{c}{\textbf{MQMS}} \\ 53 \hline 54 \raisebox{0.5\totalheight}{\input{SQMS.pstex_t}} & \input{MQMSG.pstex_t} 55 \end{tabular} 56 \end{center} 57 Beyond these two schedulers are a host of options, \ie adding an optional global, shared queue to MQMS. 58 59 The three major optimization criteria for a scheduler are: 60 \begin{enumerate}[leftmargin=*] 61 \item 62 \newterm{load balancing}: available work is distributed so no processor is idle when work is available. 63 64 \noindent 65 Eventual progress for each work unit is often an important consideration, \ie no starvation. 
66 \item 67 \newterm{affinity}: processors access state through a complex memory hierarchy, so it is advantageous to keep a work unit's state on a single or closely bound set of processors. 68 69 \noindent 70 Essentially, all multi-processor computers have non-uniform memory access (NUMA), with one or more quantized steps to access data at different levels in the memory hierarchy. 71 When a system has a large number of independently executing threads, affinity becomes difficult because of \newterm{thread churn}. 72 That is, threads must be scheduled on multiple processors to obtain high processor utilization because the number of threads $\ggg$ processors. 73 74 \item 75 \newterm{contention}: safe access of shared objects by multiple processors requires mutual exclusion in some form, generally locking\footnote{ 76 Lock-free data-structures do not involve locking but incur similar costs to achieve mutual exclusion.} 77 78 \noindent 79 Mutual exclusion cost and latency increase significantly with the number of processors accessing a shared object. 80 \end{enumerate} 81 82 Nevertheless, schedulers are a series of compromises, occasionally with some static or dynamic tuning parameters to enhance specific patterns. 83 Scheduling is a zero-sum game as computer processors normally have a fixed, maximum number of cycles per unit time\footnote{Frequency scaling and turbo boost add a degree of complexity that can be ignored in this discussion without loss of generality.}. 84 SQMS has perfect load-balancing but poor affinity and high contention by the processors, because of the single queue. 85 MQMS has poor load-balancing but perfect affinity and no contention, because each processor has its own queue. 86 87 Significant research effort has also looked at load sharing/stealing among queues, when a ready queue is too long or short, respectively. 88 These approaches attempt to perform better load-balancing at the cost of affinity and contention. 
89 Load sharing/stealing schedulers attempt to push/pull work units to/from other ready queues 90 91 Note however that while any change comes at a cost, hence the zero-sum game, not all compromises are necessarily equivalent. 92 Some schedulers can perform very well only in very specific workload scenarios, others might offer acceptable performance but be applicable to a wider range of workloads. 93 Since \CFA attempts to improve the safety and productivity of C, the scheduler presented in this thesis attempts to achieve the same goals. 94 More specifically, safety and productivity for scheduling means supporting a wide range of workloads so that programmers can rely on progress guarantees (safety) and more easily achieve acceptable performance (productivity). 95 96 97 \section{Contributions}\label{s:Contributions} 98 This work provides the following contributions in the area of user-level scheduling in an advanced programming-language runtime-system: 99 \begin{enumerate}[leftmargin=*] 100 \item 101 A scalable scheduling algorithm that offers progress guarantees. 102 \item 103 An algorithm for load-balancing and idle sleep of processors, including NUMA awareness. 104 \item 105 Support for user-level \glsxtrshort{io} capabilities based on Linux's @io_uring@. 106 \end{enumerate} 10 As a research project, this work builds exclusively on newer versions of the Linux operating-system and gcc/clang compilers. While \CFA is released, supporting older versions of Linux ($<$~Ubuntu 16.04) and gcc/clang compilers ($<$~gcc 6.0) is not a goal of this work. -
doc/theses/thierry_delisle_PhD/thesis/text/io.tex
rffec1bf r9e23b446 1 1 \chapter{User Level \io} 2 As mentioned in Section~\ref{prev:io}, user-level \io requires multiplexing the \io operations of many \glspl{thrd} onto fewer \glspl{proc} using asynchronous \io operations.2 As mentioned in Section~\ref{prev:io}, User-Level \io requires multiplexing the \io operations of many \glspl{thrd} onto fewer \glspl{proc} using asynchronous \io operations. 3 3 Different operating systems offer various forms of asynchronous operations and, as mentioned in Chapter~\ref{intro}, this work is exclusively focused on the Linux operating-system. 4 4 5 5 \section{Kernel Interface} 6 Since this work fundamentally depends on operating-system support, the first step of this design is to discuss the available interfaces and pick one (or more) as the foundation for the non-blocking \io subsystem in this work.6 Since this work fundamentally depends on operating-system support, the first step of any design is to discuss the available interfaces and pick one (or more) as the foundations of the non-blocking \io subsystem. 7 7 8 8 \subsection{\lstinline{O_NONBLOCK}} … … 10 10 In this mode, ``Neither the @open()@ nor any subsequent \io operations on the [opened file descriptor] will cause the calling process to wait''~\cite{MAN:open}. 11 11 This feature can be used as the foundation for the non-blocking \io subsystem. 
12 However, for the subsystem to know when an \io operation completes, @O_NONBLOCK@ must be used in conjunction with a system call that monitors when a file descriptor becomes ready, \ie, the next \io operation on it does not cause the process to wait.\footnote{13 In this context, ready means \emph{some} operation can be performed without blocking.12 However, for the subsystem to know when an \io operation completes, @O_NONBLOCK@ must be used in conjunction with a system call that monitors when a file descriptor becomes ready, \ie, the next \io operation on it does not cause the process to wait 13 \footnote{In this context, ready means \emph{some} operation can be performed without blocking. 14 14 It does not mean an operation returning \lstinline{EAGAIN} succeeds on the next try. 15 For example, a ready read may only return a subset of requested bytes and the read must be issued again for the remaining bytes, at which point it may return \lstinline{EAGAIN}.}15 For example, a ready read may only return a subset of bytes and the read must be issued again for the remaining bytes, at which point it may return \lstinline{EAGAIN}.}. 16 16 This mechanism is also crucial in determining when all \glspl{thrd} are blocked and the application \glspl{kthrd} can now block. 17 17 18 There are three options to monitor file descriptors in Linux :\footnote{19 For simplicity, this section omits \lstinline{pselect} and \lstinline{ppoll}.18 There are three options to monitor file descriptors in Linux 19 \footnote{For simplicity, this section omits \lstinline{pselect} and \lstinline{ppoll}. 20 20 The difference between these system calls and \lstinline{select} and \lstinline{poll}, respectively, is not relevant for this discussion.}, 21 21 @select@~\cite{MAN:select}, @poll@~\cite{MAN:poll} and @epoll@~\cite{MAN:epoll}. 22 22 All three of these options offer a system call that blocks a \gls{kthrd} until at least one of many file descriptors becomes ready. 
23 The group of file descriptors being waited on is called the \newterm{interest set}. 24 25 \paragraph{\lstinline{select}} is the oldest of these options, and takes as input a contiguous array of bits, where each bit represents a file descriptor of interest. 26 Hence, the array length must be as long as the largest FD currently of interest. 27 On return, it outputs the set in place to identify which of the file descriptors changed state. 28 This destructive change means selecting in a loop requires re-initializing the array for each iteration. 29 Another limit of @select@ is that calls from different \glspl{kthrd} sharing FDs are independent. 30 Hence, if one \gls{kthrd} is managing the select calls, other threads can only add/remove to/from the manager's interest set through synchronized calls to update the interest set. 31 However, these changes are only reflected when the manager makes its next call to @select@. 32 Note, it is possible for the manager thread to never unblock if its current interest set never changes, \eg the sockets/pipes/ttys it is waiting on never get data again. 33 Often the I/O manager has a timeout, polls, or is sent a signal on changes to mitigate this problem. 
34 35 \begin{comment} 36 From: Tim Brecht <brecht@uwaterloo.ca> 37 Subject: Re: FD sets 38 Date: Wed, 6 Jul 2022 00:29:41 +0000 39 40 Large number of open files 41 -------------------------- 42 43 In order to be able to use more than the default number of open file 44 descriptors you may need to: 45 46 o increase the limit on the total number of open files /proc/sys/fs/file-max 47 (on Linux systems) 48 49 o increase the size of FD_SETSIZE 50 - the way I often do this is to figure out which include file __FD_SETSIZE 51 is defined in, copy that file into an appropriate directory in ./include, 52 and then modify it so that if you use -DBIGGER_FD_SETSIZE the larger size 53 gets used 54 55 For example on a RH 9.0 distribution I've copied 56 /usr/include/bits/typesizes.h into ./include/i386-linux/bits/typesizes.h 57 58 Then I modify typesizes.h to look something like: 59 60 #ifdef BIGGER_FD_SETSIZE 61 #define __FD_SETSIZE 32767 62 #else 63 #define __FD_SETSIZE 1024 64 #endif 65 66 Note that the since I'm moving and testing the userver on may different 67 machines the Makefiles are set up to use -I ./include/$(HOSTTYPE) 68 69 This way if you redefine the FD_SETSIZE it will get used instead of the 70 default original file. 71 \end{comment} 72 73 \paragraph{\lstinline{poll}} is the next oldest option, and takes as input an array of structures containing the FD numbers rather than their position in an array of bits, allowing a more compact input for interest sets that contain widely spaced FDs. 74 For small interest sets with densely packed FDs, the @select@ bit mask can take less storage, and hence, copy less information into the kernel. 75 Furthermore, @poll@ is non-destructive, so the array of structures does not have to be re-initialize on every call. 76 Like @select@, @poll@ suffers from the limitation that the interest set cannot be changed by other \gls{kthrd}, while a manager thread is blocked in @poll@. 
77 78 \paragraph{\lstinline{epoll}} follows after @poll@, and places the interest set in the kernel rather than the application, where it is managed by an internal \gls{kthrd}. 79 There are two separate functions: one to add to the interest set and another to check for FDs with state changes. 23 The group of file descriptors being waited is called the \newterm{interest set}. 24 25 \paragraph{\lstinline{select}} is the oldest of these options, it takes as an input a contiguous array of bits, where each bits represent a file descriptor of interest. 26 On return, it modifies the set in place to identify which of the file descriptors changed status. 27 This destructive change means that calling select in a loop requires re-initializing the array each time and the number of file descriptors supported has a hard limit. 28 Another limit of @select@ is that once the call is started, the interest set can no longer be modified. 29 Monitoring a new file descriptor generally requires aborting any in progress call to @select@ 30 \footnote{Starting a new call to \lstinline{select} is possible but requires a distinct kernel thread, and as a result is not an acceptable multiplexing solution when the interest set is large and highly dynamic unless the number of parallel calls to \lstinline{select} can be strictly bounded.}. 31 32 \paragraph{\lstinline{poll}} is an improvement over select, which removes the hard limit on the number of file descriptors and the need to re-initialize the input on every call. 33 It works using an array of structures as an input rather than an array of bits, thus allowing a more compact input for small interest sets. 34 Like @select@, @poll@ suffers from the limitation that the interest set cannot be changed while the call is blocked. 35 36 \paragraph{\lstinline{epoll}} further improves these two functions by allowing the interest set to be dynamically added to and removed from while a \gls{kthrd} is blocked on an @epoll@ call. 
80 37 This dynamic capability is accomplished by creating an \emph{epoll instance} with a persistent interest set, which is used across multiple calls. 81 As the interest set is augmented, the changes become implicitly part of the interest set for a blocked manager \gls{kthrd}. 82 This capability significantly reduces synchronization between \glspl{kthrd} and the manager calling @epoll@. 83 84 However, all three of these I/O systems have limitations. 38 This capability significantly reduces synchronization overhead on the part of the caller (in this case the \io subsystem), since the interest set can be modified when adding or removing file descriptors without having to synchronize with other \glspl{kthrd} potentially calling @epoll@. 39 40 However, all three of these system calls have limitations. 85 41 The @man@ page for @O_NONBLOCK@ mentions that ``[@O_NONBLOCK@] has no effect for regular files and block devices'', which means none of these three system calls are viable multiplexing strategies for these types of \io operations. 86 42 Furthermore, @epoll@ has been shown to have problems with pipes and ttys~\cit{Peter's examples in some fashion}. … … 97 53 It also supports batching multiple operations in a single system call. 98 54 99 AIO offers two different approach esto polling: @aio_error@ can be used as a spinning form of polling, returning @EINPROGRESS@ until the operation is completed, and @aio_suspend@ can be used similarly to @select@, @poll@ or @epoll@, to wait until one or more requests have completed.55 AIO offers two different approach to polling: @aio_error@ can be used as a spinning form of polling, returning @EINPROGRESS@ until the operation is completed, and @aio_suspend@ can be used similarly to @select@, @poll@ or @epoll@, to wait until one or more requests have completed. 100 56 For the purpose of \io multiplexing, @aio_suspend@ is the best interface. 
101 57 However, even if AIO requests can be submitted concurrently, @aio_suspend@ suffers from the same limitation as @select@ and @poll@, \ie, the interest set cannot be dynamically changed while a call to @aio_suspend@ is in progress. … … 114 70 115 71 \begin{flushright} 116 -- Linus Torvalds ~\cite{AIORant}72 -- Linus Torvalds\cit{https://lwn.net/Articles/671657/} 117 73 \end{flushright} 118 74 \end{displayquote} … … 129 85 A very recent addition to Linux, @io_uring@~\cite{MAN:io_uring}, is a framework that aims to solve many of the problems listed in the above interfaces. 130 86 Like AIO, it represents \io operations as entries added to a queue. 131 But like @epoll@, new requests can be submitted , while a blocking call waiting for requests to complete,is already in progress.87 But like @epoll@, new requests can be submitted while a blocking call waiting for requests to complete is already in progress. 132 88 The @io_uring@ interface uses two ring buffers (referred to simply as rings) at its core: a submit ring to which programmers push \io requests and a completion ring from which programmers poll for completion. 133 89 … … 141 97 In the worst case, where all \glspl{thrd} are consistently blocking on \io, it devolves into 1-to-1 threading. 142 98 However, regardless of the frequency of \io operations, it achieves the fundamental goal of not blocking \glspl{proc} when \glspl{thrd} are ready to run. 143 This approach is used by languages like Go\cit{Go} , frameworks like libuv\cit{libuv}, and web servers like Apache~\cite{apache} and Nginx~\cite{nginx}, since it has the advantage that it can easily be used across multiple operating systems.99 This approach is used by languages like Go\cit{Go} and frameworks like libuv\cit{libuv}, since it has the advantage that it can easily be used across multiple operating systems. 144 100 This advantage is especially relevant for languages like Go, which offer a homogeneous \glsxtrshort{api} across all platforms. 
145 101 As opposed to C, which has a very limited standard api for \io, \eg, the C standard library has no networking. … … 155 111 \section{Event-Engine} 156 112 An event engine's responsibility is to use the kernel interface to multiplex many \io operations onto few \glspl{kthrd}. 157 In concrete terms, this means \glspl{thrd} enter the engine through an interface, the event engine then starts anoperation and parks the calling \glspl{thrd}, returning control to the \gls{proc}.113 In concrete terms, this means \glspl{thrd} enter the engine through an interface, the event engines then starts the operation and parks the calling \glspl{thrd}, returning control to the \gls{proc}. 158 114 The parked \glspl{thrd} are then rescheduled by the event engine once the desired operation has completed. 159 115 … … 178 134 \begin{enumerate} 179 135 \item 180 An SQE is allocated from the pre-allocated array \emph{S}.136 An SQE is allocated from the pre-allocated array (denoted \emph{S} in Figure~\ref{fig:iouring}). 181 137 This array is created at the same time as the @io_uring@ instance, is in kernel-locked memory visible by both the kernel and the application, and has a fixed size determined at creation. 182 How these entries are allocated is not important for the functioning of @io_uring@; 183 the only requirement is that no entry is reused before the kernel has consumed it. 138 How these entries are allocated is not important for the functioning of @io_uring@, the only requirement is that no entry is reused before the kernel has consumed it. 184 139 \item 185 140 The SQE is filled according to the desired operation. 186 This step is straight forward. 187 The only detail worth mentioning is that SQEs have a @user_data@ field that must be filled in order to match submission and completion entries. 141 This step is straight forward, the only detail worth mentioning is that SQEs have a @user_data@ field that must be filled in order to match submission and completion entries. 
188 142 \item 189 143 The SQE is submitted to the submission ring by appending the index of the SQE to the ring following regular ring buffer steps: \lstinline{buffer[head] = item; head++}. 190 144 Since the head is visible to the kernel, some memory barriers may be required to prevent the compiler from reordering these operations. 191 145 Since the submission ring is a regular ring buffer, more than one SQE can be added at once and the head is updated only after all entries are updated. 192 Note, SQE can be filled and submitted in any order, \eg in Figure~\ref{fig:iouring} the submission order is S0, S3, S2 and S1 has not been submitted.193 146 \item 194 147 The kernel is notified of the change to the ring using the system call @io_uring_enter@. … … 208 161 The @io_uring_enter@ system call is protected by a lock inside the kernel. 209 162 This protection means that concurrent call to @io_uring_enter@ using the same instance are possible, but there is no performance gained from parallel calls to @io_uring_enter@. 210 It is possible to do the first three submission steps in parallel; 211 however, doing so requires careful synchronization. 163 It is possible to do the first three submission steps in parallel, however, doing so requires careful synchronization. 212 164 213 165 @io_uring@ also introduces constraints on the number of simultaneous operations that can be ``in flight''. 214 First, SQEs are allocated from a fixed-size array, meaning that there is a hard limit to how many SQEs can be submitted at once.215 Second, the @io_uring_enter@ system call can fail because ``The kernel [...] ran out of resources to handle [a request]'' or ``The application is attempting to overcommit the number of requests it can havepending.''.166 Obviously, SQEs are allocated from a fixed-size array, meaning that there is a hard limit to how many SQEs can be submitted at once. 167 In addition, the @io_uring_enter@ system call can fail because ``The kernel [...] 
ran out of resources to handle [a request]'' or ``The application is attempting to overcommit the number of requests it can have pending.''. 216 168 This restriction means \io request bursts may have to be subdivided and submitted in chunks at a later time. 217 169 218 170 \subsection{Multiplexing \io: Submission} 219 220 171 The submission side is the most complicated aspect of @io_uring@ and the completion side effectively follows from the design decisions made in the submission side. 221 While there is freedom in designing the submission side, there are some realities of @io_uring@ that must be taken into account. 222 It is possible to do the first steps of submission in parallel; 223 however, the duration of the system call scales with the number of entries submitted. 172 While it is possible to do the first steps of submission in parallel, the duration of the system call scales with number of entries submitted. 224 173 The consequence is that the amount of parallelism used to prepare submissions for the next system call is limited. 225 174 Beyond this limit, the length of the system call is the throughput limiting factor. 226 I concluded from early experiments that preparing submissions seems to take almost as long as the system call itself, which means that with a single @io_uring@ instance, there is no benefit in terms of \io throughput to having more than two \glspl{hthrd}. 227 Therefore, the design of the submission engine must manage multiple instances of @io_uring@ running in parallel, effectively sharding @io_uring@ instances. 228 Since completions are sent to the instance where requests were submitted, all instances with pending operations must be polled continuously\footnote{ 229 As described in Chapter~\ref{practice}, this does not translate into constant CPU usage.}. 
175 I concluded from early experiments that preparing submissions seems to take at most as long as the system call itself, which means that with a single @io_uring@ instance, there is no benefit in terms of \io throughput to having more than two \glspl{hthrd}. 176 Therefore the design of the submission engine must manage multiple instances of @io_uring@ running in parallel, effectively sharding @io_uring@ instances. 177 Similarly to scheduling, this sharding can be done privately, \ie, one instance per \glspl{proc}, in decoupled pools, \ie, a pool of \glspl{proc} use a pool of @io_uring@ instances without one-to-one coupling between any given instance and any given \gls{proc}, or some mix of the two. 178 Since completions are sent to the instance where requests were submitted, all instances with pending operations must be polled continuously 179 \footnote{As will be described in Chapter~\ref{practice}, this does not translate into constant CPU usage.}. 230 180 Note that once an operation completes, there is nothing that ties it to the @io_uring@ instance that handled it. 231 There is nothing preventing a new operation with, \eg the same file descriptors to a different @io_uring@ instance.181 There is nothing preventing a new operation with, for example, the same file descriptors to a different @io_uring@ instance. 232 182 233 183 A complicating aspect of submission is @io_uring@'s support for chains of operations, where the completion of an operation triggers the submission of the next operation on the link. 234 184 SQEs forming a chain must be allocated from the same instance and must be contiguous in the Submission Ring (see Figure~\ref{fig:iouring}). 235 The consequence of this feature is that filling SQEs can be arbitrarily complex, and therefore, users may need to run arbitrary code between allocation and submission. 236 Supporting chains is not a requirement of the \io subsystem, but it is still valuable. 
237 Support for this feature can be fulfilled simply by supporting arbitrary user code between allocation and submission. 238 239 Similar to scheduling, sharding @io_uring@ instances can be done privately, \ie, one instance per \glspl{proc}, in decoupled pools, \ie, a pool of \glspl{proc} use a pool of @io_uring@ instances without one-to-one coupling between any given instance and any given \gls{proc}, or some mix of the two. 240 These three sharding approaches are analyzed. 185 The consequence of this feature is that filling SQEs can be arbitrarily complex and therefore users may need to run arbitrary code between allocation and submission. 186 Supporting chains is not a requirement of the \io subsystem, but it is still valuable. 187 Support for this feature can be fulfilled simply by supporting arbitrary user code between allocation and submission. 188 189 \subsubsection{Public Instances} 190 One approach is to have multiple shared instances. 191 \Glspl{thrd} attempting \io operations pick one of the available instances and submit operations to that instance. 192 Since there is no coupling between \glspl{proc} and @io_uring@ instances in this approach, \glspl{thrd} running on more than one \gls{proc} can attempt to submit to the same instance concurrently. 193 Since @io_uring@ effectively sets the amount of sharding needed to avoid contention on its internal locks, performance in this approach is based on two aspects: the synchronization needed to submit does not induce more contention than @io_uring@ already does and the scheme to route \io requests to specific @io_uring@ instances does not introduce contention. 194 This second aspect has an oversized importance because it comes into play before the sharding of instances, and as such, all \glspl{hthrd} can contend on the routing algorithm. 195 196 Allocation in this scheme can be handled fairly easily. 
197 Free SQEs, \ie, SQEs that aren't currently being used to represent a request, can be written to safely and have a field called @user_data@ which the kernel only reads to copy to @cqe@s. 198 Allocation also requires no ordering guarantee as all free SQEs are interchangeable. 199 This requires a simple concurrent bag. 200 The only added complexity is that the number of SQEs is fixed, which means allocation can fail. 201 202 Allocation failures need to be pushed up to a routing algorithm: \glspl{thrd} attempting \io operations must not be directed to @io_uring@ instances without sufficient SQEs available. 203 Furthermore, the routing algorithm should block operations up-front if none of the instances have available SQEs. 204 205 Once an SQE is allocated, \glspl{thrd} can fill it normally; they simply need to keep track of the SQE index and which instance it belongs to. 206 207 Once an SQE is filled in, what needs to happen is that the SQE must be added to the submission ring buffer, an operation that is not thread-safe by itself, and the kernel must be notified using the @io_uring_enter@ system call. 208 The submission ring buffer is the same size as the pre-allocated SQE buffer, therefore pushing to the ring buffer cannot fail 209 \footnote{This is because it is invalid to have the same \lstinline{sqe} multiple times in the ring buffer.}. 210 However, as mentioned, the system call itself can fail with the expectation that it will be retried once some of the already submitted operations complete. 211 Since multiple SQEs can be submitted to the kernel at once, it is important to strike a balance between batching and latency. 212 Operations that are ready to be submitted should be batched together in few system calls, but at the same time, operations should not be left pending for long periods of time before being submitted. 
213 This can be handled by either designating one of the submitting \glspl{thrd} as being responsible for the system call for the current batch of SQEs or by having some other party regularly submitting all ready SQEs, \eg, the poller \gls{thrd} mentioned later in this section. 214 215 In the case of designating a \gls{thrd}, ideally, when multiple \glspl{thrd} attempt to submit operations to the same @io_uring@ instance, all requests would be batched together and one of the \glspl{thrd} would do the system call on behalf of the others, referred to as the \newterm{submitter}. 216 In practice however, it is important that the \io requests are not left pending indefinitely and as such, it may be required to have a ``next submitter'' that guarantees everything that is missed by the current submitter is seen by the next one. 217 Indeed, as long as there is a ``next'' submitter, \glspl{thrd} submitting new \io requests can move on, knowing that some future system call will include their request. 218 Once the system call is done, the submitter must also free SQEs so that the allocator can reuse them. 219 220 Finally, the completion side is much simpler since the @io_uring@ system call enforces a natural synchronization point. 221 Polling simply needs to regularly do the system call, go through the produced CQEs and communicate the result back to the originating \glspl{thrd}. 222 Since CQEs only own a signed 32 bit result, in addition to the copy of the @user_data@ field, all that is needed to communicate the result is a simple future~\cite{wiki:future}. 223 If the submission side does not designate submitters, polling can also submit all SQEs as it is polling events. 224 A simple approach to polling is to allocate a \gls{thrd} per @io_uring@ instance and simply let the poller \glspl{thrd} poll their respective instances when scheduled. 225 226 With this pool of instances approach, the big advantage is that it is fairly flexible. 
227 It does not impose restrictions on what \glspl{thrd} submitting \io operations can and cannot do between allocations and submissions. 228 It also can gracefully handle running out of resources, SQEs or the kernel returning @EBUSY@. 229 The downside to this is that many of the steps used for submitting need complex synchronization to work properly. 230 The routing and allocation algorithm needs to keep track of which ring instances have available SQEs, block incoming requests if no instance is available, prevent barging if \glspl{thrd} are already queued up waiting for SQEs and handle SQEs being freed. 231 The submission side needs to safely append SQEs to the ring buffer, correctly handle chains, make sure no SQE is dropped or left pending forever, notify the allocation side when SQEs can be reused and handle the kernel returning @EBUSY@. 232 All this synchronization may have a significant cost and, compared to the next approach presented, this synchronization is entirely overhead. 241 233 242 234 \subsubsection{Private Instances} 243 The private approach creates one ring instance per \gls{proc}, \ie one-to-one coupling. 244 This alleviates the need for synchronization on the submissions, requiring only that \glspl{thrd} are not time-sliced during submission steps. 245 This requirement is the same as accessing @thread_local@ variables, where a \gls{thrd} is accessing kernel-thread data, is time-sliced, and continues execution on another kernel thread but is now accessing the wrong data. 246 This failure is the serially reusable problem~\cite{SeriallyReusable}. 
247 Hence, allocated SQEs must be submitted to the same ring on the same \gls{proc}, which effectively forces the application to submit SQEs in allocation order.\footnote{ 248 To remove this requirement, a \gls{thrd} needs the ability to ``yield to a specific \gls{proc}'', \ie, park with the guarantee it unparks on a specific \gls{proc}, \ie the \gls{proc} attached to the correct ring.} 249 From the subsystem's point of view, the allocation and submission are sequential, greatly simplifying both. 250 In this design, allocation and submission form a partitioned ring buffer as shown in Figure~\ref{fig:pring}. 251 Once added to the ring buffer, the attached \gls{proc} has a significant amount of flexibility with regards to when to perform the system call. 252 Possible options are: when the \gls{proc} runs out of \glspl{thrd} to run, after running a given number of \glspl{thrd}, \etc. 235 Another approach is to simply create one ring instance per \gls{proc}. 236 This alleviates the need for synchronization on the submissions, requiring only that \glspl{thrd} are not interrupted in between two submission steps. 237 This is effectively the same requirement as using @thread_local@ variables. 238 Since SQEs that are allocated must be submitted to the same ring, on the same \gls{proc}, this effectively forces the application to submit SQEs in allocation order 239 \footnote{The actual requirement is that \glspl{thrd} cannot context switch between allocation and submission. 240 This requirement means that from the subsystem's point of view, the allocation and submission are sequential. 241 To remove this requirement, a \gls{thrd} would need the ability to ``yield to a specific \gls{proc}'', \ie, park with the promise that it will be run next on a specific \gls{proc}, the \gls{proc} attached to the correct ring.} 242 , greatly simplifying both allocation and submission. 
243 In this design, allocation and submission form a partitionned ring buffer as shown in Figure~\ref{fig:pring}. 244 Once added to the ring buffer, the attached \gls{proc} has a significant amount of flexibility with regards to when to do the system call. 245 Possible options are: when the \gls{proc} runs out of \glspl{thrd} to run, after running a given number of \glspl{thrd}, etc. 253 246 254 247 \begin{figure} … … 261 254 \end{figure} 262 255 263 This approach has the advantage that it does not require much of the synchronization needed in a shared approach. 264 However, this benefit means \glspl{thrd} submitting \io operations have less flexibility: they cannot park or yield, and several exceptional cases are handled poorly. 265 Instances running out of SQEs cannot run \glspl{thrd} wanting to do \io operations. 266 In this case, the \io \gls{thrd} needs to be moved to a different \gls{proc}, and the only current way of achieving this is to @yield()@ hoping to be scheduled on a different \gls{proc} with free SQEs, which is not guaranteed. 267 268 A more involved version of this approach tries to solve these problems using a pattern called \newterm{helping}. 269 \Glspl{thrd} that cannot submit \io operations, either because of an allocation failure or migration to a different \gls{proc} between allocation and submission, create an \io object and add it to a list of pending submissions per \gls{proc} and a list of pending allocations, probably per cluster. 270 While there is still the strong coupling between \glspl{proc} and @io_uring@ instances, these data structures allow moving \glspl{thrd} to a specific \gls{proc}, when the current \gls{proc} cannot fulfill the \io request. 271 272 Imagine a simple scenario with two \glspl{thrd} on two \glspl{proc}, where one \gls{thrd} submits an \io operation and then sets a flag, while the other \gls{thrd} spins until the flag is set. 
273 Assume both \glspl{thrd} are running on the same \gls{proc}, and the \io \gls{thrd} is preempted between allocation and submission, moved to the second \gls{proc}, and the original \gls{proc} starts running the spinning \gls{thrd}. 274 In this case, the helping solution has the \io \gls{thrd} append an \io object to the submission list of the first \gls{proc}, where the allocation was made. 256 This approach has the advantage that it does not require much of the synchronization needed in the shared approach. 257 This comes at the cost that \glspl{thrd} submitting \io operations have less flexibility, they cannot park or yield, and several exceptional cases are handled poorly. 258 Instances running out of SQEs cannot run \glspl{thrd} wanting to do \io operations, in such a case the \gls{thrd} needs to be moved to a different \gls{proc}, the only current way of achieving this would be to @yield()@ hoping to be scheduled on a different \gls{proc}, which is not guaranteed. 259 260 A more involved version of this approach can seem to solve most of these problems, using a pattern called \newterm{helping}. 261 \Glspl{thrd} that wish to submit \io operations but cannot do so 262 \footnote{either because of an allocation failure or because they were migrate to a different \gls{proc} between allocation and submission} 263 create an object representing what they wish to achieve and add it to a list somewhere. 264 For this particular problem, one solution would be to have a list of pending submissions per \gls{proc} and a list of pending allocations, probably per cluster. 265 The problem with these ``solutions'' is that they are still bound by the strong coupling between \glspl{proc} and @io_uring@ instances. 266 These data structures would allow moving \glspl{thrd} to a specific \gls{proc} when the current \gls{proc} cannot fulfill the \io request. 
267 268 Imagine a simple case with two \glspl{thrd} on two \glspl{proc}, one \gls{thrd} submits an \io operation and then sets a flag, the other \gls{thrd} spins until the flag is set. 269 If the first \gls{thrd} is preempted between allocation and submission and moves to the other \gls{proc}, the original \gls{proc} could start running the spinning \gls{thrd}. 270 If this happens, the helping ``solution'' is for the \io \gls{thrd} to append an item to the submission list of the \gls{proc} where the allocation was made. 275 271 No other \gls{proc} can help the \gls{thrd} since @io_uring@ instances are strongly coupled to \glspl{proc}. 276 However, the \io \gls{proc} is unable to help because it is executing the spinning \gls{thrd} resulting in a deadlock. 277 While this example is artificial, in the presence of many \glspl{thrd}, it is possible for this problem to arise ``in the wild''. 278 Furthermore, this pattern is difficult to reliably detect and avoid. 279 Once in this situation, the only escape is to interrupt the spinning \gls{thrd}, either directly or via some regular preemption, \eg time slicing. 280 Having to interrupt \glspl{thrd} for this purpose is costly, the latency can be large between interrupts, and the situation may be hard to detect. 272 However, in this case, the \gls{proc} is unable to help because it is executing the spinning \gls{thrd} mentioned when first expressing this case 273 \footnote{This particular example is completely artificial, but in the presence of many more \glspl{thrd}, it is not impossible that this problem would arise ``in the wild''. 274 Furthermore, this pattern is difficult to reliably detect and avoid.} 275 resulting in a deadlock. 276 Once in this situation, the only escape is to interrupt the execution of the \gls{thrd}, either directly or due to regular preemption, only then can the \gls{proc} take the time to handle the pending request to help. 
277 Interrupting \glspl{thrd} for this purpose is far from desireable, the cost is significant and the situation may be hard to detect. 278 However, a more subtle reason why interrupting the \gls{thrd} is not a satisfying solution is that the \gls{proc} is not actually using the instance it is tied to. 279 If it were to use it, then helping could be done as part of the usage. 281 280 Interrupts are needed here entirely because the \gls{proc} is tied to an instance it is not using. 282 Therefore, a more satisfying solution is for the \gls{thrd} submitting the operation to notice that the instance is unused and simply go ahead and use it. 283 This approach is presented shortly. 284 285 \subsubsection{Public Instances} 286 The public approach creates decoupled pools of @io_uring@ instances and processors, \ie without one-to-one coupling. 287 \Glspl{thrd} attempting an \io operation pick one of the available instances and submit the operation to that instance. 288 Since there is no coupling between @io_uring@ instances and \glspl{proc} in this approach, \glspl{thrd} running on more than one \gls{proc} can attempt to submit to the same instance concurrently. 289 Because @io_uring@ effectively sets the amount of sharding needed to avoid contention on its internal locks, performance in this approach is based on two aspects: 290 \begin{itemize} 291 \item 292 The synchronization needed to submit does not induce more contention than @io_uring@ already does. 293 \item 294 The scheme to route \io requests to specific @io_uring@ instances does not introduce contention. 295 This aspect has an oversized importance because it comes into play before the sharding of instances, and as such, all \glspl{hthrd} can contend on the routing algorithm. 296 \end{itemize} 297 298 Allocation in this scheme is fairly easy. 
299 Free SQEs, \ie, SQEs that are not currently being used to represent a request, can be written to safely and have a field called @user_data@ that the kernel only reads to copy to @cqe@s. 300 Allocation also requires no ordering guarantee as all free SQEs are interchangeable. 301 The only added complexity is that the number of SQEs is fixed, which means allocation can fail. 302 303 Allocation failures need to be pushed to a routing algorithm: \glspl{thrd} attempting \io operations must not be directed to @io_uring@ instances without sufficient SQEs available. 304 Furthermore, the routing algorithm should block operations up-front, if none of the instances have available SQEs. 305 306 Once an SQE is allocated, \glspl{thrd} insert the \io request information, and keep track of the SQE index and the instance it belongs to. 307 308 Once an SQE is filled in, it is added to the submission ring buffer, an operation that is not thread-safe, and then the kernel must be notified using the @io_uring_enter@ system call. 309 The submission ring buffer is the same size as the pre-allocated SQE buffer, therefore pushing to the ring buffer cannot fail because it would mean a \lstinline{sqe} multiple times in the ring buffer, which is undefined behaviour. 310 However, as mentioned, the system call itself can fail with the expectation that it can be retried once some submitted operations complete. 311 312 Since multiple SQEs can be submitted to the kernel at once, it is important to strike a balance between batching and latency. 313 Operations that are ready to be submitted should be batched together in few system calls, but at the same time, operations should not be left pending for long period of times before being submitted. 
314 Balancing submission can be handled by either designating one of the submitting \glspl{thrd} as the being responsible for the system call for the current batch of SQEs or by having some other party regularly submitting all ready SQEs, \eg, the poller \gls{thrd} mentioned later in this section. 315 316 Ideally, when multiple \glspl{thrd} attempt to submit operations to the same @io_uring@ instance, all requests should be batched together and one of the \glspl{thrd} is designated to do the system call on behalf of the others, called the \newterm{submitter}. 317 However, in practice, \io requests must be handed promptly so there is a need to guarantee everything missed by the current submitter is seen by the next one. 318 Indeed, as long as there is a ``next'' submitter, \glspl{thrd} submitting new \io requests can move on, knowing that some future system call includes their request. 319 Once the system call is done, the submitter must also free SQEs so that the allocator can reused them. 320 321 Finally, the completion side is much simpler since the @io_uring@ system-call enforces a natural synchronization point. 322 Polling simply needs to regularly do the system call, go through the produced CQEs and communicate the result back to the originating \glspl{thrd}. 323 Since CQEs only own a signed 32 bit result, in addition to the copy of the @user_data@ field, all that is needed to communicate the result is a simple future~\cite{wiki:future}. 324 If the submission side does not designate submitters, polling can also submit all SQEs as it is polling events. 325 A simple approach to polling is to allocate a \gls{thrd} per @io_uring@ instance and simply let the poller \glspl{thrd} poll their respective instances when scheduled. 326 327 With the pool of SEQ instances approach, the big advantage is that it is fairly flexible. 328 It does not impose restrictions on what \glspl{thrd} submitting \io operations can and cannot do between allocations and submissions. 
329 It also can gracefully handle running out of resources, SQEs or the kernel returning @EBUSY@. 330 The down side to this approach is that many of the steps used for submitting need complex synchronization to work properly. 331 The routing and allocation algorithm needs to keep track of which ring instances have available SQEs, block incoming requests if no instance is available, prevent barging if \glspl{thrd} are already queued up waiting for SQEs and handle SQEs being freed. 332 The submission side needs to safely append SQEs to the ring buffer, correctly handle chains, make sure no SQE is dropped or left pending forever, notify the allocation side when SQEs can be reused, and handle the kernel returning @EBUSY@. 333 All this synchronization has a significant cost, and compared to the private-instance approach, this synchronization is entirely overhead. 281 Therefore a more satisfying solution would be for the \gls{thrd} submitting the operation to simply notice that the instance is unused and simply go ahead and use it. 282 This is the approach presented next. 334 283 335 284 \subsubsection{Instance borrowing} 336 Both of the prior approaches have undesirable aspects that stem from tight or loose coupling between @io_uring@ and \glspl{proc}. 337 The first approach suffers from tight coupling causing problems when a \gls{proc} does not benefit from the coupling. 338 The second approach suffers from loose coupling causing operations to have synchronization overhead, which tighter coupling avoids. 339 When \glspl{proc} are continuously issuing \io operations, tight coupling is valuable since it avoids synchronization costs. 340 However, in unlikely failure cases or when \glspl{proc} are not using their instances, tight coupling is no longer advantageous. 341 A compromise between these approaches is to allow tight coupling but have the option to revoke the coupling dynamically when failure cases arise. 
342 I call this approach \newterm{instance borrowing}.\footnote{ 343 While instance borrowing looks similar to work sharing and stealing, I think it is different enough to warrant a different verb to avoid confusion.} 344 345 In this approach, each cluster, see Figure~\ref{fig:system}, owns a pool of @io_uring@ instances managed by an \newterm{arbiter}. 285 Both of the approaches presented above have undesirable aspects that stem from too loose or too tight coupling between @io_uring@ and \glspl{proc}. 286 In the first approach, loose coupling meant that all operations have synchronization overhead that a tighter coupling can avoid. 287 The second approach on the other hand suffers from tight coupling causing problems when the \gls{proc} do not benefit from the coupling. 288 While \glspl{proc} are continously issuing \io operations tight coupling is valuable since it avoids synchronization costs. 289 However, in unlikely failure cases or when \glspl{proc} are not making use of their instance, tight coupling is no longer advantageous. 290 A compromise between these approaches would be to allow tight coupling but have the option to revoke this coupling dynamically when failure cases arise. 291 I call this approach ``instance borrowing''\footnote{While it looks similar to work-sharing and work-stealing, I think it is different enough from either to warrant a different verb to avoid confusion.}. 292 293 In this approach, each cluster owns a pool of @io_uring@ instances managed by an arbiter. 346 294 When a \gls{thrd} attempts to issue an \io operation, it ask for an instance from the arbiter and issues requests to that instance. 
347 This instance is now bound to the \gls{proc} the \gls{thrd} isrunning on.348 This binding is kept until the arbiter decides to revoke it, taking back the instance and reverting the \gls{proc} to its initial state with respect to \io.349 This tight coupling means that synchronization can be minimal since only one \gls{proc} can use the instance at a time, akin to the private instances approach.350 However, it differs in that revocation bythe arbiter means this approach does not suffer from the deadlock scenario described above.295 However, in doing so it ties to the instance to the \gls{proc} it is currently running on. 296 This coupling is kept until the arbiter decides to revoke it, taking back the instance and reverting the \gls{proc} to its initial state with respect to \io. 297 This tight coupling means that synchronization can be minimal since only one \gls{proc} can use the instance at any given time, akin to the private instances approach. 298 However, where it differs is that revocation from the arbiter means this approach does not suffer from the deadlock scenario described above. 351 299 352 300 Arbitration is needed in the following cases: 353 301 \begin{enumerate} 354 \item The current \gls{proc} does not hold an instance.302 \item The current \gls{proc} does not currently hold an instance. 355 303 \item The current instance does not have sufficient SQEs to satisfy the request. 356 \item The current \gls{proc} has a wrong instance, this happens if the submitting \gls{thrd} context-switched between allocation and submission, called \newterm{external submissions}. 304 \item The current \gls{proc} has the wrong instance, this happens if the submitting \gls{thrd} context-switched between allocation and submission. 305 I will refer to these as \newterm{External Submissions}. 
357 306 \end{enumerate} 358 However, even when the arbiter is not directly needed, \glspl{proc} need to make sure that their instance ownership is not being revoked, which is accomplished by a lock-\emph{less} handshake.\footnote{359 Note the handshake is not lock \emph{free} since it lacks the proper progress guarantee.} 307 However, even when the arbiter is not directly needed, \glspl{proc} need to make sure that their ownership of the instance is not being revoked. 308 This can be accomplished by a lock-less handshake\footnote{Note that the handshake is not Lock-\emph{Free} since it lacks the proper progress guarantee.}. 360 309 A \gls{proc} raises a local flag before using its borrowed instance and checks if the instance is marked as revoked or if the arbiter has raised its flag. 361 If not ,it proceeds, otherwise it delegates the operation to the arbiter.310 If not it proceeds, otherwise it delegates the operation to the arbiter. 362 311 Once the operation is completed, the \gls{proc} lowers its local flag. 363 312 364 Correspondingly, before revoking an instance ,the arbiter marks the instance and then waits for the \gls{proc} using it to lower its local flag.313 Correspondingly, before revoking an instance the arbiter marks the instance and then waits for the \gls{proc} using it to lower its local flag. 365 314 Only then does it reclaim the instance and potentially assign it to an other \gls{proc}. 366 315 … … 374 323 375 324 \paragraph{External Submissions} are handled by the arbiter by revoking the appropriate instance and adding the submission to the submission ring. 376 However, there is no need to immediately revoke the instance.325 There is no need to immediately revoke the instance however. 377 326 External submissions must simply be added to the ring before the next system call, \ie, when the submission ring is flushed. 378 This means whoever is responsible for the system call, first checks if the instance has any external submissions. 
379 If so, it asks the arbiter to revoke the instance and add the external submissions to the ring. 380 381 \paragraph{Pending Allocations} are handled by the arbiter when it has available instances and can directly hand over the instance and satisfy the request. 382 Otherwise, it must hold onto the list of threads until SQEs are made available again. 383 This handling is more complex when an allocation requires multiple SQEs, since the arbiter must make a decision between satisfying requests in FIFO ordering or for fewer SQEs. 384 385 While an arbiter has the potential to solve many of the problems mentioned above, it also introduces a significant amount of complexity. 327 This means that whoever is responsible for the system call first checks if the instance has any external submissions. 328 If it is the case, it asks the arbiter to revoke the instance and add the external submissions to the ring. 329 330 \paragraph{Pending Allocations} can be more complicated to handle. 331 If the arbiter has available instances, the arbiter can attempt to directly hand over the instance and satisfy the request. 332 Otherwise it must hold onto the list of threads until SQEs are made available again. 333 This handling becomes that much more complex if pending allocation require more than one SQE, since the arbiter must make a decision between statisfying requests in FIFO ordering or satisfy requests for fewer SQEs first. 334 335 While this arbiter has the potential to solve many of the problems mentionned in above, it also introduces a significant amount of complexity. 386 336 Tracking which processors are borrowing which instances and which instances have SQEs available ends-up adding a significant synchronization prelude to any I/O operation. 387 337 Any submission must start with a handshake that pins the currently borrowed instance, if available. 
388 338 An attempt to allocate is then made, but the arbiter can concurrently be attempting to allocate from the same instance from a different \gls{hthrd}. 389 Once the allocation is completed, the submission must check that the instance is still burrowed before attemptingto flush.390 These synchronization steps turn out to have a similar cost to the multiple shared-instances approach.339 Once the allocation is completed, the submission must still check that the instance is still burrowed before attempt to flush. 340 These extra synchronization steps end-up having a similar cost to the multiple shared instances approach. 391 341 Furthermore, if the number of instances does not match the number of processors actively submitting I/O, the system can fall into a state where instances are constantly being revoked and end-up cycling the processors, which leads to significant cache deterioration. 392 Forthese reasons, this approach, which sounds promising on paper, does not improve on the private instance approach in practice.342 Because of these reasons, this approach, which sounds promising on paper, does not improve on the private instance approach in practice. 393 343 394 344 \subsubsection{Private Instances V2} 395 345 346 347 396 348 % Verbs of this design 397 349 398 350 % Allocation: obtaining an sqe from which to fill in the io request, enforces the io instance to use since it must be the one which provided the sqe. Must interact with the arbiter if the instance does not have enough sqe for the allocation. (Typical allocation will ask for only one sqe, but chained sqe must be allocated from the same context so chains of sqe must be allocated in bulks) 399 351 400 % Submi ssion: simply adds the sqe(s) to some data structure to communicate that they are ready to go. This operation can't fail because there are as many spots in the submit buffer than there are sqes. 
Must interact with the arbiter only if the thread was moved between the allocation and the submission.352 % Submition: simply adds the sqe(s) to some data structure to communicate that they are ready to go. This operation can't fail because there are as many spots in the submit buffer than there are sqes. Must interact with the arbiter only if the thread was moved between the allocation and the submission. 401 353 402 354 % Flushing: Taking all the sqes that were submitted and making them visible to the kernel, also counting them in order to figure out what to_submit should be. Must be thread-safe with submission. Has to interact with the Arbiter if there are external submissions. Can't simply use a protected queue because adding to the array is not safe if the ring is still available for submitters. Flushing must therefore: check if there are external pending requests if so, ask the arbiter to flush otherwise use the fast flush operation. … … 405 357 406 358 % Handle: process all the produced cqe. No need to interact with any of the submission operations or the arbiter. 359 360 407 361 408 362 … … 450 404 451 405 \section{Interface} 452 The last important part of the \io subsystem is its interface. 453 There are multiple approaches that can be offered to programmers, each with advantages and disadvantages. 454 The new \io subsystem can replace the C runtime API or extend it, and in the later case, the interface can go from very similar to vastly different. 455 The following sections discuss some useful options using @read@ as an example. 456 The standard Linux interface for C is : 457 \begin{cfa} 458 ssize_t read(int fd, void *buf, size_t count); 459 \end{cfa} 406 Finally, the last important part of the \io subsystem is it's interface. There are multiple approaches that can be offered to programmers, each with advantages and disadvantages. The new \io subsystem can replace the C runtime's API or extend it. 
And in the latter case the interface can go from very similar to vastly different. The following sections discuss some useful options using @read@ as an example. The standard Linux interface for C is: 407 408 @ssize_t read(int fd, void *buf, size_t count);@ 460 409 461 410 \subsection{Replacement} 462 411 Replacing the C \glsxtrshort{api} is the more intrusive and draconian approach. 463 412 The goal is to convince the compiler and linker to replace any calls to @read@ to direct them to the \CFA implementation instead of glibc's. 464 This rerouting has the advantage of working transparently and supporting existing binaries without needing recompilation. 413 This has the advantage of potentially working transparently and supporting existing binaries without needing recompilation. 465 414 It also offers a, presumably, well known and familiar API that C programmers can simply continue to work with. 466 However, this approach also entails a plethora of subtle technical challenges, which generally boils down to making a perfect replacement. 415 However, this approach also entails a plethora of subtle technical challenges which generally boils down to making a perfect replacement. 467 416 If the \CFA interface replaces only \emph{some} of the calls to glibc, then this can easily lead to esoteric concurrency bugs. 468 Since the gcc ecosystems does not offer a scheme for perfect replacement, this approach was rejected as being laudable but infeasible. 417 Since the gcc ecosystems does not offer a scheme for such perfect replacement, this approach was rejected as being laudable but infeasible. 469 418 470 419 \subsection{Synchronous Extension} 471 Another interface option is to offer an interface different in name only. 472 For example: 473 \begin{cfa} 474 ssize_t cfa_read(int fd, void *buf, size_t count); 475 \end{cfa} 476 This approach is feasible and still familiar to C programmers. 
477 It comes with the caveat that any code attempting to use it must be recompiled, which is a problem considering the amount of existing legacy C binaries. 420 An other interface option is to simply offer an interface that is different in name only. For example: 421 422 @ssize_t cfa_read(int fd, void *buf, size_t count);@ 423 424 \noindent This is much more feasible but still familiar to C programmers. 425 It comes with the caveat that any code attempting to use it must be recompiled, which can be a big problem considering the amount of existing legacy C binaries. 478 426 However, it has the advantage of implementation simplicity. 479 Finally, there is a certain irony to using a blocking synchronous interfaces for a feature often referred to as ``non-blocking'' \io.480 427 481 428 \subsection{Asynchronous Extension} 482 A fairly traditional way of providing asynchronous interactions is using a future mechanism~\cite{multilisp}, \eg: 483 \begin{cfa} 484 future(ssize_t) read(int fd, void *buf, size_t count); 485 \end{cfa} 486 where the generic @future@ is fulfilled when the read completes and it contains the number of bytes read, which may be less than the number of bytes requested. 487 The data read is placed in @buf@. 488 The problem is that both the bytes read and data form the synchronization object, not just the bytes read. 489 Hence, the buffer cannot be reused until the operation completes but the synchronization does not cover the buffer. 490 A classical asynchronous API is: 491 \begin{cfa} 492 future([ssize_t, void *]) read(int fd, size_t count); 493 \end{cfa} 494 where the future tuple covers the components that require synchronization. 495 However, this interface immediately introduces memory lifetime challenges since the call must effectively allocate a buffer to be returned. 496 Because of the performance implications of this API, the first approach is considered preferable as it is more familiar to C programmers. 
497 498 \subsection{Direct \lstinline{io_uring} Interface} 499 The last interface directly exposes the underlying @io_uring@ interface, \eg: 500 \begin{cfa} 501 array(SQE, want) cfa_io_allocate(int want); 502 void cfa_io_submit( const array(SQE, have) & ); 503 \end{cfa} 504 where the generic @array@ contains an array of SQEs with a size that may be less than the request. 505 This offers more flexibility to users wanting to fully utilize all of the @io_uring@ features. 429 It is important to mention that there is a certain irony to using only synchronous, therefore blocking, interfaces for a feature often referred to as ``non-blocking'' \io. 430 A fairly traditional way of doing this is using futures~\cite{wiki:future}. 431 A simple way of doing so is as follows: 432 433 @future(ssize_t) read(int fd, void *buf, size_t count);@ 434 435 \noindent Note that this approach is not necessarily the most idiomatic usage of futures. 436 The definition of read above ``returns'' the read content through an output parameter which cannot be synchronized on. 437 A more classical asynchronous API could look more like: 438 439 @future([ssize_t, void *]) read(int fd, size_t count);@ 440 441 \noindent However, this interface immediately introduces memory lifetime challenges since the call must effectively allocate a buffer to be returned. 442 Because of the performance implications of this, the first approach is considered preferable as it is more familiar to C programmers. 443 444 \subsection{Interface directly to \lstinline{io_uring}} 445 Finally, another interface that can be relevant is to simply expose directly the underlying \texttt{io\_uring} interface. For example: 446 447 @array(SQE, want) cfa_io_allocate(int want);@ 448 449 @void cfa_io_submit( const array(SQE, have) & );@ 450 451 \noindent This offers more flexibility to users wanting to fully use all of the \texttt{io\_uring} features. 506 452 However, it is not the most user-friendly option. 
It obviously imposes a strong dependency between user code and @io_uring@ while at the same time restricting users to usages that are compatible with how \CFA internally uses @io_uring@.
doc/theses/thierry_delisle_PhD/thesis/text/practice.tex
rffec1bf r9e23b446 1 1 \chapter{Scheduling in practice}\label{practice} 2 The scheduling algorithm d escribed in Chapter~\ref{core} addresses scheduling in a stable state.3 This chapter addresses problems that occur when the system state changes.2 The scheduling algorithm discribed in Chapter~\ref{core} addresses scheduling in a stable state. 3 However, it does not address problems that occur when the system changes state. 4 4 Indeed the \CFA runtime, supports expanding and shrinking the number of \procs, both manually and, to some extent, automatically. 5 These changes affect the scheduling algorithm, which must dynamically alter its behaviour. 6 7 In detail, \CFA supports adding \procs using the type @processor@, in both RAII and heap coding scenarios. 8 \begin{cfa} 9 { 10 processor p[4]; // 4 new kernel threads 11 ... // execute on 4 processors 12 processor * dp = new( processor, 6 ); // 6 new kernel threads 13 ... // execute on 10 processors 14 delete( dp ); // delete 6 kernel threads 15 ... // execute on 4 processors 16 } // delete 4 kernel threads 17 \end{cfa} 18 Dynamically allocated processors can be deleted an any time, \ie their lifetime exceeds the block of creation. 19 The consequence is that the scheduler and \io subsystems must know when these \procs come in and out of existence and roll them into the appropriate scheduling algorithms. 5 This entails that the scheduling algorithm must support these transitions. 6 7 More precise \CFA supports adding \procs using the RAII object @processor@. 8 These objects can be created at any time and can be destroyed at any time. 9 They are normally created as automatic stack variables, but this is not a requirement. 10 11 The consequence is that the scheduler and \io subsystems must support \procs comming in and out of existence. 20 12 21 13 \section{Manual Resizing} 22 14 Manual resizing is expected to be a rare operation. 
23 Programmers normally create/delete processors on a clusters at startup/teardown.24 Therefore ,dynamically changing the number of \procs is an appropriate moment to allocate or free resources to match the new state.25 As such , all internal scheduling arrays that are sized based on the number of \procs need to be @realloc@ed.26 This requirement also means any references into these arrays, \eg pointers or indexes, may need to be updated if elements are moved for compaction or for any other reason.15 Programmers are mostly expected to resize clusters on startup or teardown. 16 Therefore dynamically changing the number of \procs is an appropriate moment to allocate or free resources to match the new state. 17 As such all internal arrays that are sized based on the number of \procs need to be \texttt{realloc}ed. 18 This also means that any references into these arrays, pointers or indexes, may need to be fixed when shrinking\footnote{Indexes may still need fixing when shrinkingbecause some indexes are expected to refer to dense contiguous resources and there is no guarantee the resource being removed has the highest index.}. 27 19 28 20 There are no performance requirements, within reason, for resizing since it is expected to be rare. 29 However, this operation has strict correctness requirements since updating and idle sleep can easily lead to deadlocks.21 However, this operation has strict correctness requirements since shrinking and idle sleep can easily lead to deadlocks. 30 22 It should also avoid as much as possible any effect on performance when the number of \procs remain constant. 31 23 This later requirement prohibits naive solutions, like simply adding a global lock to the ready-queue arrays. 32 24 33 25 \subsection{Read-Copy-Update} 34 One solution is to use the Read-Copy-Update pattern~\cite{wiki:rcu}. 
35 In this pattern, resizing is done by creating a copy of the internal data structures, \eg see Figure~\ref{fig:base-ts2}, updating the copy with the desired changes, and then attempt an Indiana Jones Switch to replace the original with the copy. 36 This approach has the advantage that it may not need any synchronization to do the switch. 37 However, there is a race where \procs still use the original data structure after the copy is switched. 38 This race not only requires adding a memory-reclamation scheme, it also requires that operations made on the stale original version are eventually moved to the copy. 39 40 Specifically, the original data structure must be kept until all \procs have witnessed the change. 41 This requirement is the \newterm{memory reclamation challenge} and means every operation needs \emph{some} form of synchronization. 42 If all operations need synchronization, then the overall cost of this technique is likely to be similar to an uncontended lock approach. 43 In addition to the classic challenge of memory reclamation, transferring the original data to the copy before reclaiming it poses additional challenges. 26 One solution is to use the Read-Copy-Update\cite{wiki:rcu} pattern. 27 In this pattern, resizing is done by creating a copy of the internal data strucures, updating the copy with the desired changes, and then attempt an Idiana Jones Switch to replace the original witht the copy. 28 This approach potentially has the advantage that it may not need any synchronization to do the switch. 29 However, there is a race where \procs could still use the previous, original, data structure after the copy was switched in. 30 This race not only requires some added memory reclamation scheme, it also requires that operations made on the stale original version be eventually moved to the copy. 
For linked-lists, enqueuing is only somewhat problematic: \ats enqueued to the original queues need to be transferred to the new queues, which might not preserve ordering.
53 54 \subsection{Readers-Writer Lock} 55 A simpler approach is to use a \newterm{Readers-Writer Lock}~\cite{wiki:rwlock}, where the resizing requires acquiring the lock as a writer while simply enqueueing/dequeuing \ats requires acquiring the lock as a reader. 43 \subsection{Read-Writer Lock} 44 A simpler approach would be to use a \newterm{Readers-Writer Lock}\cite{wiki:rwlock} where the resizing requires acquiring the lock as a writer while simply enqueing/dequeing \ats requires acquiring the lock as a reader. 56 45 Using a Readers-Writer lock solves the problem of dynamically resizing and leaves the challenge of finding or building a lock with sufficient good read-side performance. 57 Since this approach is not a very complex challenge and an ad-hoc solution is perfectly acceptable, building a Readers-Writer lock was the path taken. 58 59 To maximize reader scalability, readers should not contend with each other when attempting to acquire and release a critical section. 60 To achieve this goal requires each reader to have its own memory to mark as locked and unlocked. 61 The read acquire possibly waits for a writer to finish the critical section and then acquires a reader's local spinlock. 62 The write acquire acquires the global lock, guaranteeing mutual exclusion among writers, and then acquires each of the local reader locks. 63 Acquiring all the local read locks guarantees mutual exclusion among the readers and the writer, while the wait on the read side prevents readers from continuously starving the writer. 64 65 Figure~\ref{f:SpecializedReadersWriterLock} shows the outline for this specialized readers-writer lock. 66 The lock in nonblocking, so both readers and writers spin while the lock is held. 67 \todo{finish explanation} 68 69 \begin{figure} 70 \begin{cfa} 46 Since this is not a very complex challenge and an ad-hoc solution is perfectly acceptable, building a Readers-Writer lock was the path taken. 
47 48 To maximize reader scalability, the readers should not contend with eachother when attempting to acquire and release the critical sections. 49 This effectively requires that each reader have its own piece of memory to mark as locked and unlocked. 50 Reades then acquire the lock wait for writers to finish the critical section and then acquire their local spinlocks. 51 Writers acquire the global lock, so writers have mutual exclusion among themselves, and then acquires each of the local reader locks. 52 Acquiring all the local locks guarantees mutual exclusion between the readers and the writer, while the wait on the read side prevents readers from continously starving the writer. 53 \todo{reference listings} 54 55 \begin{lstlisting} 71 56 void read_lock() { 72 57 // Step 1 : make sure no writers in 73 58 while write_lock { Pause(); } 59 60 // May need fence here 61 74 62 // Step 2 : acquire our local lock 75 while atomic_xchg( tls.lock ) { Pause(); } 76 } 63 while atomic_xchg( tls.lock ) { 64 Pause(); 65 } 66 } 67 77 68 void read_unlock() { 78 69 tls.lock = false; 79 70 } 71 \end{lstlisting} 72 73 \begin{lstlisting} 80 74 void write_lock() { 81 75 // Step 1 : lock global lock 82 while atomic_xchg( write_lock ) { Pause(); } 76 while atomic_xchg( write_lock ) { 77 Pause(); 78 } 79 83 80 // Step 2 : lock per-proc locks 84 81 for t in all_tls { 85 while atomic_xchg( t.lock ) { Pause(); } 86 } 87 } 82 while atomic_xchg( t.lock ) { 83 Pause(); 84 } 85 } 86 } 87 88 88 void write_unlock() { 89 89 // Step 1 : release local locks 90 for t in all_tls { t.lock = false; } 90 for t in all_tls { 91 t.lock = false; 92 } 93 91 94 // Step 2 : release global lock 92 95 write_lock = false; 93 96 } 94 \end{cfa} 95 \caption{Specialized Readers-Writer Lock} 96 \label{f:SpecializedReadersWriterLock} 97 \end{figure} 98 99 \section{Idle-Sleep}\label{idlesleep} 100 While manual resizing of \procs is expected to be rare, the number of \ats can vary significantly over an application's 
lifetime, which means there are times when there are too few or too many \procs. 101 For this work, it is the programer's responsibility to manually create \procs, so if there are too few \procs, the application must address this issue. 102 This leaves too many \procs when there are not enough \ats for all the \procs to be useful. 103 These idle \procs cannot be removed because their lifetime is controlled by the application, and only the application knows when the number of \ats may increase or decrease. 104 While idle \procs can spin until work appears, this approach wastes energy, unnecessarily produces heat and prevents other applications from using the processor. 105 Therefore, idle \procs are put into an idle state, called \newterm{Idle-Sleep}, where the \gls{kthrd} is blocked until the scheduler deems it is needed. 97 \end{lstlisting} 98 99 \section{Idle-Sleep} 100 In addition to users manually changing the number of \procs, it is desireable to support ``removing'' \procs when there is not enough \ats for all the \procs to be useful. 101 While manual resizing is expected to be rare, the number of \ats is expected to vary much more which means \procs may need to be ``removed'' for only short periods of time. 102 Furthermore, race conditions that spuriously lead to the impression that no \ats are ready are actually common in practice. 103 Therefore resources associated with \procs should not be freed but \procs simply put into an idle state where the \gls{kthrd} is blocked until more \ats become ready. 104 This state is referred to as \newterm{Idle-Sleep}. 106 105 107 106 Idle sleep effectively encompasses several challenges. 108 First, a data structure needs to keep track of all \procs that are in idle sleep. 109 Because idle sleep is spurious, this data structure has strict performance requirements, in addition to strict correctness requirements. 110 Next, some mechanism is needed to block \glspl{kthrd}, \eg @pthread_cond_wait@ on a pthread semaphore. 
111 The complexity here is to support \at parking and unparking, user-level locking, timers, \io operations, and all other \CFA features with minimal complexity. 112 Finally, the scheduler needs a heuristic to determine when to block and unblock an appropriate number of \procs. 113 However, this third challenge is outside the scope of this thesis because developing a general heuristic is complex enough to justify its own work. 114 Therefore, the \CFA scheduler simply follows the ``Race-to-Idle''~\cite{Albers12} approach where a sleeping \proc is woken any time a \at becomes ready and \procs go to idle sleep anytime they run out of work. 115 An interesting sub-part of this heuristic is what to do with bursts of \ats that become ready. 116 Since waking up a sleeping \proc can have notable latency, it is possible multiple \ats become ready while a single \proc is waking up. 117 This facts begs the question, if many \procs are available, how many should be woken? 118 If the ready \ats will run longer than the wake-up latency, waking one \proc per \at will offer maximum parallelisation. 119 If the ready \ats will run for a short very short time, waking many \procs may be wasteful. 120 As mentioned, a heuristic to handle these complex cases is outside the scope of this thesis, the behaviour of the scheduler in this particular case is left unspecified. 107 First some data structure needs to keep track of all \procs that are in idle sleep. 108 Because of idle sleep can be spurious, this data structure has strict performance requirements in addition to the strict correctness requirements. 109 Next, some tool must be used to block kernel threads \glspl{kthrd}, \eg \texttt{pthread\_cond\_wait}, pthread semaphores. 110 The complexity here is to support \at parking and unparking, timers, \io operations and all other \CFA features with minimal complexity. 
111 Finally, idle sleep also includes a heuristic to determine the appropriate number of \procs to be in idle sleep an any given time. 112 This third challenge is however outside the scope of this thesis because developping a general heuristic is involved enough to justify its own work. 113 The \CFA scheduler simply follows the ``Race-to-Idle'\cit{https://doi.org/10.1137/1.9781611973099.100}' approach where a sleeping \proc is woken any time an \at becomes ready and \procs go to idle sleep anytime they run out of work. 121 114 122 115 \section{Sleeping} 123 116 As usual, the corner-stone of any feature related to the kernel is the choice of system call. 124 In terms of blocking a \gls{kthrd} until some event occurs, the Linux kernel has many available options. 125 126 \subsection{\lstinline{pthread_mutex}/\lstinline{pthread_cond}} 127 The classic option is to use some combination of the pthread mutual exclusion and synchronization locks, allowing a safe park/unpark of a \gls{kthrd} to/from a @pthread_cond@. 128 While this approach works for \glspl{kthrd} waiting among themselves, \io operations do not provide a mechanism to signal @pthread_cond@s. 129 For \io results to wake a \proc waiting on a @pthread_cond@ means a different \glspl{kthrd} must be woken up first, which then signals the \proc. 130 131 \subsection{\lstinline{io_uring} and Epoll} 132 An alternative is to flip the problem on its head and block waiting for \io, using @io_uring@ or @epoll@. 133 This creates the inverse situation, where \io operations directly wake sleeping \procs but waking blocked \procs must use an indirect scheme. 134 This generally takes the form of creating a file descriptor, \eg, dummy file, pipe, or event fd, and using that file descriptor when \procs need to wake each other. 135 This leads to additional complexity because there can be a race between these artificial \io and genuine \io operations. 
136 If not handled correctly, this can lead to artificial files getting delayed too long behind genuine files, resulting in longer latency. 117 In terms of blocking a \gls{kthrd} until some event occurs the linux kernel has many available options: 118 119 \paragraph{\texttt{pthread\_mutex}/\texttt{pthread\_cond}} 120 The most classic option is to use some combination of \texttt{pthread\_mutex} and \texttt{pthread\_cond}. 121 These serve as straight forward mutual exclusion and synchronization tools and allow a \gls{kthrd} to wait on a \texttt{pthread\_cond} until signalled. 122 While this approach is generally perfectly appropriate for \glspl{kthrd} waiting after eachother, \io operations do not signal \texttt{pthread\_cond}s. 123 For \io results to wake a \proc waiting on a \texttt{pthread\_cond} means that a different \glspl{kthrd} must be woken up first, and then the \proc can be signalled. 124 125 \subsection{\texttt{io\_uring} and Epoll} 126 An alternative is to flip the problem on its head and block waiting for \io, using \texttt{io\_uring} or even \texttt{epoll}. 127 This creates the inverse situation, where \io operations directly wake sleeping \procs but waking \proc from a running \gls{kthrd} must use an indirect scheme. 128 This generally takes the form of creating a file descriptor, \eg, a dummy file, a pipe or an event fd, and using that file descriptor when \procs need to wake eachother. 129 This leads to additional complexity because there can be a race between these artificial \io operations and genuine \io operations. 130 If not handled correctly, this can lead to the artificial files going out of sync. 137 131 138 132 \subsection{Event FDs} 139 133 Another interesting approach is to use an event file descriptor\cit{eventfd}. 140 This Linux feature is a file descriptor that behaves like \io, \ie, uses @read@ and @write@, but also behaves like a semaphore. 141 Indeed, all reads and writes must use a word-sized values, \ie 64 or 32 bits. 
142 Writes \emph{add} their values to a buffer using arithmetic addition versus buffer append, and reads zero out the buffer and return the buffer values so far.\footnote{ 143 This behaviour is without the \lstinline{EFD_SEMAPHORE} flag, which changes the behaviour of \lstinline{read} but is not needed for this work.} 134 This is a Linux feature that is a file descriptor that behaves like \io, \ie, uses \texttt{read} and \texttt{write}, but also behaves like a semaphore. 135 Indeed, all read and writes must use 64bits large values\footnote{On 64-bit Linux, a 32-bit Linux would use 32 bits values.}. 136 Writes add their values to the buffer, that is arithmetic addition and not buffer append, and reads zero out the buffer and return the buffer values so far\footnote{This is without the \texttt{EFD\_SEMAPHORE} flag. This flags changes the behavior of \texttt{read} but is not needed for this work.}. 144 137 If a read is made while the buffer is already 0, the read blocks until a non-0 value is added. 145 What makes this feature particularly interesting is that @io_uring@ supports the @IORING_REGISTER_EVENTFD@ command to register an event @fd@ to a particular instance. 146 Once that instance is registered, any \io completion results in @io_uring@ writing to the event @fd@. 147 This means that a \proc waiting on the event @fd@ can be \emph{directly} woken up by either other \procs or incoming \io. 138 What makes this feature particularly interesting is that \texttt{io\_uring} supports the \texttt{IORING\_REGISTER\_EVENTFD} command, to register an event fd to a particular instance. 139 Once that instance is registered, any \io completion will result in \texttt{io\_uring} writing to the event FD. 140 This means that a \proc waiting on the event FD can be \emph{directly} woken up by either other \procs or incomming \io. 
141 142 \begin{figure} 143 \centering 144 \input{idle1.pstex_t} 145 \caption[Basic Idle Sleep Data Structure]{Basic Idle Sleep Data Structure \smallskip\newline Each idle \proc is put unto a doubly-linked stack protected by a lock. 146 Each \proc has a private event FD.} 147 \label{fig:idle1} 148 \end{figure} 149 148 150 149 151 \section{Tracking Sleepers} 150 152 Tracking which \procs are in idle sleep requires a data structure holding all the sleeping \procs, but more importantly it requires a concurrent \emph{handshake} so that no \at is stranded on a ready-queue with no active \proc. 151 The classic challenge occurs when a \at is made ready while a \proc is going to sleep: there is a race where the new \at may not see the sleeping \proc and the sleeping \proc may not see the ready \at. 152 Since \ats can be made ready by timers, \io operations, or other events outside a cluster, this race can occur even if the \proc going to sleep is the only \proc awake. 153 As a result, improper handling of this race leads to all \procs going to sleep when there are ready \ats and the system deadlocks. 154 155 The handshake closing the race is done with both the notifier and the idle \proc executing two ordered steps. 156 The notifier first make sure the newly ready \at is visible to \procs searching for \ats, and then attempt to notify an idle \proc. 157 On the other side, \procs make themselves visible as idle \procs and then search for any \ats they may have missed. 158 Unlike regular work-stealing, this search must be exhaustive to make sure that pre-existing \at is missed. 159 These steps from both sides guarantee that if the search misses a newly ready \at, then the notifier is guaranteed to see at least one idle \proc. 160 Conversly, if the notifier does not see any idle \proc, then a \proc is guaranteed to find the new \at in its exhaustive search. 
153 The classic challenge is when a \at is made ready while a \proc is going to sleep, there is a race where the new \at may not see the sleeping \proc and the sleeping \proc may not see the ready \at. 154 Since \ats can be made ready by timers, \io operations or other events outside a clusre, this race can occur even if the \proc going to sleep is the only \proc awake. 155 As a result, improper handling of this race can lead to all \procs going to sleep and the system deadlocking. 161 156 162 157 Furthermore, the ``Race-to-Idle'' approach means that there may be contention on the data structure tracking sleepers. 163 Contention can be tolerated for \procs attempting to sleep or wake-up because these \procs are not doing useful work, and therefore, not contributing to overall performance. 164 However, notifying, checking if a \proc must be woken-up, and doing so if needed, can significantly affect overall performance and must be low cost. 158 Contention slowing down \procs attempting to sleep or wake-up can be tolerated. 159 These \procs are not doing useful work and therefore not contributing to overall performance. 160 However, notifying, checking if a \proc must be woken-up and doing so if needed, can significantly affect overall performance and must be low cost. 165 161 166 162 \subsection{Sleepers List} 167 163 Each cluster maintains a list of idle \procs, organized as a stack. 168 This ordering allows \procs at the tail to stay in idle sleep for extended period of times while those at the head of the list wake-up for bursts of activity.169 Because of unbalanced performance requirements, the algorithm tracking sleepers is designed to have idle \procshandle as much of the work as possible.170 The idle \procs maintain the stackof sleepers among themselves and notifying a sleeping \proc takes as little work as possible.164 This ordering hopefully allows \proc at the tail to stay in idle sleep for extended period of times. 
165 Because of these unbalanced performance requirements, the algorithm tracking sleepers is designed to have idle \proc handle as much of the work as possible. 166 The idle \procs maintain the of sleepers among themselves and notifying a sleeping \proc takes as little work as possible. 171 167 This approach means that maintaining the list is fairly straightforward. 172 The list can simply use a single lock per cluster and only \procs that are getting in and out of the idle statecontend for that lock.168 The list can simply use a single lock per cluster and only \procs that are getting in and out of idle state will contend for that lock. 173 169 174 170 This approach also simplifies notification. 175 Indeed, \procs n ot only need to be notify when a new \at is readied, but also must be notified during manualresizing, so the \gls{kthrd} can be joined.176 Th ese requirements meanwhichever entity removes idle \procs from the sleeper list must be able to do so in any order.171 Indeed, \procs need to be notify when a new \at is readied, but they also must be notified during resizing, so the \gls{kthrd} can be joined. 172 This means that whichever entity removes idle \procs from the sleeper list must be able to do so in any order. 177 173 Using a simple lock over this data structure makes the removal much simpler than using a lock-free data structure. 178 The single lock also means the notification process simply needs to wake-up the desired idle \proc, using @pthread_cond_signal@, @write@ on an @fd@, \etc, and the \proc handlesthe rest.174 The notification process then simply needs to wake-up the desired idle \proc, using \texttt{pthread\_cond\_signal}, \texttt{write} on an fd, etc., and the \proc will handle the rest. 179 175 180 176 \subsection{Reducing Latency} 181 As mentioned in this section, \procs going to sleep for extremely short periods of time is likely in certain scenarios. 
182 Therefore, the latency of doing a system call to read from and writing to an event @fd@ can negatively affect overall performance in a notable way. 183 Hence, it is important to reduce latency and contention of the notification as much as possible. 184 Figure~\ref{fig:idle1} shows the basic idle-sleep data structure. 185 For the notifiers, this data structure can cause contention on the lock and the event @fd@ syscall can cause notable latency. 186 187 \begin{figure} 188 \centering 189 \input{idle1.pstex_t} 190 \caption[Basic Idle Sleep Data Structure]{Basic Idle Sleep Data Structure \smallskip\newline Each idle \proc is put unto a doubly-linked stack protected by a lock. 191 Each \proc has a private event \lstinline{fd}.} 192 \label{fig:idle1} 193 \end{figure} 194 195 Contention occurs because the idle-list lock must be held to access the idle list, \eg by \procs attempting to go to sleep, \procs waking, or notification attempts. 196 The contention from the \procs attempting to go to sleep can be mitigated slightly by using @try_acquire@, so the \procs simply busy wait again searching for \ats if the lock is held. 197 This trick cannot be used when waking \procs since the waker needs to return immediately to what it was doing. 198 199 Interestingly, general notification, \ie waking any idle processor versus a specific one, does not strictly require modifying the list. 200 Here, contention can be reduced notably by having notifiers avoid the lock entirely by adding a pointer to the event @fd@ of the first idle \proc, as in Figure~\ref{fig:idle2}. 201 To avoid contention among notifiers, notifiers atomically exchange the pointer with @NULL@. 202 The first notifier succeeds on the exchange and obtains the @fd@ of an idle \proc; 203 hence, only one notifier contends on the system call. 204 This notifier writes to the @fd@ to wake a \proc. 
205 The woken \proc then updates the atomic pointer, while it is updating the head of the list, as it removes itself from the list. 206 Notifiers that obtained a @NULL@ in the exchange simply move on knowing that another notifier is already waking a \proc. 207 This behaviour is equivalent to having multiple notifier write to the @fd@ since reads consume all previous writes. 208 Note that with and without this atomic pointer, bursts of notification can lead to an unspecified number of \procs being woken up, depending on how the arrival notification compares witht the latency of \procs waking up. 209 As mentioned in section~\ref{idlesleep}, there is no optimal approach to handle these bursts. 210 It is therefore difficult to justify the cost of any extra synchronization here. 211 212 \begin{figure}[t] 177 As mentioned in this section, \procs going idle for extremely short periods of time is likely in certain common scenarios. 178 Therefore, the latency of doing a system call to read from and writing to the event fd can actually negatively affect overall performance in a notable way. 179 Is it important to reduce latency and contention of the notification as much as possible. 180 Figure~\ref{fig:idle1} shoes the basic idle sleep data structure. 181 For the notifiers, this data structure can cause contention on the lock and the event fd syscall can cause notable latency. 
182 183 \begin{figure} 213 184 \centering 214 185 \input{idle2.pstex_t} 215 \caption[Improved Idle -Sleep Data Structure]{Improved Idle-Sleep Data Structure \smallskip\newline An atomic pointer is added to the listpointing to the Event FD of the first \proc on the list.}186 \caption[Improved Idle Sleep Data Structure]{Improved Idle Sleep Data Structure \smallskip\newline An atomic pointer is added to the list, pointing to the Event FD of the first \proc on the list.} 216 187 \label{fig:idle2} 217 188 \end{figure} 218 189 219 The next optimization is to avoid the latency of the event @fd@, which can be done by adding what is effectively a binary benaphore\cit{benaphore} in front of the event @fd@. 220 The benaphore over the event @fd@ logically provides a three state flag to avoid unnecessary system calls, where the states are expressed explicit in Figure~\ref{fig:idle:state}. 221 A \proc begins its idle sleep by adding itself to the idle list before searching for an \at. 222 In the process of adding itself to the idle list, it sets the state flag to @SEARCH@. 223 If no \ats can be found during the search, the \proc then confirms it is going to sleep by atomically swapping the state to @SLEEP@. 224 If the previous state is still @SEARCH@, then the \proc does read the event @fd@. 225 Meanwhile, notifiers atomically exchange the state to @AWAKE@ state. 226 If the previous state is @SLEEP@, then the notifier must write to the event @fd@. 227 However, if the notify arrives almost immediately after the \proc marks itself idle, then both reads and writes on the event @fd@ can be omitted, which reduces latency notably. 228 These extensions leads to the final data structure shown in Figure~\ref{fig:idle}. 190 The contention is mostly due to the lock on the list needing to be held to get to the head \proc. 191 That lock can be contended by \procs attempting to go to sleep, \procs waking or notification attempts. 
192 The contention from the \procs attempting to go to sleep can be mitigated slightly by using \texttt{try\_acquire} instead, so the \procs simply continue searching for \ats if the lock is held. 193 This trick cannot be used for waking \procs since they are not in a state where they can run \ats. 194 However, it is worth noting that notification does not strictly require accessing the list or the head \proc. 195 Therefore, contention can be reduced notably by having notifiers avoid the lock entirely and adding a pointer to the event fd of the first idle \proc, as in Figure~\ref{fig:idle2}. 196 To avoid contention between the notifiers, instead of simply reading the atomic pointer, notifiers atomically exchange it to \texttt{null} so only one notifier will contend on the system call. 229 197 230 198 \begin{figure} 231 199 \centering 232 200 \input{idle_state.pstex_t} 233 \caption[Improved Idle -Sleep Latency]{Improved Idle-Sleep Latency \smallskip\newline A three state flag is added to the event \lstinline{fd}.} 201 \caption[Improved Idle Sleep Data Structure]{Improved Idle Sleep Data Structure \smallskip\newline An atomic pointer is added to the list, pointing to the Event FD of the first \proc on the list.} 234 202 \label{fig:idle:state} 235 203 \end{figure} 204 205 The next optimization that can be done is to avoid the latency of the event fd when possible. 206 This can be done by adding what is effectively a benaphore\cit{benaphore} in front of the event fd. 207 A simple three state flag is added beside the event fd to avoid unnecessary system calls, as shown in Figure~\ref{fig:idle:state}. 208 The flag starts in state \texttt{SEARCH}, while the \proc is searching for \ats to run. 209 The \proc then confirms the sleep by atomically swapping the state to \texttt{SLEEP}. 210 If the previous state was still \texttt{SEARCH}, then the \proc does read the event fd. 211 Meanwhile, notifiers atomically exchange the state to \texttt{AWAKE} state. 
212 If the previous state was \texttt{SLEEP}, then the notifier must write to the event fd. 213 However, if the notify arrives almost immediately after the \proc marks itself idle, then both reads and writes on the event fd can be omitted, which reduces latency notably. 214 This leads to the final data structure shown in Figure~\ref{fig:idle}. 236 215 237 216 \begin{figure} … … 239 218 \input{idle.pstex_t} 240 219 \caption[Low-latency Idle Sleep Data Structure]{Low-latency Idle Sleep Data Structure \smallskip\newline Each idle \proc is put onto a doubly-linked stack protected by a lock. 241 Each \proc has a private event \lstinline{fd} with a benaphore in front of it. 242 The list also has an atomic pointer to the event \lstinline{fd} and benaphore of the first \proc on the list.} 220 Each \proc has a private event FD with a benaphore in front of it. 221 The list also has an atomic pointer to the event fd and benaphore of the first \proc on the list.} 243 222 \label{fig:idle} 244 223 \end{figure} -
doc/theses/thierry_delisle_PhD/thesis/text/runtime.tex
rffec1bf r9e23b446 2 2 This chapter presents an overview of the capabilities of the \CFA runtime prior to this thesis work. 3 3 4 \section{C Threading} 5 6 \Celeven introduced threading features, such the @_Thread_local@ storage class, and libraries @stdatomic.h@ and @threads.h@. 7 Interestingly, almost a decade after the \Celeven standard, the most recent versions of gcc, clang, and msvc do not support the \Celeven include @threads.h@, indicating no interest in the C11 concurrency approach (possibly because of the recent effort to add concurrency to \CC). 8 While the \Celeven standard does not state a threading model, the historical association with pthreads suggests implementations would adopt kernel-level threading (1:1)~\cite{ThreadModel}, as for \CC. 9 This model uses \glspl{kthrd} to achieve parallelism and concurrency. In this model, every thread of computation maps to an object in the kernel. 10 The kernel then has the responsibility of managing these threads, \eg creating, scheduling, blocking. 11 A consequence of this approach is that the kernel has a perfect view of every thread executing in the system\footnote{This is not completely true due to primitives like \lstinline|futex|es, which have a significant portion of their logic in user space.}. 4 \Celeven introduced threading features, such the @_Thread_local@ storage class, and libraries @stdatomic.h@ and @threads.h@. Interestingly, almost a decade after the \Celeven standard, the most recent versions of gcc, clang, and msvc do not support the \Celeven include @threads.h@, indicating no interest in the C11 concurrency approach (possibly because of the recent effort to add concurrency to \CC). While the \Celeven standard does not state a threading model, the historical association with pthreads suggests implementations would adopt kernel-level threading (1:1)~\cite{ThreadModel}, as for \CC. This model uses \glspl{kthrd} to achieve parallelism and concurrency. 
In this model, every thread of computation maps to an object in the kernel. The kernel then has the responsibility of managing these threads, \eg creating, scheduling, blocking. This also entails that the kernel has a perfect view of every thread executing in the system\footnote{This is not completely true due to primitives like \lstinline|futex|es, which have a significant portion of their logic in user space.}. 12 5 13 6 \section{M:N Threading}\label{prev:model} … … 15 8 Threading in \CFA is based on \Gls{uthrding}, where \glspl{thrd} are the representation of a unit of work. As such, \CFA programmers should expect these units to be fairly inexpensive, \ie programmers should be able to create a large number of \glspl{thrd} and switch among \glspl{thrd} liberally without many concerns for performance. 16 9 17 The \CFA M:N threading models is implemented using many user-level threads mapped onto fewer \glspl{kthrd}. 18 The user-level threads have the same semantic meaning as a \glspl{kthrd} in the 1:1 model: they represent an independent thread of execution with its own stack. 19 The difference is that user-level threads do not have a corresponding object in the kernel; they are handled by the runtime in user space and scheduled onto \glspl{kthrd}, referred to as \glspl{proc} in this document. \Glspl{proc} run a \gls{thrd} until it context switches out, it then chooses a different \gls{thrd} to run. 10 The \CFA M:N threading models is implemented using many user-level threads mapped onto fewer \glspl{kthrd}. The user-level threads have the same semantic meaning as a \glspl{kthrd} in the 1:1 model: they represent an independent thread of execution with its own stack. The difference is that user-level threads do not have a corresponding object in the kernel, they are handled by the runtime in user space and scheduled onto \glspl{kthrd}, referred to as \glspl{proc} in this document. 
\Glspl{proc} run a \gls{thrd} until it context switches out, it then chooses a different \gls{thrd} to run. 20 11 21 12 \section{Clusters} 22 \CFA allows the option to group user-level threading, in the form of clusters. 23 Both \glspl{thrd} and \glspl{proc} belong to a specific cluster. 24 \Glspl{thrd} are only scheduled onto \glspl{proc} in the same cluster and scheduling is done independently of other clusters. 25 Figure~\ref{fig:system} shows an overview of the \CFA runtime, which allows programmers to tightly control parallelism. 26 It also opens the door to handling effects like NUMA, by pinning clusters to a specific NUMA node\footnote{This capability is not currently implemented in \CFA, but the only hurdle left is creating a generic interface for CPU masks.}. 13 \CFA allows the option to group user-level threading, in the form of clusters. Both \glspl{thrd} and \glspl{proc} belong to a specific cluster. \Glspl{thrd} are only scheduled onto \glspl{proc} in the same cluster and scheduling is done independently of other clusters. Figure~\ref{fig:system} shows an overview of the \CFA runtime, which allows programmers to tightly control parallelism. It also opens the door to handling effects like NUMA, by pining clusters to a specific NUMA node\footnote{This is not currently implemented in \CFA, but the only hurdle left is creating a generic interface for cpu masks.}. 27 14 28 15 \begin{figure} … … 30 17 \input{system.pstex_t} 31 18 \end{center} 32 \caption[Overview of the \CFA runtime]{Overview of the \CFA runtime \newline \Glspl{thrd} are scheduled inside a particular cluster and run on the \glspl{proc} that belong to the cluster. 
The discrete-event manager, which handles preemption and timeout, is a \gls{proc} thatlives outside any cluster and does not run \glspl{thrd}.}19 \caption[Overview of the \CFA runtime]{Overview of the \CFA runtime \newline \Glspl{thrd} are scheduled inside a particular cluster, where it only runs on the \glspl{proc} which belong to the cluster. The discrete-event manager, which handles preemption and timeout, is a \gls{kthrd} which lives outside any cluster and does not run \glspl{thrd}.} 33 20 \label{fig:system} 34 21 \end{figure} … … 41 28 42 29 \begin{quote} 43 Given a simple network program with 2 \glspl{thrd} and a single \gls{proc}, one \gls{thrd} sends network requests to a server and the other \gls{thrd} waits for a response from the server. 44 If the second \gls{thrd} races ahead, it may wait for responses to requests that have not been sent yet. 45 In theory, this should not be a problem, even if the second \gls{thrd} waits, because the first \gls{thrd} is still ready to run and should be able to get CPU time to send the request. 46 With M:N threading, while the first \gls{thrd} is ready, the lone \gls{proc} \emph{cannot} run the first \gls{thrd} if it is blocked in the \glsxtrshort{io} operation of the second \gls{thrd}. 47 If this happen, the system is in a synchronization deadlock\footnote{In this example, the deadlock could be resolved if the server sends unprompted messages to the client. 48 However, this solution is neither general nor appropriate even in this simple case.}. 30 Given a simple network program with 2 \glspl{thrd} and a single \gls{proc}, one \gls{thrd} sends network requests to a server and the other \gls{thrd} waits for a response from the server. If the second \gls{thrd} races ahead, it may wait for responses to requests that have not been sent yet. In theory, this should not be a problem, even if the second \gls{thrd} waits, because the first \gls{thrd} is still ready to run and should be able to get CPU time to send the request. 
With M:N threading, while the first \gls{thrd} is ready, the lone \gls{proc} \emph{cannot} run the first \gls{thrd} if it is blocked in the \glsxtrshort{io} operation of the second \gls{thrd}. If this happen, the system is in a synchronization deadlock\footnote{In this example, the deadlocked could be resolved if the server sends unprompted messages to the client. However, this solution is not general and may not be appropriate even in this simple case.}. 49 31 \end{quote} 50 32 51 Therefore, one of the objective of this work is to introduce \emph{User-Level \glsxtrshort{io}}, which like \glslink{uthrding}{User-Level \emph{Threading}}, blocks \glspl{thrd} rather than \glspl{proc} when doing \glsxtrshort{io} ope rations. 52 This feature entails multiplexing the \glsxtrshort{io} operations of many \glspl{thrd} onto fewer \glspl{proc}. 53 The multiplexing requires a single \gls{proc} to execute multiple \glsxtrshort{io} operations in parallel. 54 This requirement cannot be done with operations that block \glspl{proc}, \ie \glspl{kthrd}, since the first operation would prevent starting new operations for its blocking duration. 55 Executing \glsxtrshort{io} operations in parallel requires \emph{asynchronous} \glsxtrshort{io}, sometimes referred to as \emph{non-blocking}, since the \gls{kthrd} does not block. 33 Therefore, one of the objective of this work is to introduce \emph{User-Level \glsxtrshort{io}}, like \glslink{uthrding}{User-Level \emph{Threading}} blocks \glspl{thrd} rather than \glspl{proc} when doing \glsxtrshort{io} operations, which entails multiplexing the \glsxtrshort{io} operations of many \glspl{thrd} onto fewer \glspl{proc}. This multiplexing requires that a single \gls{proc} be able to execute multiple \glsxtrshort{io} operations in parallel. This requirement cannot be done with operations that block \glspl{proc}, \ie \glspl{kthrd}, since the first operation would prevent starting new operations for its blocking duration. 
Executing \glsxtrshort{io} operations in parallel requires \emph{asynchronous} \glsxtrshort{io}, sometimes referred to as \emph{non-blocking}, since the \gls{kthrd} does not block. 56 34 57 \section{Interoperating with C}35 \section{Interoperating with \texttt{C}} 58 36 While \glsxtrshort{io} operations are the classical example of operations that block \glspl{kthrd}, the non-blocking challenge extends to all blocking system-calls. The POSIX standard states~\cite[\S~2.9.1]{POSIX17}: 59 37 \begin{quote} 60 All functions defined by this volume of POSIX.1-2017 shall be thread-safe, except that the following functions need not be thread-safe. ... (list of 70+excluded functions)38 All functions defined by this volume of POSIX.1-2017 shall be thread-safe, except that the following functions1 need not be thread-safe. ... (list of 70+ potentially excluded functions) 61 39 \end{quote} 62 Only UNIX @man@ pages identify whether or not a library function is thread safe, and hence, may block on a pthread slock or system call; hence interoperability with UNIX library functions is a challenge for an M:N threading model.40 Only UNIX @man@ pages identify whether or not a library function is thread safe, and hence, may block on a pthread lock or system call; hence interoperability with UNIX library functions is a challenge for an M:N threading model. 63 41 64 42 Languages like Go and Java, which have strict interoperability with C\cit{JNI, GoLang with C}, can control operations in C by ``sandboxing'' them, \eg a blocking function may be delegated to a \gls{kthrd}. Sandboxing may help towards guaranteeing that the kind of deadlock mentioned above does not occur. … … 67 45 \begin{enumerate} 68 46 \item Precisely identifying blocking C calls is difficult. 69 \item Introducing safe-point code (see Go~page~\pageref{GoSafePoint})can have a significant impact on general performance.47 \item Introducing control points code can have a significant impact on general performance. 
70 48 \end{enumerate} 71 Because of these consequences, this work does not attempt to ``sandbox'' calls to C. 72 Therefore, it is possible calls to an unknown library function can block a \gls{kthrd} leading to deadlocks in \CFA's M:N threading model, which would not occur in a traditional 1:1 threading model. 73 Currently, all M:N thread systems interacting with UNIX without sandboxing suffer from this problem but manage to work very well in the majority of applications. 74 Therefore, a complete solution to this problem is outside the scope of this thesis.\footnote{\CFA does provide a pthreads emulation, so any library function using embedded pthreads locks are redirected to \CFA user-level locks. This capability further reduces the chances of blocking a \gls{kthrd}.} 49 Because of these consequences, this work does not attempt to ``sandbox'' calls to C. Therefore, it is possible calls from an unidentified library will block a \gls{kthrd} leading to deadlocks in \CFA's M:N threading model, which would not occur in a traditional 1:1 threading model. Currently, all M:N thread systems interacting with UNIX without sandboxing suffer from this problem but manage to work very well in the majority of applications. Therefore, a complete solution to this problem is outside the scope of this thesis. -
doc/theses/thierry_delisle_PhD/thesis/thesis.tex
rffec1bf r9e23b446 83 83 \usepackage{graphicx} % For including graphics 84 84 \usepackage{subcaption} 85 \usepackage{comment} % Removes large sections of the document.86 85 87 86 % Hyperlinks make it very easy to navigate an electronic document. … … 108 107 citecolor=OliveGreen, % color of links to bibliography 109 108 filecolor=magenta, % color of file links 110 urlcolor=blue, % color of external links 111 breaklinks=true 109 urlcolor=cyan % color of external links 112 110 } 113 111 \ifthenelse{\boolean{PrintVersion}}{ % for improved print quality, change some hyperref options -
libcfa/Makefile.am
rffec1bf r9e23b446 18 18 ACLOCAL_AMFLAGS = -I automake 19 19 SUBDIRS = prelude src # order important 20 21 DISTCLEANFILES = config.data -
libcfa/configure.ac
rffec1bf r9e23b446 181 181 AH_TEMPLATE([CFA_HAVE_SPLICE_F_FD_IN_FIXED],[Defined if io_uring support is present when compiling libcfathread and supports the flag SPLICE_F_FD_IN_FIXED.]) 182 182 AH_TEMPLATE([CFA_HAVE_IORING_SETUP_ATTACH_WQ],[Defined if io_uring support is present when compiling libcfathread and supports the flag IORING_SETUP_ATTACH_WQ.]) 183 AH_TEMPLATE([CFA_HAVE_IORING_REGISTER_IOWQ_MAX_WORKERS],[Defined if io_uring support is present when compiling libcfathread and supports the flag IORING_REGISTER_IOWQ_MAX_WORKERS.])184 183 AH_TEMPLATE([CFA_HAVE_PREADV2],[Defined if preadv2 support is present when compiling libcfathread.]) 185 184 AH_TEMPLATE([CFA_HAVE_PWRITEV2],[Defined if pwritev2 support is present when compiling libcfathread.]) … … 190 189 191 190 define(ioring_ops, [IORING_OP_NOP,IORING_OP_READV,IORING_OP_WRITEV,IORING_OP_FSYNC,IORING_OP_READ_FIXED,IORING_OP_WRITE_FIXED,IORING_OP_POLL_ADD,IORING_OP_POLL_REMOVE,IORING_OP_SYNC_FILE_RANGE,IORING_OP_SENDMSG,IORING_OP_RECVMSG,IORING_OP_TIMEOUT,IORING_OP_TIMEOUT_REMOVE,IORING_OP_ACCEPT,IORING_OP_ASYNC_CANCEL,IORING_OP_LINK_TIMEOUT,IORING_OP_CONNECT,IORING_OP_FALLOCATE,IORING_OP_OPENAT,IORING_OP_CLOSE,IORING_OP_FILES_UPDATE,IORING_OP_STATX,IORING_OP_READ,IORING_OP_WRITE,IORING_OP_FADVISE,IORING_OP_MADVISE,IORING_OP_SEND,IORING_OP_RECV,IORING_OP_OPENAT2,IORING_OP_EPOLL_CTL,IORING_OP_SPLICE,IORING_OP_PROVIDE_BUFFERS,IORING_OP_REMOVE_BUFFER,IORING_OP_TEE]) 192 define(ioring_flags, [IOSQE_FIXED_FILE,IOSQE_IO_DRAIN,IOSQE_IO_LINK,IOSQE_IO_HARDLINK,IOSQE_ASYNC,IOSQE_BUFFER_SELECT,SPLICE_F_FD_IN_FIXED,IORING_SETUP_ATTACH_WQ ,IORING_REGISTER_IOWQ_MAX_WORKERS])191 define(ioring_flags, [IOSQE_FIXED_FILE,IOSQE_IO_DRAIN,IOSQE_IO_LINK,IOSQE_IO_HARDLINK,IOSQE_ASYNC,IOSQE_BUFFER_SELECT,SPLICE_F_FD_IN_FIXED,IORING_SETUP_ATTACH_WQ]) 193 192 194 193 define(ioring_from_decls, [ -
libcfa/src/Makefile.am
rffec1bf r9e23b446 216 216 nobase_cfa_include_HEADERS = ${stdhdr} ${inst_headers_src} ${inst_headers_nosrc} ${inst_thread_headers_src} ${inst_thread_headers_nosrc} 217 217 EXTRA_DIST = stdhdr 218 DISTCLEANFILES = $(libdeps) $(thread_libdeps)219 218 220 219 #---------------------------------------------------------------------------------------------------------------- … … 222 221 -rm -rf ${CFA_INCDIR} ${CFA_LIBDIR} 223 222 224 #distclean-local:225 #find ${builddir} -path '*.Plo' -delete223 distclean-local: 224 find ${builddir} -path '*.Plo' -delete 226 225 227 226 -
libcfa/src/concurrency/kernel/fwd.hfa
rffec1bf r9e23b446 254 254 // intented to be use by wait, wait_any, waitfor, etc. rather than used directly 255 255 bool setup( future_t & this, oneshot & wait_ctx ) { 256 /* paranoid */ verify( wait_ctx.ptr == 0p || wait_ctx.ptr == 1p);256 /* paranoid */ verify( wait_ctx.ptr == 0p ); 257 257 // The future needs to set the wait context 258 258 for() { … … 274 274 // intented to be use by wait, wait_any, waitfor, etc. rather than used directly 275 275 bool retract( future_t & this, oneshot & wait_ctx ) { 276 struct oneshot * expected = this.ptr; 277 278 // attempt to remove the context so it doesn't get consumed. 279 if(__atomic_compare_exchange_n( &this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) { 276 for() { 277 struct oneshot * expected = this.ptr; 278 279 // expected == 0p: future was never actually setup, just return 280 if( expected == 0p ) return false; 281 282 // expected == 1p: the future is ready and the context was fully consumed 283 // the server won't use the pointer again 284 // It is safe to delete (which could happen after the return) 285 if( expected == 1p ) return true; 286 287 // expected == 2p: the future is ready but the context hasn't fully been consumed 288 // spin until it is safe to move on 289 if( expected == 2p ) { 290 while( this.ptr != 1p ) Pause(); 291 /* paranoid */ verify( this.ptr == 1p ); 292 return true; 293 } 294 295 // expected != wait_ctx: the future was setup with a different context ?!?! 
296 // something went wrong here, abort 297 if( expected != &wait_ctx ) abort("Future in unexpected state"); 298 280 299 // we still have the original context, then no one else saw it 281 return false; 282 } 283 284 // expected == 0p: future was never actually setup, just return 285 if( expected == 0p ) return false; 286 287 // expected == 1p: the future is ready and the context was fully consumed 288 // the server won't use the pointer again 289 // It is safe to delete (which could happen after the return) 290 if( expected == 1p ) return true; 291 292 // expected == 2p: the future is ready but the context hasn't fully been consumed 293 // spin until it is safe to move on 294 if( expected == 2p ) { 295 while( this.ptr != 1p ) Pause(); 296 /* paranoid */ verify( this.ptr == 1p ); 297 return true; 298 } 299 300 // anything else: the future was setup with a different context ?!?! 301 // something went wrong here, abort 302 abort("Future in unexpected state"); 300 // attempt to remove the context so it doesn't get consumed. 301 if(__atomic_compare_exchange_n( &this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) { 302 return false; 303 } 304 } 303 305 } 304 306 -
libcfa/src/concurrency/locks.cfa
rffec1bf r9e23b446 237 237 // This pthread_cond_var member is called from the kernel, and therefore, cannot block, but it can spin. 238 238 lock( cond->lock __cfaabi_dbg_ctx2 ); 239 239 240 // this check is necessary to avoid a race condition since this timeout handler 240 241 // may still be called after a thread has been removed from the queue but … … 346 347 size_t recursion_count = queue_and_get_recursion(this, &info); 347 348 alarm_node_wrap(L) node_wrap = { t, 0`s, callback, &this, &info }; 348 unlock( lock );349 350 // registers alarm outside cond lock to avoid deadlock351 349 register_self( &node_wrap.alarm_node ); 350 unlock( lock ); 352 351 353 352 // blocks here … … 438 437 if ( ret ) { 439 438 info_thread(L) & popped = try_pop_front( blocked_threads ); 440 popped.signalled = true;441 439 on_notify(*popped.lock, popped.t); 442 440 } … … 450 448 while( ! blocked_threads`isEmpty ) { 451 449 info_thread(L) & popped = try_pop_front( blocked_threads ); 452 popped.signalled = true;453 450 on_notify(*popped.lock, popped.t); 454 451 } … … 472 469 size_t recursion_count = queue_and_get_recursion(this, &info); 473 470 pthread_alarm_node_wrap(L) node_wrap = { t, 0`s, callback, &this, &info }; 474 unlock( lock );475 476 // registers alarm outside cond lock to avoid deadlock477 471 register_self( &node_wrap.alarm_node ); 472 unlock( lock ); 478 473 479 474 // blocks here … … 505 500 return i.signalled; 506 501 507 Duration getDuration(timespec t) {508 timespec currTime;509 clock_gettime(CLOCK_REALTIME, &currTime);510 Duration waitUntil = { t };511 Duration currDur = { currTime };512 if ( currDur >= waitUntil ) return currDur - waitUntil;513 Duration zero = { 0 };514 return zero;515 }516 517 502 bool wait( pthread_cond_var(L) & this, L & l, timespec t ) { 518 PTHREAD_WAIT_TIME( 0, &l , getDuration( t ) ) 503 Duration d = { t }; 504 WAIT_TIME( 0, &l , d ) 519 505 } 520 506 521 507 bool wait( pthread_cond_var(L) & this, L & l, uintptr_t info, timespec t ) { 522 
PTHREAD_WAIT_TIME( info, &l , getDuration( t ) ) 508 Duration d = { t }; 509 WAIT_TIME( info, &l , d ) 523 510 } 524 511 } -
libcfa/src/concurrency/locks.hfa
rffec1bf r9e23b446 478 478 #endif 479 479 lock( lock, node ); 480 while(__atomic_load_n(&held, __ATOMIC_SEQ_CST)) Pause(); 481 __atomic_store_n(&held, true, __ATOMIC_SEQ_CST); 480 while(held) Pause(); 481 held = true; 482 // printf("locked\n"); 482 483 unlock( lock, node ); 483 484 #ifdef __CFA_DEBUG__ … … 487 488 488 489 static inline void unlock(spin_queue_lock & this) with(this) { 490 // printf("unlocked\n"); 489 491 #ifdef __CFA_DEBUG__ 490 492 owner = 0p; 491 493 #endif 492 __atomic_store_n(&held, false, __ATOMIC_RELEASE);494 held = false; 493 495 } 494 496 … … 533 535 #endif 534 536 lock( lock, node ); 535 while( __atomic_load_n(&held, __ATOMIC_SEQ_CST)) Pause();536 __atomic_store_n(&held, true, __ATOMIC_SEQ_CST);537 while(held) Pause(); 538 held = true; 537 539 unlock( lock, node ); 538 540 #ifdef __CFA_DEBUG__ … … 545 547 owner = 0p; 546 548 #endif 547 __atomic_store_n(&held, false, __ATOMIC_SEQ_CST);549 held = false; 548 550 } 549 551 … … 586 588 #endif 587 589 lock( lock ); 588 while( __atomic_load_n(&held, __ATOMIC_SEQ_CST)) Pause();589 __atomic_store_n(&held, true, __ATOMIC_RELEASE);590 while(held) Pause(); 591 held = true; 590 592 unlock( lock ); 591 593 #ifdef __CFA_DEBUG__ … … 598 600 owner = 0p; 599 601 #endif 600 __atomic_store_n(&held, false, __ATOMIC_RELEASE);602 held = false; 601 603 } 602 604 -
libcfa/src/concurrency/ready_subqueue.hfa
rffec1bf r9e23b446 49 49 // Get the relevant nodes locally 50 50 this.prev->link.next = node; 51 __atomic_store_n(&this.prev->link.ts, rdtscl(), __ATOMIC_RELAXED);51 this.prev->link.ts = rdtscl(); 52 52 this.prev = node; 53 53 #if !defined(__CFA_NO_STATISTICS__) -
libcfa/src/heap.cfa
rffec1bf r9e23b446 509 509 checkHeader( header < (Heap.Storage.Header *)heapBegin || (Heap.Storage.Header *)heapEnd < header, name, addr ); // bad address ? (offset could be + or -) 510 510 511 Heap * homeManager; 511 512 if ( unlikely( freeHead == 0p || // freed and only free-list node => null link 512 513 // freed and link points at another free block not to a bucket in the bucket array. -
src/AST/Convert.cpp
rffec1bf r9e23b446 168 168 auto attr = get<Attribute>().acceptL( node->attributes ); 169 169 170 // This field can be unset very early on (Pre-FixReturnTypes).171 auto newType = (type) ? type->clone() : nullptr;172 173 170 auto decl = new ObjectDecl( 174 171 node->name, … … 176 173 LinkageSpec::Spec( node->linkage.val ), 177 174 bfwd, 178 newType,175 type->clone(), 179 176 nullptr, // prevent infinite loop 180 177 attr, … … 1582 1579 1583 1580 virtual void visit( const ObjectDecl * old ) override final { 1584 if ( inCache( old ) ) {1585 return;1586 }1587 1581 auto&& type = GET_ACCEPT_1(type, Type); 1588 1582 auto&& init = GET_ACCEPT_1(init, Init); 1589 1583 auto&& bfwd = GET_ACCEPT_1(bitfieldWidth, Expr); 1590 1584 auto&& attr = GET_ACCEPT_V(attributes, Attribute); 1591 1585 if ( inCache( old ) ) { 1586 return; 1587 } 1592 1588 auto decl = new ast::ObjectDecl( 1593 1589 old->location, -
src/AST/Decl.hpp
rffec1bf r9e23b446 315 315 316 316 EnumDecl( const CodeLocation& loc, const std::string& name, 317 std::vector<ptr<Attribute>>&& attrs = {}, Linkage::Spec linkage = Linkage::Cforall, Type const* base = nullptr,318 std::unordered_map< std::string, long long > enumValues = std::unordered_map< std::string, long long >() )317 std::vector<ptr<Attribute>>&& attrs = {}, Linkage::Spec linkage = Linkage::Cforall, Type * base = nullptr, 318 std::unordered_map< std::string, long long > enumValues = std::unordered_map< std::string, long long >() ) 319 319 : AggregateDecl( loc, name, std::move(attrs), linkage ), base(base), enumValues(enumValues) {} 320 320 -
src/AST/Expr.cpp
rffec1bf r9e23b446 272 272 // Adjust the length of the string for the terminator. 273 273 const Expr * strSize = from_ulong( loc, str.size() + 1 ); 274 const Type * strType = new ArrayType( charType, strSize, FixedLen, DynamicDim );274 const Type * strType = new ArrayType( charType, strSize, FixedLen, StaticDim ); 275 275 const std::string strValue = "\"" + str + "\""; 276 276 return new ConstantExpr( loc, strType, strValue, std::nullopt ); -
src/AST/Pass.impl.hpp
rffec1bf r9e23b446 681 681 if ( __visit_children() ) { 682 682 // unlike structs, traits, and unions, enums inject their members into the global scope 683 maybe_accept( node, &EnumDecl::base );684 683 maybe_accept( node, &EnumDecl::params ); 685 684 maybe_accept( node, &EnumDecl::members ); -
src/AST/module.mk
rffec1bf r9e23b446 37 37 AST/Init.cpp \ 38 38 AST/Init.hpp \ 39 AST/Inspect.cpp \40 AST/Inspect.hpp \41 39 AST/Label.hpp \ 42 40 AST/LinkageSpec.cpp \ -
src/CodeGen/CodeGenerator.cc
rffec1bf r9e23b446 9 9 // Author : Richard C. Bilson 10 10 // Created On : Mon May 18 07:44:20 2015 11 // Last Modified By : Andrew Beach12 // Last Modified On : Wed Jun 29 14:34:00 202213 // Update Count : 54 211 // Last Modified By : Peter A. Buhr 12 // Last Modified On : Wed Feb 2 20:30:30 2022 13 // Update Count : 541 14 14 // 15 15 #include "CodeGenerator.h" … … 18 18 #include <list> // for _List_iterator, list, list<>::it... 19 19 20 #include "AST/Decl.hpp" // for DeclWithType21 20 #include "Common/UniqueName.h" // for UniqueName 22 21 #include "Common/utility.h" // for CodeLocation, toString … … 295 294 } else { 296 295 if ( obj->get_init() ) { 297 obj->get_init()->accept( *visitor ); 296 obj->get_init()->accept( *visitor ); 298 297 } else { 299 298 // Should not reach here! … … 684 683 extension( variableExpr ); 685 684 const OperatorInfo * opInfo; 686 if( dynamic_cast<ZeroType*>( variableExpr->get_var()->get_type() ) ) { 687 output << "0"; 688 } else if ( variableExpr->get_var()->get_linkage() == LinkageSpec::Intrinsic && (opInfo = operatorLookup( variableExpr->get_var()->get_name() )) && opInfo->type == OT_CONSTANT ) { 685 if ( variableExpr->get_var()->get_linkage() == LinkageSpec::Intrinsic && (opInfo = operatorLookup( variableExpr->get_var()->get_name() )) && opInfo->type == OT_CONSTANT ) { 689 686 output << opInfo->symbol; 690 687 } else { 691 // if (dynamic_cast<EnumInstType *>(variableExpr->get_var()->get_type()) 688 // if (dynamic_cast<EnumInstType *>(variableExpr->get_var()->get_type()) 692 689 // && dynamic_cast<EnumInstType *>(variableExpr->get_var()->get_type())->baseEnum->base) { 693 690 // output << '(' <<genType(dynamic_cast<EnumInstType *>(variableExpr->get_var()->get_type())->baseEnum->base, "", options) << ')'; … … 1239 1236 } // if 1240 1237 } 1241 1242 std::string genName( ast::DeclWithType const * decl ) {1243 if ( const OperatorInfo * opInfo = operatorLookup( decl->name ) ) {1244 return opInfo->outputName;1245 } else {1246 return 
decl->name;1247 }1248 }1249 1250 1238 } // namespace CodeGen 1251 1239 -
src/CodeGen/CodeGenerator.h
rffec1bf r9e23b446 9 9 // Author : Richard C. Bilson 10 10 // Created On : Mon May 18 07:44:20 2015 11 // Last Modified By : Andrew Beach12 // Last Modified On : Wed Jun 29 14:32:00202213 // Update Count : 6 511 // Last Modified By : Peter A. Buhr 12 // Last Modified On : Tue Feb 1 09:23:21 2022 13 // Update Count : 64 14 14 // 15 15 … … 26 26 #include "SynTree/Visitor.h" // for Visitor 27 27 #include "SynTree/SynTree.h" // for Visitor Nodes 28 29 namespace ast {30 class DeclWithType;31 }32 28 33 29 namespace CodeGen { … … 186 182 /// returns C-compatible name of declaration 187 183 std::string genName( DeclarationWithType * decl ); 188 std::string genName( ast::DeclWithType const * decl );189 184 190 185 inline std::ostream & operator<<( std::ostream & os, const CodeGenerator::LineEnder & endl ) { -
src/CodeGen/FixNames.cc
rffec1bf r9e23b446 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // FixNames.cc -- Adjustments to typed declarations.7 // FixNames.cc -- 8 8 // 9 9 // Author : Richard C. Bilson 10 10 // Created On : Mon May 18 07:44:20 2015 11 11 // Last Modified By : Andrew Beach 12 // Last Modified On : Wed Jul 20 11:49:00 202213 // Update Count : 2 412 // Last Modified On : Fri Oct 29 15:49:00 2021 13 // Update Count : 23 14 14 // 15 15 … … 87 87 88 88 /// Does work with the main function and scopeLevels. 89 class FixNames_new final{89 class FixNames_new : public ast::WithGuards { 90 90 int scopeLevel = 1; 91 91 … … 103 103 104 104 const ast::FunctionDecl *postvisit( const ast::FunctionDecl *functionDecl ) { 105 // This store is used to ensure a maximum of one call to mutate. 106 ast::FunctionDecl * mutDecl = nullptr; 107 108 if ( shouldSetScopeLevel( functionDecl ) ) { 109 mutDecl = ast::mutate( functionDecl ); 110 mutDecl->scopeLevel = scopeLevel; 111 } 112 105 113 if ( FixMain::isMain( functionDecl ) ) { 106 auto mutDecl = ast::mutate( functionDecl ); 107 108 if ( shouldSetScopeLevel( mutDecl ) ) { 109 mutDecl->scopeLevel = scopeLevel; 110 } 114 if ( !mutDecl ) { mutDecl = ast::mutate( functionDecl ); } 111 115 112 116 int nargs = mutDecl->params.size(); … … 120 124 ) 121 125 ); 122 123 return mutDecl;124 } else if ( shouldSetScopeLevel( functionDecl ) ) {125 return ast::mutate_field( functionDecl, &ast::FunctionDecl::scopeLevel, scopeLevel );126 } else {127 return functionDecl;128 126 } 127 return mutDecl ? mutDecl : functionDecl; 129 128 } 130 129 131 130 void previsit( const ast::CompoundStmt * ) { 132 scopeLevel += 1; 133 } 134 135 void postvisit( const ast::CompoundStmt * ) { 136 scopeLevel -= 1; 131 GuardValue( scopeLevel ) += 1; 137 132 } 138 133 }; -
src/CodeGen/FixNames.h
rffec1bf r9e23b446 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // FixNames.h -- Adjustments to typed declarations.7 // FixNames.h -- 8 8 // 9 9 // Author : Richard C. Bilson … … 26 26 /// mangles object and function names 27 27 void fixNames( std::list< Declaration* > & translationUnit ); 28 /// Sets scope levels and fills in main's default return. 29 void fixNames( ast::TranslationUnit & translationUnit ); 28 void fixNames( ast::TranslationUnit & translationUnit ); 30 29 } // namespace CodeGen 31 30 -
src/CodeGen/GenType.cc
rffec1bf r9e23b446 254 254 255 255 void GenType::postvisit( EnumInstType * enumInst ) { 256 if ( enumInst->baseEnum && enumInst->baseEnum->base ) {256 if ( enumInst->baseEnum->base ) { 257 257 typeString = genType(enumInst->baseEnum->base, "", options) + typeString; 258 258 } else { -
src/Common/Eval.cc
rffec1bf r9e23b446 10 10 // Created On : Mon May 18 07:44:20 2015 11 11 // Last Modified By : Peter A. Buhr 12 // Last Modified On : Fri Jul 1 08:41:03 202213 // Update Count : 11712 // Last Modified On : Wed Jul 24 15:09:06 2019 13 // Update Count : 64 14 14 // 15 15 … … 17 17 18 18 #include "Common/PassVisitor.h" 19 #include "CodeGen/OperatorTable.h" // access: OperatorInfo20 19 #include "AST/Pass.hpp" 21 20 #include "InitTweak/InitTweak.h" … … 25 24 // Old AST 26 25 struct EvalOld : public WithShortCircuiting { 27 long long int value = 0; // compose the result of the constant expression 28 bool valid = true; // true => constant expression and value is the result 29 // false => not constant expression, e.g., ++i 30 bool cfavalid = true; // true => constant expression and value computable 31 // false => constant expression but value not computable, e.g., sizeof(int) 26 long long int value = 0; 27 bool valid = true; 32 28 33 29 void previsit( const BaseSyntaxNode * ) { visit_children = false; } … … 93 89 // New AST 94 90 struct EvalNew : public ast::WithShortCircuiting { 95 long long int value = 0; // compose the result of the constant expression 96 bool valid = true; // true => constant expression and value is the result 97 // false => not constant expression, e.g., ++i 98 bool cfavalid = true; // true => constant expression and value computable 99 // false => constant expression but value not computable, e.g., sizeof(int) 91 long long int value = 0; 92 bool valid = true; 100 93 101 94 void previsit( const ast::Node * ) { visit_children = false; } 102 void postvisit( const ast::Node * ) { cfavalid =valid = false; }95 void postvisit( const ast::Node * ) { valid = false; } 103 96 104 void postvisit( const ast::UntypedExpr * ) { 105 assertf( false, "UntypedExpr in constant expression evaluation" ); // FIX ME, resolve variable 106 } 107 108 void postvisit( const ast::ConstantExpr * expr ) { // only handle int constants 97 void postvisit( const ast::ConstantExpr * expr 
) { 109 98 value = expr->intValue(); 110 99 } 111 100 112 void postvisit( const ast::SizeofExpr * ) { 113 // do not change valid or value => let C figure it out 114 cfavalid = false; 101 void postvisit( const ast::SizeofExpr * expr ) { 102 if ( expr->expr ) value = eval(expr->expr).first; 103 else if ( expr->type ) value = eval(expr->expr).first; 104 else SemanticError( expr->location, ::toString( "Internal error: SizeofExpr has no expression or type value" ) ); 115 105 } 116 106 117 void postvisit( const ast::AlignofExpr * ) { 118 // do not change valid or value => let C figure it out 119 cfavalid = false; 120 } 121 122 void postvisit( const ast::OffsetofExpr * ) { 123 // do not change valid or value => let C figure it out 124 cfavalid = false; 125 } 126 127 void postvisit( const ast::LogicalExpr * expr ) { 128 std::pair<long long int, bool> arg1, arg2; 129 arg1 = eval( expr->arg1 ); 130 valid &= arg1.second; 131 if ( ! valid ) return; 132 arg2 = eval( expr->arg2 ); 133 valid &= arg2.second; 134 if ( ! valid ) return; 135 136 if ( expr->isAnd ) { 137 value = arg1.first && arg2.first; 138 } else { 139 value = arg1.first || arg2.first; 140 } // if 141 } 142 143 void postvisit( const ast::ConditionalExpr * expr ) { 144 std::pair<long long int, bool> arg1, arg2, arg3; 145 arg1 = eval( expr->arg1 ); 146 valid &= arg1.second; 147 if ( ! valid ) return; 148 arg2 = eval( expr->arg2 ); 149 valid &= arg2.second; 150 if ( ! valid ) return; 151 arg3 = eval( expr->arg3 ); 152 valid &= arg3.second; 153 if ( ! valid ) return; 154 155 value = arg1.first ? arg2.first : arg3.first; 156 } 157 158 void postvisit( const ast::CastExpr * expr ) { 159 // cfa-cc generates a cast before every constant and many other places, e.g., (int)3, so the cast argument must 160 // be evaluated to get the constant value. 
107 void postvisit( const ast::CastExpr * expr ) { 161 108 auto arg = eval(expr->arg); 162 109 valid = arg.second; 163 110 value = arg.first; 164 cfavalid = false;111 // TODO: perform type conversion on value if valid 165 112 } 166 113 167 void postvisit( const ast::VariableExpr * expr ) { 114 void postvisit( const ast::VariableExpr * expr ) { // No hit 168 115 if ( const ast::EnumInstType * inst = dynamic_cast<const ast::EnumInstType *>(expr->result.get()) ) { 169 116 if ( const ast::EnumDecl * decl = inst->base ) { … … 181 128 const std::string & fname = function->name; 182 129 assertf( expr->args.size() == 1 || expr->args.size() == 2, "Intrinsic function with %zd arguments: %s", expr->args.size(), fname.c_str() ); 183 184 if ( expr->args.size() == 1 ) { 185 // pre/postfix operators ++ and -- => assignment, which is not constant 186 std::pair<long long int, bool> arg1; 187 arg1 = eval(expr->args.front()); 188 valid &= arg1.second; 130 std::pair<long long int, bool> arg1, arg2; 131 arg1 = eval(expr->args.front()); 132 valid = valid && arg1.second; 133 if ( ! valid ) return; 134 if ( expr->args.size() == 2 ) { 135 arg2 = eval(expr->args.back()); 136 valid = valid && arg2.second; 189 137 if ( ! valid ) return; 190 191 if (fname == "+?") { 192 value = arg1.first; 193 } else if (fname == "-?") { 194 value = -arg1.first; 195 } else if (fname == "~?") { 196 value = ~arg1.first; 197 } else if (fname == "!?") { 198 value = ! arg1.first; 199 } else { 200 valid = false; 201 } // if 202 } else { // => expr->args.size() == 2 203 // infix assignment operators => assignment, which is not constant 204 std::pair<long long int, bool> arg1, arg2; 205 arg1 = eval(expr->args.front()); 206 valid &= arg1.second; 207 if ( ! valid ) return; 208 arg2 = eval(expr->args.back()); 209 valid &= arg2.second; 210 if ( ! 
valid ) return; 211 212 if (fname == "?+?") { 213 value = arg1.first + arg2.first; 214 } else if (fname == "?-?") { 215 value = arg1.first - arg2.first; 216 } else if (fname == "?*?") { 217 value = arg1.first * arg2.first; 218 } else if (fname == "?/?") { 219 value = arg1.first / arg2.first; 220 } else if (fname == "?%?") { 221 value = arg1.first % arg2.first; 222 } else if (fname == "?<<?") { 223 value = arg1.first << arg2.first; 224 } else if (fname == "?>>?") { 225 value = arg1.first >> arg2.first; 226 } else if (fname == "?<?") { 227 value = arg1.first < arg2.first; 228 } else if (fname == "?>?") { 229 value = arg1.first > arg2.first; 230 } else if (fname == "?<=?") { 231 value = arg1.first <= arg2.first; 232 } else if (fname == "?>=?") { 233 value = arg1.first >= arg2.first; 234 } else if (fname == "?==?") { 235 value = arg1.first == arg2.first; 236 } else if (fname == "?!=?") { 237 value = arg1.first != arg2.first; 238 } else if (fname == "?&?") { 239 value = arg1.first & arg2.first; 240 } else if (fname == "?^?") { 241 value = arg1.first ^ arg2.first; 242 } else if (fname == "?|?") { 243 value = arg1.first | arg2.first; 244 } else { 245 valid = false; 246 } 247 } // if 138 } 139 if (fname == "?+?") { 140 value = arg1.first + arg2.first; 141 } else if (fname == "?-?") { 142 value = arg1.first - arg2.first; 143 } else if (fname == "?*?") { 144 value = arg1.first * arg2.first; 145 } else if (fname == "?/?") { 146 value = arg1.first / arg2.first; 147 } else if (fname == "?%?") { 148 value = arg1.first % arg2.first; 149 } else { 150 valid = false; 151 } 248 152 // TODO: implement other intrinsic functions 249 153 } 250 154 }; 251 155 252 std::pair<long long int, bool> eval( const Expression * expr ) {156 std::pair<long long int, bool> eval( const Expression * expr) { 253 157 PassVisitor<EvalOld> ev; 254 if ( expr) {255 expr->accept( ev);256 return std::make_pair( ev.pass.value, ev.pass.valid);158 if (expr) { 159 expr->accept(ev); 160 return 
std::make_pair(ev.pass.value, ev.pass.valid); 257 161 } else { 258 return std::make_pair( 0, false);162 return std::make_pair(0, false); 259 163 } 260 164 } 261 165 262 std::pair<long long int, bool> eval( const ast::Expr * expr) {166 std::pair<long long int, bool> eval(const ast::Expr * expr) { 263 167 ast::Pass<EvalNew> ev; 264 if ( expr) {265 expr->accept( ev);266 return std::make_pair( ev.core.value, ev.core.valid);168 if (expr) { 169 expr->accept(ev); 170 return std::make_pair(ev.core.value, ev.core.valid); 267 171 } else { 268 return std::make_pair( 0, false);172 return std::make_pair(0, false); 269 173 } 270 174 } -
src/Common/ResolvProtoDump.cpp
rffec1bf r9e23b446 227 227 } 228 228 229 void previsit( const ast::EnumInstType * ) {229 void previsit( const ast::EnumInstType * enumInst) { 230 230 // TODO: Add the meaningful text representation of typed enum 231 231 ss << (int)ast::BasicType::SignedInt; -
src/Concurrency/Keywords.h
rffec1bf r9e23b446 28 28 void implementThreadStarter( std::list< Declaration * > & translationUnit ); 29 29 30 /// Implement the sue-like keywords and the suspend keyword. Pre-Autogen30 /// Implement the sue-like keywords and the suspend keyword. 31 31 void implementKeywords( ast::TranslationUnit & translationUnit ); 32 /// Implement the mutex parameters and mutex statement. Post-Autogen32 /// Implement the mutex parameters and mutex statement. 33 33 void implementMutex( ast::TranslationUnit & translationUnit ); 34 /// Add the thread starter code to constructors. Post-Autogen34 /// Add the thread starter code to constructors. 35 35 void implementThreadStarter( ast::TranslationUnit & translationUnit ); 36 36 }; -
src/ControlStruct/ExceptDecl.cc
rffec1bf r9e23b446 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // ExceptDecl.cc -- Handles declarations of exception types.7 // ExceptDecl.cc -- 8 8 // 9 9 // Author : Henry Xue -
src/ControlStruct/ExceptDecl.h
rffec1bf r9e23b446 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // ExceptDecl.h -- Handles declarations of exception types.7 // ExceptDecl.h -- 8 8 // 9 9 // Author : Henry Xue 10 10 // Created On : Tue Jul 20 04:10:50 2021 11 // Last Modified By : Andrew Beach12 // Last Modified On : Tue Jul 12 15:49:00 202213 // Update Count : 211 // Last Modified By : Henry Xue 12 // Last Modified On : Tue Jul 20 04:10:50 2021 13 // Update Count : 1 14 14 // 15 15 … … 20 20 class Declaration; 21 21 22 namespace ast {23 class TranslationUnit;22 namespace ControlStruct { 23 void translateExcept( std::list< Declaration *> & translationUnit ); 24 24 } 25 26 namespace ControlStruct {27 /// Unfold exception declarations into raw structure declarations.28 /// Also builds vtable declarations and converts vtable types.29 void translateExcept( std::list< Declaration *> & translationUnit );30 void translateExcept( ast::TranslationUnit & translationUnit );31 } -
src/ControlStruct/HoistControlDecls.hpp
rffec1bf r9e23b446 21 21 22 22 namespace ControlStruct { 23 /// Hoist declarations out of control flow statements into compound statement. 24 /// Must happen before auto-gen routines are added. 23 // Hoist declarations out of control flow statements into compound statement. 25 24 void hoistControlDecls( ast::TranslationUnit & translationUnit ); 26 25 } // namespace ControlStruct -
src/ControlStruct/MultiLevelExit.cpp
rffec1bf r9e23b446 149 149 }; 150 150 151 NullStmt * labelledNullStmt( const CodeLocation & cl, const Label & label ) { 151 NullStmt * labelledNullStmt( 152 const CodeLocation & cl, const Label & label ) { 152 153 return new NullStmt( cl, vector<Label>{ label } ); 153 154 } … … 163 164 164 165 const CompoundStmt * MultiLevelExitCore::previsit( 165 const CompoundStmt * stmt ) {166 const CompoundStmt * stmt ) { 166 167 visit_children = false; 167 168 … … 188 189 } 189 190 190 size_t getUnusedIndex( const Stmt * stmt, const Label & originalTarget ) { 191 size_t getUnusedIndex( 192 const Stmt * stmt, const Label & originalTarget ) { 191 193 const size_t size = stmt->labels.size(); 192 194 … … 208 210 } 209 211 210 const Stmt * addUnused( const Stmt * stmt, const Label & originalTarget ) { 212 const Stmt * addUnused( 213 const Stmt * stmt, const Label & originalTarget ) { 211 214 size_t i = getUnusedIndex( stmt, originalTarget ); 212 215 if ( i == stmt->labels.size() ) { … … 353 356 354 357 // Mimic what the built-in push_front would do anyways. It is O(n). 
355 void push_front( vector<ptr<Stmt>> & vec, const Stmt * element ) { 358 void push_front( 359 vector<ptr<Stmt>> & vec, const Stmt * element ) { 356 360 vec.emplace_back( nullptr ); 357 361 for ( size_t i = vec.size() - 1 ; 0 < i ; --i ) { … … 586 590 587 591 ptr<Stmt> else_stmt = nullptr; 588 constStmt * loop_kid = nullptr;592 Stmt * loop_kid = nullptr; 589 593 // check if loop node and if so add else clause if it exists 590 const WhileDoStmt * whilePtr = kid.as<WhileDoStmt>();591 if ( whilePtr && whilePtr->else_ ) {594 const WhileDoStmt * whilePtr = dynamic_cast<const WhileDoStmt *>(kid.get()); 595 if ( whilePtr && whilePtr->else_) { 592 596 else_stmt = whilePtr->else_; 593 loop_kid = mutate_field( whilePtr, &WhileDoStmt::else_, nullptr ); 594 } 595 const ForStmt * forPtr = kid.as<ForStmt>(); 596 if ( forPtr && forPtr->else_ ) { 597 WhileDoStmt * mutate_ptr = mutate(whilePtr); 598 mutate_ptr->else_ = nullptr; 599 loop_kid = mutate_ptr; 600 } 601 const ForStmt * forPtr = dynamic_cast<const ForStmt *>(kid.get()); 602 if ( forPtr && forPtr->else_) { 597 603 else_stmt = forPtr->else_; 598 loop_kid = mutate_field( forPtr, &ForStmt::else_, nullptr ); 604 ForStmt * mutate_ptr = mutate(forPtr); 605 mutate_ptr->else_ = nullptr; 606 loop_kid = mutate_ptr; 599 607 } 600 608 -
src/ControlStruct/module.mk
rffec1bf r9e23b446 17 17 SRC += \ 18 18 ControlStruct/ExceptDecl.cc \ 19 ControlStruct/ExceptDeclNew.cpp \20 19 ControlStruct/ExceptDecl.h \ 21 20 ControlStruct/ExceptTranslateNew.cpp \ -
src/GenPoly/Box.cc
rffec1bf r9e23b446 189 189 /// Enters a new scope for type-variables, adding the type variables from ty 190 190 void beginTypeScope( Type *ty ); 191 /// Exits the type-variable scope 192 void endTypeScope(); 191 193 /// Enters a new scope for knowLayouts and knownOffsets and queues exit calls 192 194 void beginGenericScope(); … … 196 198 UniqueName bufNamer; ///< Namer for VLA buffers 197 199 Expression * addrMember = nullptr; ///< AddressExpr argument is MemberExpr? 198 bool expect_func_type = false; ///< used to avoid recursing too deep in type decls199 200 }; 200 201 … … 1276 1277 FunctionType * ftype = functionDecl->type; 1277 1278 if ( ! ftype->returnVals.empty() && functionDecl->statements ) { 1278 // intrinsic functions won't be using the _retval so no need to generate it. 1279 if ( functionDecl->linkage != LinkageSpec::Intrinsic && !isPrefix( functionDecl->name, "_thunk" ) && ! isPrefix( functionDecl->name, "_adapter" ) ) { // xxx - remove check for prefix once thunks properly use ctor/dtors 1279 if ( ! isPrefix( functionDecl->name, "_thunk" ) && ! 
isPrefix( functionDecl->name, "_adapter" ) ) { // xxx - remove check for prefix once thunks properly use ctor/dtors 1280 1280 assert( ftype->returnVals.size() == 1 ); 1281 1281 DeclarationWithType * retval = ftype->returnVals.front(); … … 1418 1418 void PolyGenericCalculator::beginGenericScope() { 1419 1419 GuardScope( *this ); 1420 // We expect the first function type see to be the type relating to this scope1421 // but any further type is probably some unrelated function pointer1422 // keep track of which is the first1423 GuardValue( expect_func_type );1424 expect_func_type = true;1425 1420 } 1426 1421 … … 1472 1467 void PolyGenericCalculator::premutate( FunctionType *funcType ) { 1473 1468 beginTypeScope( funcType ); 1474 1475 GuardValue( expect_func_type );1476 1477 if(!expect_func_type) {1478 GuardAction( [this]() {1479 knownLayouts.endScope();1480 knownOffsets.endScope();1481 });1482 // If this is the first function type we see1483 // Then it's the type of the declaration and we care about it1484 knownLayouts.beginScope();1485 knownOffsets.beginScope();1486 }1487 1488 // The other functions type we will see in this scope are probably functions parameters1489 // they don't help us with the layout and offsets so don't mark them as known in this scope1490 expect_func_type = false;1491 1469 1492 1470 // make sure that any type information passed into the function is accounted for … … 1767 1745 } 1768 1746 1769 // std::cout << "TRUE 2" << std::endl;1770 1771 1747 return true; 1772 1748 } else if ( UnionInstType *unionTy = dynamic_cast< UnionInstType* >( ty ) ) { -
src/GenPoly/GenPoly.cc
rffec1bf r9e23b446 64 64 } 65 65 66 __attribute__((unu sed))66 __attribute__((ununsed)) 67 67 bool hasPolyParams( const std::vector<ast::ptr<ast::Expr>> & params, const TyVarMap & tyVars, const ast::TypeSubstitution * env) { 68 68 for (auto ¶m : params) { -
src/InitTweak/GenInit.cc
rffec1bf r9e23b446 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // GenInit.cc -- Generate initializers, and other stuff.7 // GenInit.cc -- 8 8 // 9 9 // Author : Rob Schluntz … … 642 642 643 643 ast::ConstructorInit * genCtorInit( const CodeLocation & loc, const ast::ObjectDecl * objDecl ) { 644 // call into genImplicitCall from Autogen.h to generate calls to ctor/dtor for each 644 // call into genImplicitCall from Autogen.h to generate calls to ctor/dtor for each 645 645 // constructable object 646 646 InitExpander_new srcParam{ objDecl->init }, nullParam{ (const ast::Init *)nullptr }; 647 647 ast::ptr< ast::Expr > dstParam = new ast::VariableExpr(loc, objDecl); 648 649 ast::ptr< ast::Stmt > ctor = SymTab::genImplicitCall( 648 649 ast::ptr< ast::Stmt > ctor = SymTab::genImplicitCall( 650 650 srcParam, dstParam, loc, "?{}", objDecl ); 651 ast::ptr< ast::Stmt > dtor = SymTab::genImplicitCall( 652 nullParam, dstParam, loc, "^?{}", objDecl, 651 ast::ptr< ast::Stmt > dtor = SymTab::genImplicitCall( 652 nullParam, dstParam, loc, "^?{}", objDecl, 653 653 SymTab::LoopBackward ); 654 654 655 655 // check that either both ctor and dtor are present, or neither 656 656 assert( (bool)ctor == (bool)dtor ); 657 657 658 658 if ( ctor ) { 659 // need to remember init expression, in case no ctors exist. If ctor does exist, want to 659 // need to remember init expression, in case no ctors exist. If ctor does exist, want to 660 660 // use ctor expression instead of init. 661 ctor.strict_as< ast::ImplicitCtorDtorStmt >(); 661 ctor.strict_as< ast::ImplicitCtorDtorStmt >(); 662 662 dtor.strict_as< ast::ImplicitCtorDtorStmt >(); 663 663 -
src/InitTweak/GenInit.h
rffec1bf r9e23b446 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // GenInit.h -- Generate initializers, and other stuff.7 // GenInit.h -- 8 8 // 9 9 // Author : Rodolfo G. Esteves … … 29 29 void genInit( ast::TranslationUnit & translationUnit ); 30 30 31 /// Converts return statements into copy constructor calls on the hidden return variable. 32 /// This pass must happen before auto-gen. 31 /// Converts return statements into copy constructor calls on the hidden return variable 33 32 void fixReturnStatements( std::list< Declaration * > & translationUnit ); 34 33 void fixReturnStatements( ast::TranslationUnit & translationUnit ); -
src/Parser/lex.ll
rffec1bf r9e23b446 82 82 // Stop warning due to incorrectly generated flex code. 83 83 #pragma GCC diagnostic ignored "-Wsign-compare" 84 85 // lex uses __null in a boolean context, it's fine.86 #pragma GCC diagnostic ignored "-Wnull-conversion"87 84 %} 88 85 -
src/Parser/parser.yy
rffec1bf r9e23b446 10 10 // Created On : Sat Sep 1 20:22:55 2001 11 11 // Last Modified By : Peter A. Buhr 12 // Last Modified On : Fri Jul 1 15:35:08202213 // Update Count : 540 512 // Last Modified On : Sat May 14 09:16:22 2022 13 // Update Count : 5401 14 14 // 15 15 … … 56 56 57 57 #include "SynTree/Attribute.h" // for Attribute 58 59 // lex uses __null in a boolean context, it's fine.60 #pragma GCC diagnostic ignored "-Wparentheses-equality"61 58 62 59 extern DeclarationNode * parseTree; … … 1243 1240 { 1244 1241 $$ = new StatementNode( build_while( new CondCtl( nullptr, new ExpressionNode( build_constantInteger( *new string( "1" ) ) ) ), maybe_build_compound( $4 ) ) ); 1245 SemanticWarning( yylloc, Warning::SuperfluousElse , "");1242 SemanticWarning( yylloc, Warning::SuperfluousElse ); 1246 1243 } 1247 1244 | WHILE '(' conditional_declaration ')' statement %prec THEN … … 1254 1251 { 1255 1252 $$ = new StatementNode( build_do_while( new ExpressionNode( build_constantInteger( *new string( "1" ) ) ), maybe_build_compound( $2 ) ) ); 1256 SemanticWarning( yylloc, Warning::SuperfluousElse , "");1253 SemanticWarning( yylloc, Warning::SuperfluousElse ); 1257 1254 } 1258 1255 | DO statement WHILE '(' comma_expression ')' ';' … … 1265 1262 { 1266 1263 $$ = new StatementNode( build_for( new ForCtrl( (ExpressionNode * )nullptr, (ExpressionNode * )nullptr, (ExpressionNode * )nullptr ), maybe_build_compound( $4 ) ) ); 1267 SemanticWarning( yylloc, Warning::SuperfluousElse , "");1264 SemanticWarning( yylloc, Warning::SuperfluousElse ); 1268 1265 } 1269 1266 | FOR '(' for_control_expression_list ')' statement %prec THEN … … 2397 2394 | ENUM '(' cfa_abstract_parameter_declaration ')' attribute_list_opt '{' enumerator_list comma_opt '}' 2398 2395 { 2399 if ( $3->storageClasses.val != 0 || $3->type->qualifiers.val != 0 ) 2396 if ( $3->storageClasses.val != 0 || $3->type->qualifiers.val != 0 ) 2400 2397 { SemanticError( yylloc, "storage-class and CV qualifiers are not meaningful 
for enumeration constants, which are const." ); } 2401 2398 … … 2441 2438 // empty 2442 2439 { $$ = nullptr; } 2443 | '=' constant_expression { $$ = new InitializerNode( $2 ); }2444 | '=' '{' initializer_list_opt comma_opt '}' { $$ = new InitializerNode( $3, true ); }2445 //| simple_assignment_operator initializer2446 //{ $$ = $1 == OperKinds::Assign ? $2 : $2->set_maybeConstructed( false ); }2440 // | '=' constant_expression 2441 // { $$ = $2; } 2442 | simple_assignment_operator initializer 2443 { $$ = $1 == OperKinds::Assign ? $2 : $2->set_maybeConstructed( false ); } 2447 2444 ; 2448 2445 … … 2844 2841 linkage = LinkageSpec::update( yylloc, linkage, $2 ); 2845 2842 } 2846 up external_definition down 2843 up external_definition down 2847 2844 { 2848 2845 linkage = linkageStack.top(); -
src/ResolvExpr/CandidateFinder.cpp
rffec1bf r9e23b446 41 41 #include "Common/utility.h" // for move, copy 42 42 #include "SymTab/Mangler.h" 43 #include "SymTab/Validate.h" // for validateType 43 44 #include "Tuples/Tuples.h" // for handleTupleAssignment 44 45 #include "InitTweak/InitTweak.h" // for getPointerBase … … 1090 1091 assert( toType ); 1091 1092 toType = resolveTypeof( toType, context ); 1093 // toType = SymTab::validateType( castExpr->location, toType, symtab ); 1092 1094 toType = adjustExprType( toType, tenv, symtab ); 1093 1095 … … 1588 1590 // calculate target type 1589 1591 const ast::Type * toType = resolveTypeof( initAlt.type, context ); 1592 // toType = SymTab::validateType( initExpr->location, toType, symtab ); 1590 1593 toType = adjustExprType( toType, tenv, symtab ); 1591 1594 // The call to find must occur inside this loop, otherwise polymorphic return -
src/ResolvExpr/CurrentObject.cc
rffec1bf r9e23b446 9 9 // Author : Rob Schluntz 10 10 // Created On : Tue Jun 13 15:28:32 2017 11 // Last Modified By : Peter A. Buhr12 // Last Modified On : Fri Jul 1 09:16:01 202213 // Update Count : 1511 // Last Modified By : Rob Schluntz 12 // Last Modified On : Tue Jun 13 15:28:44 2017 13 // Update Count : 2 14 14 // 15 15 … … 73 73 virtual void setPosition( std::list< Expression * > & designators ) = 0; 74 74 75 /// retrieve the list of possible Type/Designat ion pairs for the current position in the currect object75 /// retrieve the list of possible Type/Designaton pairs for the current position in the currect object 76 76 virtual std::list<InitAlternative> operator*() const = 0; 77 77 … … 158 158 159 159 private: 160 void setSize( Expression * expr ) { 161 auto res = eval( expr);160 void setSize( Expression * expr ) { // replace this logic with an eval call 161 auto res = eval(expr); 162 162 if (res.second) { 163 163 size = res.first; … … 170 170 void setPosition( Expression * expr ) { 171 171 // need to permit integer-constant-expressions, including: integer constants, enumeration constants, character constants, sizeof expressions, _Alignof expressions, cast expressions 172 auto arg = eval( expr ); 173 index = arg.first; 174 return; 175 176 // if ( ConstantExpr * constExpr = dynamic_cast< ConstantExpr * >( expr ) ) { 177 // try { 178 // index = constExpr->intValue(); 179 // } catch( SemanticErrorException & ) { 180 // SemanticError( expr, "Constant expression of non-integral type in array designator: " ); 181 // } 182 // } else if ( CastExpr * castExpr = dynamic_cast< CastExpr * >( expr ) ) { 183 // setPosition( castExpr->get_arg() ); 184 // } else if ( VariableExpr * varExpr = dynamic_cast< VariableExpr * >( expr ) ) { 185 // EnumInstType * inst = dynamic_cast<EnumInstType *>( varExpr->get_result() ); 186 // assertf( inst, "ArrayIterator given variable that isn't an enum constant : %s", toString( expr ).c_str() ); 187 // long long int value; 188 // if ( 
inst->baseEnum->valueOf( varExpr->var, value ) ) { 189 // index = value; 190 // } 191 // } else if ( dynamic_cast< SizeofExpr * >( expr ) || dynamic_cast< AlignofExpr * >( expr ) ) { 192 // index = 0; // xxx - get actual sizeof/alignof value? 193 // } else { 194 // assertf( false, "4 bad designator given to ArrayIterator: %s", toString( expr ).c_str() ); 195 // } 172 if ( ConstantExpr * constExpr = dynamic_cast< ConstantExpr * >( expr ) ) { 173 try { 174 index = constExpr->intValue(); 175 } catch( SemanticErrorException & ) { 176 SemanticError( expr, "Constant expression of non-integral type in array designator: " ); 177 } 178 } else if ( CastExpr * castExpr = dynamic_cast< CastExpr * >( expr ) ) { 179 setPosition( castExpr->get_arg() ); 180 } else if ( VariableExpr * varExpr = dynamic_cast< VariableExpr * >( expr ) ) { 181 EnumInstType * inst = dynamic_cast<EnumInstType *>( varExpr->get_result() ); 182 assertf( inst, "ArrayIterator given variable that isn't an enum constant : %s", toString( expr ).c_str() ); 183 long long int value; 184 if ( inst->baseEnum->valueOf( varExpr->var, value ) ) { 185 index = value; 186 } 187 } else if ( dynamic_cast< SizeofExpr * >( expr ) || dynamic_cast< AlignofExpr * >( expr ) ) { 188 index = 0; // xxx - get actual sizeof/alignof value? 189 } else { 190 assertf( false, "bad designator given to ArrayIterator: %s", toString( expr ).c_str() ); 191 } 196 192 } 197 193 … … 333 329 assertf( false, "could not find member in %s: %s", kind.c_str(), toString( varExpr ).c_str() ); 334 330 } else { 335 assertf( false, " 3bad designator given to %s: %s", kind.c_str(), toString( designators.front() ).c_str() );331 assertf( false, "bad designator given to %s: %s", kind.c_str(), toString( designators.front() ).c_str() ); 336 332 } // if 337 333 } // if … … 641 637 642 638 void setSize( const Expr * expr ) { 643 auto res = eval( expr);639 auto res = eval(expr); 644 640 if ( ! 
res.second ) { 645 SemanticError( location, toString( "Array designator must be a constant expression: ", expr ) ); 641 SemanticError( location, 642 toString("Array designator must be a constant expression: ", expr ) ); 646 643 } 647 644 size = res.first; … … 649 646 650 647 public: 651 ArrayIterator( const CodeLocation & loc, const ArrayType * at ) : location( loc ), array( at ), base( at->base ) { 648 ArrayIterator( const CodeLocation & loc, const ArrayType * at ) 649 : location( loc ), array( at ), base( at->base ) { 652 650 PRINT( std::cerr << "Creating array iterator: " << at << std::endl; ) 653 651 memberIter.reset( createMemberIterator( loc, base ) ); … … 662 660 // enumeration constants, character constants, sizeof expressions, alignof expressions, 663 661 // cast expressions 664 665 auto arg = eval( expr );666 index = arg.first;667 return;668 669 // if ( auto constExpr = dynamic_cast< const ConstantExpr * >( expr ) ) {670 // try {671 // index = constExpr->intValue();672 // } catch ( SemanticErrorException & ) {673 // SemanticError( expr, "Constant expression of non-integral type in array designator: " );674 // }675 // } else if ( auto castExpr = dynamic_cast< const CastExpr * >( expr ) ) {676 // setPosition( castExpr->arg );677 // } else if ( dynamic_cast< const SizeofExpr * >( expr ) || dynamic_cast< const AlignofExpr * >( expr ) ) {678 // index = 0;679 // } else {680 // assertf( false, "2bad designator given to ArrayIterator: %s", toString( expr ).c_str() );681 //}662 if ( auto constExpr = dynamic_cast< const ConstantExpr * >( expr ) ) { 663 try { 664 index = constExpr->intValue(); 665 } catch ( SemanticErrorException & ) { 666 SemanticError( expr, 667 "Constant expression of non-integral type in array designator: " ); 668 } 669 } else if ( auto castExpr = dynamic_cast< const CastExpr * >( expr ) ) { 670 setPosition( castExpr->arg ); 671 } else if ( 672 dynamic_cast< const SizeofExpr * >( expr ) 673 || dynamic_cast< const AlignofExpr * >( expr ) 674 ) { 
675 index = 0; 676 } else { 677 assertf( false, 678 "bad designator given to ArrayIterator: %s", toString( expr ).c_str() ); 679 } 682 680 } 683 681 … … 725 723 std::deque< InitAlternative > ret = memberIter->first(); 726 724 for ( InitAlternative & alt : ret ) { 727 alt.designation.get_and_mutate()->designators.emplace_front( ConstantExpr::from_ulong( location, index ) ); 725 alt.designation.get_and_mutate()->designators.emplace_front( 726 ConstantExpr::from_ulong( location, index ) ); 728 727 } 729 728 return ret; … … 789 788 return; 790 789 } 791 assertf( false, "could not find member in %s: %s", kind.c_str(), toString( varExpr ).c_str() ); 790 assertf( false, 791 "could not find member in %s: %s", kind.c_str(), toString( varExpr ).c_str() ); 792 792 } else { 793 assertf( false, "1 bad designator given to %s: %s", kind.c_str(), toString( *begin ).c_str() ); 793 assertf( false, 794 "bad designator given to %s: %s", kind.c_str(), toString( *begin ).c_str() ); 794 795 } 795 796 } -
src/SymTab/FixFunction.cc
rffec1bf r9e23b446 9 9 // Author : Richard C. Bilson 10 10 // Created On : Sun May 17 16:19:49 2015 11 // Last Modified By : Andrew Beach12 // Last Modified On : Tue Jul 12 14:28:00 202213 // Update Count : 711 // Last Modified By : Peter A. Buhr 12 // Last Modified On : Mon Mar 6 23:36:59 2017 13 // Update Count : 6 14 14 // 15 15 … … 122 122 } 123 123 124 void previsit( const ast::FunctionType * ) { visit_children = false; }125 126 const ast::Type * postvisit( const ast::FunctionType * type ) {127 return new ast::PointerType( type );128 }129 130 124 void previsit( const ast::VoidType * ) { isVoid = true; } 131 125 … … 151 145 } 152 146 153 const ast::Type * fixFunction( const ast::Type * type, bool & isVoid ) {154 ast::Pass< FixFunction_new > fixer;155 type = type->accept( fixer );156 isVoid |= fixer.core.isVoid;157 return type;158 }159 160 147 } // namespace SymTab 161 148 -
src/SymTab/FixFunction.h
rffec1bf r9e23b446 10 10 // Created On : Sun May 17 17:02:08 2015 11 11 // Last Modified By : Peter A. Buhr 12 // Last Modified On : Tue Jul 12 14:19:00 202213 // Update Count : 512 // Last Modified On : Sat Jul 22 09:45:55 2017 13 // Update Count : 4 14 14 // 15 15 … … 21 21 namespace ast { 22 22 class DeclWithType; 23 class Type;24 23 } 25 24 … … 32 31 /// Sets isVoid to true if type is void 33 32 const ast::DeclWithType * fixFunction( const ast::DeclWithType * dwt, bool & isVoid ); 34 const ast::Type * fixFunction( const ast::Type * type, bool & isVoid );35 33 } // namespace SymTab 36 34 -
src/SymTab/Mangler.cc
rffec1bf r9e23b446 537 537 } 538 538 539 __attribute__((unused))540 539 inline std::vector< ast::ptr< ast::Type > > getTypes( const std::vector< ast::ptr< ast::DeclWithType > > & decls ) { 541 540 std::vector< ast::ptr< ast::Type > > ret; -
src/SymTab/Validate.cc
rffec1bf r9e23b446 10 10 // Created On : Sun May 17 21:50:04 2015 11 11 // Last Modified By : Andrew Beach 12 // Last Modified On : Tue Jul 12 15:00:00 202213 // Update Count : 36 712 // Last Modified On : Tue May 17 14:36:00 2022 13 // Update Count : 366 14 14 // 15 15 … … 294 294 }; 295 295 296 void validate ( std::list< Declaration * > &translationUnit, __attribute__((unused)) bool doDebug) {296 void validate_A( std::list< Declaration * > & translationUnit ) { 297 297 PassVisitor<HoistTypeDecls> hoistDecls; 298 298 { … … 305 305 decayEnumsAndPointers( translationUnit ); // must happen before VerifyCtorDtorAssign, because void return objects should not exist; before LinkReferenceToTypes_old because it is an indexer and needs correct types for mangling 306 306 } 307 } 308 309 void validate_B( std::list< Declaration * > & translationUnit ) { 307 310 PassVisitor<FixQualifiedTypes> fixQual; 308 311 { 309 312 Stats::Heap::newPass("validate-B"); 310 313 Stats::Time::BlockGuard guard("validate-B"); 311 linkReferenceToTypes( translationUnit ); // Must happen before auto-gen, because it uses the sized flag.314 //linkReferenceToTypes( translationUnit ); 312 315 mutateAll( translationUnit, fixQual ); // must happen after LinkReferenceToTypes_old, because aggregate members are accessed 313 316 HoistStruct::hoistStruct( translationUnit ); 314 317 EliminateTypedef::eliminateTypedef( translationUnit ); 315 318 } 319 } 320 321 void validate_C( std::list< Declaration * > & translationUnit ) { 316 322 PassVisitor<ValidateGenericParameters> genericParams; 317 323 PassVisitor<ResolveEnumInitializers> rei( nullptr ); … … 337 343 }); 338 344 } 345 } 346 347 void validate_D( std::list< Declaration * > & translationUnit ) { 339 348 { 340 349 Stats::Heap::newPass("validate-D"); … … 353 362 }); 354 363 } 364 } 365 366 void validate_E( std::list< Declaration * > & translationUnit ) { 355 367 PassVisitor<CompoundLiteral> compoundliteral; 356 368 { … … 372 384 } 373 385 } 386 } 387 388 void 
validate_F( std::list< Declaration * > & translationUnit ) { 374 389 PassVisitor<LabelAddressFixer> labelAddrFixer; 375 390 { … … 395 410 } 396 411 } 412 } 413 414 void validate( std::list< Declaration * > &translationUnit, __attribute__((unused)) bool doDebug ) { 415 validate_A( translationUnit ); 416 validate_B( translationUnit ); 417 validate_C( translationUnit ); 418 validate_D( translationUnit ); 419 validate_E( translationUnit ); 420 validate_F( translationUnit ); 397 421 } 398 422 -
src/SymTab/Validate.h
rffec1bf r9e23b446 11 11 // Created On : Sun May 17 21:53:34 2015 12 12 // Last Modified By : Andrew Beach 13 // Last Modified On : Tue Jul 12 15:30:00 202214 // Update Count : 613 // Last Modified On : Tue May 17 14:35:00 2022 14 // Update Count : 5 15 15 // 16 16 … … 19 19 #include <list> // for list 20 20 21 class Declaration; 21 struct CodeLocation; 22 class Declaration; 23 class Type; 24 25 namespace ast { 26 class Type; 27 class SymbolTable; 28 } 22 29 23 30 namespace SymTab { 31 class Indexer; 32 24 33 /// Normalizes struct and function declarations 25 34 void validate( std::list< Declaration * > &translationUnit, bool doDebug = false ); 35 36 // Sub-passes of validate. 37 void validate_A( std::list< Declaration * > &translationUnit ); 38 void validate_B( std::list< Declaration * > &translationUnit ); 39 void validate_C( std::list< Declaration * > &translationUnit ); 40 void validate_D( std::list< Declaration * > &translationUnit ); 41 void validate_E( std::list< Declaration * > &translationUnit ); 42 void validate_F( std::list< Declaration * > &translationUnit ); 26 43 } // namespace SymTab 27 44 -
src/SymTab/ValidateType.cc
rffec1bf r9e23b446 222 222 // visit enum members first so that the types of self-referencing members are updated properly 223 223 // Replace the enum base; right now it works only for StructEnum 224 if ( enumDecl->base ) { 225 if ( const TypeInstType * base = dynamic_cast< TypeInstType * >(enumDecl->base) ) { 226 if ( const StructDecl * decl = local_indexer->lookupStruct( base->name ) ) { 227 enumDecl->base = new StructInstType( Type::Qualifiers(), const_cast< StructDecl * >( decl ) ); // Just linking in the node 228 } 229 } else if ( const PointerType * ptr = dynamic_cast< PointerType * >(enumDecl->base) ) { 230 if ( const TypeInstType * ptrBase = dynamic_cast< TypeInstType * >( ptr->base ) ) { 231 if ( const StructDecl * decl = local_indexer->lookupStruct( ptrBase->name ) ) { 232 enumDecl->base = new PointerType( Type::Qualifiers(), 233 new StructInstType( Type::Qualifiers(), const_cast< StructDecl * >( decl ) ) ); 234 } 235 } 224 if ( enumDecl->base && dynamic_cast<TypeInstType*>(enumDecl->base) ) { 225 std::string baseName = static_cast<TypeInstType*>(enumDecl->base)->name; 226 const StructDecl * st = local_indexer->lookupStruct( baseName ); 227 if ( st ) { 228 enumDecl->base = new StructInstType(Type::Qualifiers(),const_cast<StructDecl *>(st)); // Just linking in the node 236 229 } 237 230 } 238 239 231 if ( enumDecl->body ) { 240 232 ForwardEnumsType::iterator fwds = forwardEnums.find( enumDecl->name ); -
src/SynTree/AggregateDecl.cc
rffec1bf r9e23b446 10 10 // Created On : Sun May 17 23:56:39 2015 11 11 // Last Modified By : Peter A. Buhr 12 // Last Modified On : Fri Jul 1 09:12:33 202213 // Update Count : 3 212 // Last Modified On : Mon Dec 16 15:07:20 2019 13 // Update Count : 31 14 14 // 15 15 … … 125 125 SingleInit * init = strict_dynamic_cast< SingleInit * >( field->init ); 126 126 auto result = eval( init->value ); 127 if ( ! result.second ) SemanticError( init->location, toString( " Enumerator value for '", field, "' is not an integer constant") );127 if ( ! result.second ) SemanticError( init->location, toString( "Non-constexpr in initialization of enumerator: ", field ) ); 128 128 currentValue = result.first; 129 129 } -
src/SynTree/Type.h
rffec1bf r9e23b446 274 274 class PointerType : public Type { 275 275 public: 276 Type * base;276 Type *base; 277 277 278 278 // In C99, pointer types can be qualified in many ways e.g., int f( int a[ static 3 ] ) … … 516 516 typedef ReferenceToType Parent; 517 517 public: 518 // this decl is not "owned" by the enuminst; it is merely a pointer to elsewhere in the tree,519 // where the enumused in this type is actually defined518 // this decl is not "owned" by the union inst; it is merely a pointer to elsewhere in the tree, 519 // where the union used in this type is actually defined 520 520 EnumDecl *baseEnum = nullptr; 521 521 -
src/Tuples/Tuples.cc
rffec1bf r9e23b446 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // Tuples. cc -- A collection of tuple operations.7 // Tuples.h -- 8 8 // 9 9 // Author : Andrew Beach -
src/Tuples/Tuples.h
rffec1bf r9e23b446 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // Tuples.h -- A collection of tuple operations.7 // Tuples.h -- 8 8 // 9 9 // Author : Rodolfo G. Esteves -
src/Validate/Autogen.cpp
rffec1bf r9e23b446 28 28 #include "AST/DeclReplacer.hpp" 29 29 #include "AST/Expr.hpp" 30 #include "AST/Inspect.hpp"31 30 #include "AST/Pass.hpp" 32 31 #include "AST/Stmt.hpp" … … 122 121 123 122 // Built-ins do not use autogeneration. 124 bool shouldAutogen() const final { return !decl->linkage.is_builtin && !structHasFlexibleArray(decl); }123 bool shouldAutogen() const final { return !decl->linkage.is_builtin; } 125 124 private: 126 125 void genFuncBody( ast::FunctionDecl * decl ) final; … … 184 183 { 185 184 // TODO: These functions are somewhere between instrinsic and autogen, 186 // could possibly use a new linkage type. For now we just make the 187 // basic ones intrinsic to code-gen them as C assignments. 188 const auto & real_type = decl->base; 189 const auto & basic = real_type.as<ast::BasicType>(); 190 if(!real_type || (basic && basic->isInteger())) proto_linkage = ast::Linkage::Intrinsic; 185 // could possibly use a new linkage type. For now we just make them 186 // intrinsic to code-gen them as C assignments. 187 proto_linkage = ast::Linkage::Intrinsic; 191 188 } 192 189 … … 405 402 auto retval = srcParam(); 406 403 retval->name = "_ret"; 404 // xxx - Adding this unused attribute can slience unused variable warning 405 // However, some code might not be compiled as expected 406 // Temporarily disabled 407 // retval->attributes.push_back(new ast::Attribute("unused")); 407 408 return genProto( "?=?", { dstParam(), srcParam() }, { retval } ); 408 409 } -
src/Validate/Autogen.hpp
rffec1bf r9e23b446 22 22 namespace Validate { 23 23 24 /// Generate routines for all data types in the translation unit.25 /// A lot of passes have to happen either before or after this pass.26 24 void autogenerateRoutines( ast::TranslationUnit & translationUnit ); 27 25 -
src/Validate/CompoundLiteral.hpp
rffec1bf r9e23b446 23 23 24 24 /// Use variables to implement compound literals. 25 /// Must happen after auto-gen routines are added.26 25 void handleCompoundLiterals( ast::TranslationUnit & translationUnit ); 27 26 -
src/Validate/EliminateTypedef.cpp
rffec1bf r9e23b446 10 10 // Created On : Wed Apr 20 16:37:00 2022 11 11 // Last Modified By : Andrew Beach 12 // Last Modified On : Mon Jul 11 16:30:00 202213 // Update Count : 112 // Last Modified On : Mon Apr 25 14:26:00 2022 13 // Update Count : 0 14 14 // 15 15 … … 28 28 29 29 struct EliminateTypedefCore { 30 // Remove typedefs from inside aggregates.31 30 ast::StructDecl const * previsit( ast::StructDecl const * decl ); 32 31 ast::UnionDecl const * previsit( ast::UnionDecl const * decl ); 33 // Remove typedefs from statement lists.34 32 ast::CompoundStmt const * previsit( ast::CompoundStmt const * stmt ); 35 // Remove typedefs from control structure initializers.36 ast::IfStmt const * previsit( ast::IfStmt const * stmt );37 ast::ForStmt const * previsit( ast::ForStmt const * stmt );38 ast::WhileDoStmt const * previsit( ast::WhileDoStmt const * stmt );39 33 }; 40 34 … … 69 63 } 70 64 71 ast::IfStmt const * EliminateTypedefCore::previsit( ast::IfStmt const * stmt ) {72 return field_erase_if( stmt, &ast::IfStmt::inits, isTypedefStmt );73 }74 75 ast::ForStmt const * EliminateTypedefCore::previsit( ast::ForStmt const * stmt ) {76 return field_erase_if( stmt, &ast::ForStmt::inits, isTypedefStmt );77 }78 79 ast::WhileDoStmt const * EliminateTypedefCore::previsit( ast::WhileDoStmt const * stmt ) {80 return field_erase_if( stmt, &ast::WhileDoStmt::inits, isTypedefStmt );81 }82 83 65 } // namespace 84 66 -
src/Validate/FindSpecialDecls.h
rffec1bf r9e23b446 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // FindSpecialDeclarations.h -- Find special declarations used in the compiler.7 // FindSpecialDeclarations.h -- 8 8 // 9 9 // Author : Rob Schluntz … … 43 43 void findSpecialDecls( std::list< Declaration * > & translationUnit ); 44 44 45 /// Find and remember some of the special declarations that are useful for45 /// find and remember some of the special declarations that are useful for 46 46 /// generating code, so that they do not have to be discovered multiple times. 47 47 void findGlobalDecls( ast::TranslationUnit & translationUnit ); -
src/Validate/FixQualifiedTypes.cpp
rffec1bf r9e23b446 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // FixQualifiedTypes.cpp -- Replace the qualified type with a direct type.7 // FixQualifiedTypes.cpp -- 8 8 // 9 9 // Author : Andrew Beach … … 76 76 ret->qualifiers = type->qualifiers; 77 77 ast::TypeSubstitution sub( aggr->params, instp->params ); 78 // = parent->genericSubstitution(); 78 79 auto result = sub.apply(ret); 79 80 return result.node.release(); -
src/Validate/FixQualifiedTypes.hpp
rffec1bf r9e23b446 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // FixQualifiedTypes.hpp -- Replace the qualified type with a direct type.7 // FixQualifiedTypes.hpp -- 8 8 // 9 9 // Author : Andrew Beach … … 22 22 namespace Validate { 23 23 24 /// Replaces qualified types with an unqualified NamedTypeDecl.25 /// Must happen after Link References To Types,26 /// because aggregate members are accessed.27 24 void fixQualifiedTypes( ast::TranslationUnit & translationUnit ); 28 25 -
src/Validate/ForallPointerDecay.hpp
rffec1bf r9e23b446 29 29 /// Also checks that operator names are used properly on functions and 30 30 /// assigns unique IDs. This is a "legacy" pass. 31 /// Must be after implement concurrent keywords; because uniqueIds must be32 /// set on declaration before resolution.33 /// Must happen before auto-gen routines are added.34 31 void decayForallPointers( ast::TranslationUnit & transUnit ); 35 32 -
src/Validate/GenericParameter.cpp
rffec1bf r9e23b446 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // GenericParameter.hpp -- Generic parameter related passes.7 // GenericParameter.hpp -- 8 8 // 9 9 // Author : Andrew Beach -
src/Validate/GenericParameter.hpp
rffec1bf r9e23b446 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // GenericParameter.hpp -- Generic parameter related passes.7 // GenericParameter.hpp -- 8 8 // 9 9 // Author : Andrew Beach … … 23 23 24 24 /// Perform substutions for generic parameters and fill in defaults. 25 /// Check as early as possible, but it can't happen before Link References to26 /// Types and observed failing when attempted before eliminate typedef.27 25 void fillGenericParameters( ast::TranslationUnit & translationUnit ); 28 26 -
src/Validate/HoistStruct.hpp
rffec1bf r9e23b446 22 22 namespace Validate { 23 23 24 /// Flattens nested type declarations. (Run right after Fix Qualified Types.)24 /// Flattens nested type declarations. 25 25 void hoistStruct( ast::TranslationUnit & translationUnit ); 26 26 -
src/Validate/LabelAddressFixer.cpp
rffec1bf r9e23b446 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // LabelAddressFixer.cpp -- Create label address expressions.7 // LabelAddressFixer.cpp -- 8 8 // 9 9 // Author : Andrew Beach -
src/Validate/LabelAddressFixer.hpp
rffec1bf r9e23b446 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // LabelAddressFixer.hpp -- Create label address expressions.7 // LabelAddressFixer.hpp -- 8 8 // 9 9 // Author : Andrew Beach … … 20 20 namespace Validate { 21 21 22 /// Label addresses are not actually created in the parser, this pass finds23 /// the patterns that represent the label address expression.24 22 void fixLabelAddresses( ast::TranslationUnit & translationUnit ); 25 23 -
src/Validate/module.mk
rffec1bf r9e23b446 26 26 Validate/EliminateTypedef.cpp \ 27 27 Validate/EliminateTypedef.hpp \ 28 Validate/EnumAndPointerDecay.cpp \29 Validate/EnumAndPointerDecay.hpp \30 28 Validate/FindSpecialDeclsNew.cpp \ 31 29 Validate/FixQualifiedTypes.cpp \ 32 30 Validate/FixQualifiedTypes.hpp \ 33 Validate/FixReturnTypes.cpp \34 Validate/FixReturnTypes.hpp \35 31 Validate/ForallPointerDecay.cpp \ 36 32 Validate/ForallPointerDecay.hpp \ … … 41 37 Validate/HoistStruct.cpp \ 42 38 Validate/HoistStruct.hpp \ 43 Validate/HoistTypeDecls.cpp \44 Validate/HoistTypeDecls.hpp \45 39 Validate/InitializerLength.cpp \ 46 40 Validate/InitializerLength.hpp \ 47 41 Validate/LabelAddressFixer.cpp \ 48 42 Validate/LabelAddressFixer.hpp \ 49 Validate/LinkReferenceToTypes.cpp \50 Validate/LinkReferenceToTypes.hpp \51 43 Validate/NoIdSymbolTable.hpp \ 52 Validate/ReplaceTypedef.cpp \53 Validate/ReplaceTypedef.hpp \54 44 Validate/ReturnCheck.cpp \ 55 Validate/ReturnCheck.hpp \ 56 Validate/VerifyCtorDtorAssign.cpp \ 57 Validate/VerifyCtorDtorAssign.hpp 45 Validate/ReturnCheck.hpp 58 46 59 47 SRCDEMANGLE += $(SRC_VALIDATE) -
src/Virtual/Tables.h
rffec1bf r9e23b446 19 19 #include "AST/Fwd.hpp" 20 20 class Declaration; 21 class StructDecl; 21 22 class Expression; 22 class FunctionDecl;23 class Initializer;24 class ObjectDecl;25 class StructDecl;26 class StructInstType;27 class Type;28 23 29 24 namespace Virtual { -
src/main.cc
rffec1bf r9e23b446 10 10 // Created On : Fri May 15 23:12:02 2015 11 11 // Last Modified By : Andrew Beach 12 // Last Modified On : Mon Jul 18 11:08:00 202213 // Update Count : 67 612 // Last Modified On : Tue Jun 7 13:29:00 2022 13 // Update Count : 674 14 14 // 15 15 … … 78 78 #include "Validate/CompoundLiteral.hpp" // for handleCompoundLiterals 79 79 #include "Validate/EliminateTypedef.hpp" // for eliminateTypedef 80 #include "Validate/EnumAndPointerDecay.hpp" // for decayEnumsAndPointers81 80 #include "Validate/FindSpecialDecls.h" // for findGlobalDecls 82 81 #include "Validate/FixQualifiedTypes.hpp" // for fixQualifiedTypes 83 #include "Validate/FixReturnTypes.hpp" // for fixReturnTypes84 82 #include "Validate/ForallPointerDecay.hpp" // for decayForallPointers 85 83 #include "Validate/GenericParameter.hpp" // for fillGenericParameters, tr... 86 84 #include "Validate/HoistStruct.hpp" // for hoistStruct 87 #include "Validate/HoistTypeDecls.hpp" // for hoistTypeDecls88 85 #include "Validate/InitializerLength.hpp" // for setLengthFromInitializer 89 86 #include "Validate/LabelAddressFixer.hpp" // for fixLabelAddresses 90 #include "Validate/LinkReferenceToTypes.hpp" // for linkReferenceToTypes91 #include "Validate/ReplaceTypedef.hpp" // for replaceTypedef92 87 #include "Validate/ReturnCheck.hpp" // for checkReturnStatements 93 #include "Validate/VerifyCtorDtorAssign.hpp" // for verifyCtorDtorAssign94 88 #include "Virtual/ExpandCasts.h" // for expandCasts 95 89 … … 330 324 Stats::Time::StopBlock(); 331 325 326 PASS( "Translate Exception Declarations", ControlStruct::translateExcept( translationUnit ) ); 327 if ( exdeclp ) { 328 dump( translationUnit ); 329 return EXIT_SUCCESS; 330 } // if 331 332 // add the assignment statement after the initialization of a type parameter 333 PASS( "Validate-A", SymTab::validate_A( translationUnit ) ); 334 335 // Must happen before auto-gen, because it uses the sized flag. 
336 PASS( "Link Reference To Types", SymTab::linkReferenceToTypes( translationUnit ) ); 337 338 CodeTools::fillLocations( translationUnit ); 339 332 340 if( useNewAST ) { 341 CodeTools::fillLocations( translationUnit ); 342 333 343 if (Stats::Counters::enabled) { 334 344 ast::pass_visitor_stats.avg = Stats::Counters::build<Stats::Counters::AverageCounter<double>>("Average Depth - New"); … … 339 349 forceFillCodeLocations( transUnit ); 340 350 341 PASS( "Translate Exception Declarations", ControlStruct::translateExcept( transUnit ) ); 342 if ( exdeclp ) { 343 dump( move( transUnit ) ); 344 return EXIT_SUCCESS; 345 } 346 347 PASS( "Verify Ctor, Dtor & Assign", Validate::verifyCtorDtorAssign( transUnit ) ); 348 PASS( "Hoist Type Decls", Validate::hoistTypeDecls( transUnit ) ); 349 // Hoist Type Decls pulls some declarations out of contexts where 350 // locations are not tracked. Perhaps they should be, but for now 351 // the full fill solves it. 352 forceFillCodeLocations( transUnit ); 353 354 PASS( "Replace Typedefs", Validate::replaceTypedef( transUnit ) ); 355 PASS( "Fix Return Types", Validate::fixReturnTypes( transUnit ) ); 356 PASS( "Enum and Pointer Decay", Validate::decayEnumsAndPointers( transUnit ) ); 357 358 PASS( "Link Reference To Types", Validate::linkReferenceToTypes( transUnit ) ); 359 351 // Must happen after Link References To Types, 352 // because aggregate members are accessed. 360 353 PASS( "Fix Qualified Types", Validate::fixQualifiedTypes( transUnit ) ); 354 361 355 PASS( "Hoist Struct", Validate::hoistStruct( transUnit ) ); 362 356 PASS( "Eliminate Typedef", Validate::eliminateTypedef( transUnit ) ); 357 358 // Check as early as possible. 
Can't happen before 359 // LinkReferenceToType, observed failing when attempted 360 // before eliminateTypedef 363 361 PASS( "Validate Generic Parameters", Validate::fillGenericParameters( transUnit ) ); 362 364 363 PASS( "Translate Dimensions", Validate::translateDimensionParameters( transUnit ) ); 365 364 PASS( "Check Function Returns", Validate::checkReturnStatements( transUnit ) ); 365 366 // Must happen before Autogen. 366 367 PASS( "Fix Return Statements", InitTweak::fixReturnStatements( transUnit ) ); 368 367 369 PASS( "Implement Concurrent Keywords", Concurrency::implementKeywords( transUnit ) ); 370 371 // Must be after implement concurrent keywords; because uniqueIds 372 // must be set on declaration before resolution. 373 // Must happen before autogen routines are added. 368 374 PASS( "Forall Pointer Decay", Validate::decayForallPointers( transUnit ) ); 375 376 // Must happen before autogen routines are added. 369 377 PASS( "Hoist Control Declarations", ControlStruct::hoistControlDecls( transUnit ) ); 370 378 379 // Must be after enum and pointer decay. 380 // Must be before compound literals. 371 381 PASS( "Generate Autogen Routines", Validate::autogenerateRoutines( transUnit ) ); 372 382 … … 444 454 translationUnit = convert( move( transUnit ) ); 445 455 } else { 446 PASS( "Translate Exception Declarations", ControlStruct::translateExcept( translationUnit ) ); 447 if ( exdeclp ) { 448 dump( translationUnit ); 449 return EXIT_SUCCESS; 450 } // if 451 452 // add the assignment statement after the initialization of a type parameter 453 PASS( "Validate", SymTab::validate( translationUnit ) ); 456 PASS( "Validate-B", SymTab::validate_B( translationUnit ) ); 457 PASS( "Validate-C", SymTab::validate_C( translationUnit ) ); 458 PASS( "Validate-D", SymTab::validate_D( translationUnit ) ); 459 PASS( "Validate-E", SymTab::validate_E( translationUnit ) ); 460 PASS( "Validate-F", SymTab::validate_F( translationUnit ) ); 454 461 455 462 if ( symtabp ) { -
tests/.expect/attributes.nast.arm64.txt
rffec1bf r9e23b446 1334 1334 } 1335 1335 inline enum __anonymous4 _X16_operator_assignFM12__anonymous4_M12__anonymous4M12__anonymous4_intrinsic___2(enum __anonymous4 *_X4_dstM12__anonymous4_2, enum __anonymous4 _X4_srcM12__anonymous4_2){ 1336 enum __anonymous4 _X4_retM12__anonymous4_2; 1336 1337 { 1337 1338 ((void)((*_X4_dstM12__anonymous4_2)=_X4_srcM12__anonymous4_2)); 1338 1339 } 1339 1340 1340 return (*_X4_dstM12__anonymous4_2); 1341 { 1342 ((void)(_X4_retM12__anonymous4_2=(*_X4_dstM12__anonymous4_2)) /* ?{} */); 1343 } 1344 1345 return _X4_retM12__anonymous4_2; 1341 1346 } 1342 1347 { -
tests/.expect/attributes.nast.x64.txt
rffec1bf r9e23b446 1334 1334 } 1335 1335 inline enum __anonymous4 _X16_operator_assignFM12__anonymous4_M12__anonymous4M12__anonymous4_intrinsic___2(enum __anonymous4 *_X4_dstM12__anonymous4_2, enum __anonymous4 _X4_srcM12__anonymous4_2){ 1336 enum __anonymous4 _X4_retM12__anonymous4_2; 1336 1337 { 1337 1338 ((void)((*_X4_dstM12__anonymous4_2)=_X4_srcM12__anonymous4_2)); -
tests/.expect/attributes.nast.x86.txt
rffec1bf r9e23b446 1334 1334 } 1335 1335 inline enum __anonymous4 _X16_operator_assignFM12__anonymous4_M12__anonymous4M12__anonymous4_intrinsic___2(enum __anonymous4 *_X4_dstM12__anonymous4_2, enum __anonymous4 _X4_srcM12__anonymous4_2){ 1336 enum __anonymous4 _X4_retM12__anonymous4_2; 1336 1337 { 1337 1338 ((void)((*_X4_dstM12__anonymous4_2)=_X4_srcM12__anonymous4_2)); -
tests/.expect/attributes.oast.x64.txt
rffec1bf r9e23b446 1334 1334 } 1335 1335 inline enum __anonymous4 _X16_operator_assignFM12__anonymous4_M12__anonymous4M12__anonymous4_intrinsic___2(enum __anonymous4 *_X4_dstM12__anonymous4_2, enum __anonymous4 _X4_srcM12__anonymous4_2){ 1336 enum __anonymous4 _X4_retM12__anonymous4_2; 1336 1337 { 1337 1338 ((void)((*_X4_dstM12__anonymous4_2)=_X4_srcM12__anonymous4_2)); -
tests/alloc2.cfa
rffec1bf r9e23b446 11 11 typedef struct S1 T1; 12 12 13 void test_base( void * ip, size_t size, size_t align ) {13 void test_base( void * ip, size_t size, size_t align) { 14 14 tests_total += 1; 15 // printf( "DEBUG: starting test %d\n", tests_total);16 bool passed = (malloc_size( ip ) == size) && (malloc_usable_size( ip ) >= size) && (malloc_alignment( ip) == align) && ((uintptr_t)ip % align == 0);17 if ( ! passed) {18 printf( "failed test %3d: %4zu %4zu but got %4zu ( %3zu ) %4zu\n", tests_total, size, align, malloc_size( ip ), malloc_usable_size( ip ), malloc_alignment( ip ));15 // printf("DEBUG: starting test %d\n", tests_total); 16 bool passed = (malloc_size(ip) == size) && (malloc_usable_size(ip) >= size) && (malloc_alignment(ip) == align) && ((uintptr_t)ip % align == 0); 17 if (!passed) { 18 printf("failed test %3d: %4zu %4zu but got %4zu ( %3zu ) %4zu\n", tests_total, size, align, malloc_size(ip), malloc_usable_size(ip), malloc_alignment(ip)); 19 19 tests_failed += 1; 20 } // if21 // printf( "DEBUG: done test %d\n", tests_total);20 } 21 // printf("DEBUG: done test %d\n", tests_total); 22 22 } 23 23 24 void test_fill( void * ip_, size_t start, size_t end, char fill ) {24 void test_fill( void * ip_, size_t start, size_t end, char fill) { 25 25 tests_total += 1; 26 // printf( "DEBUG: starting test %d\n", tests_total);26 // printf("DEBUG: starting test %d\n", tests_total); 27 27 bool passed = true; 28 28 char * ip = (char *) ip_; 29 for ( i; start ~ end) passed = passed && (ip[i] == fill);30 if ( ! 
passed) {31 printf( "failed test %3d: fill C\n", tests_total);29 for (i; start ~ end) passed = passed && (ip[i] == fill); 30 if (!passed) { 31 printf("failed test %3d: fill C\n", tests_total); 32 32 tests_failed += 1; 33 } // if34 // printf( "DEBUG: done test %d\n", tests_total);33 } 34 // printf("DEBUG: done test %d\n", tests_total); 35 35 } 36 36 37 void test_fill( void * ip_, size_t start, size_t end, int fill ) {37 void test_fill( void * ip_, size_t start, size_t end, int fill) { 38 38 tests_total += 1; 39 // printf( "DEBUG: starting test %d\n", tests_total);39 // printf("DEBUG: starting test %d\n", tests_total); 40 40 bool passed = true; 41 int * ip = (int *) ip_;42 for (i; start ~ end ) passed = passed && (ip[i] == fill);43 if ( ! passed) {44 printf( "failed test %3d: fill int\n", tests_total);41 int * ip = (int *) ip_; 42 for (i; start ~ end) passed = passed && (ip[i] == fill); 43 if (!passed) { 44 printf("failed test %3d: fill int\n", tests_total); 45 45 tests_failed += 1; 46 } // if47 // printf( "DEBUG: done test %d\n", tests_total);46 } 47 // printf("DEBUG: done test %d\n", tests_total); 48 48 } 49 49 50 void test_fill( void * ip_, size_t start, size_t end, int * fill ) {50 void test_fill( void * ip_, size_t start, size_t end, int * fill) { 51 51 tests_total += 1; 52 // printf( "DEBUG: starting test %d\n", tests_total);53 bool passed = memcmp((void*)((uintptr_t )ip_ + start ), (void*)fill, end ) == 0;54 if ( ! 
passed) {55 printf( "failed test %3d: fill int A\n", tests_total);52 // printf("DEBUG: starting test %d\n", tests_total); 53 bool passed = (memcmp((void*)((uintptr_t)ip_ + start), (void*)fill, end) == 0); 54 if (!passed) { 55 printf("failed test %3d: fill int A\n", tests_total); 56 56 tests_failed += 1; 57 } // if58 // printf( "DEBUG: done test %d\n", tests_total);57 } 58 // printf("DEBUG: done test %d\n", tests_total); 59 59 } 60 60 61 void test_fill( void * ip_, size_t start, size_t end, T1 fill ) {61 void test_fill( void * ip_, size_t start, size_t end, T1 fill) { 62 62 tests_total += 1; 63 // printf( "DEBUG: starting test %d\n", tests_total);63 // printf("DEBUG: starting test %d\n", tests_total); 64 64 bool passed = true; 65 65 T1 * ip = (T1 *) ip_; 66 for ( i; start ~ end ) passed = passed && (ip[i].data == fill.data);67 if ( ! passed) {68 printf( "failed test %3d: fill T1\n", tests_total);66 for (i; start ~ end) passed = passed && (ip[i].data == fill.data); 67 if (!passed) { 68 printf("failed test %3d: fill T1\n", tests_total); 69 69 tests_failed += 1; 70 } // if71 // printf( "DEBUG: done test %d\n", tests_total);70 } 71 // printf("DEBUG: done test %d\n", tests_total); 72 72 } 73 73 74 void test_fill( void * ip_, size_t start, size_t end, T1 * fill ) {74 void test_fill( void * ip_, size_t start, size_t end, T1 * fill) { 75 75 tests_total += 1; 76 // printf( "DEBUG: starting test %d\n", tests_total);77 bool passed = memcmp( (void*)((uintptr_t )ip_ + start ), (void*)fill, end ) == 0;78 if ( ! 
passed) {79 printf( "failed test %3d: fill T1 A\n", tests_total);76 // printf("DEBUG: starting test %d\n", tests_total); 77 bool passed = (memcmp((void*)((uintptr_t)ip_ + start), (void*)fill, end) == 0); 78 if (!passed) { 79 printf("failed test %3d: fill T1 A\n", tests_total); 80 80 tests_failed += 1; 81 } // if82 // printf( "DEBUG: done test %d\n", tests_total);81 } 82 // printf("DEBUG: done test %d\n", tests_total); 83 83 } 84 84 85 void test_use( int * ip, size_t dim ) {85 void test_use( int * ip, size_t dim) { 86 86 tests_total += 1; 87 // printf( "DEBUG: starting test %d\n", tests_total);87 // printf("DEBUG: starting test %d\n", tests_total); 88 88 bool passed = true; 89 for ( i; 0 ~ dim) ip[i] = 0xdeadbeef;90 for ( i; 0 ~ dim) passed = passed && (ip[i] == 0xdeadbeef);91 if ( ! passed) {92 printf( "failed test %3d: use int\n", tests_total);89 for (i; 0 ~ dim) ip[i] = 0xdeadbeef; 90 for (i; 0 ~ dim) passed = passed && (ip[i] == 0xdeadbeef); 91 if (!passed) { 92 printf("failed test %3d: use int\n", tests_total); 93 93 tests_failed += 1; 94 } // if95 // printf( "DEBUG: done test %d\n", tests_total);94 } 95 // printf("DEBUG: done test %d\n", tests_total); 96 96 } 97 97 98 void test_use( T1 * ip, size_t dim ) {98 void test_use( T1 * ip, size_t dim) { 99 99 tests_total += 1; 100 // printf( "DEBUG: starting test %d\n", tests_total);100 // printf("DEBUG: starting test %d\n", tests_total); 101 101 bool passed = true; 102 for ( i; 0 ~ dim) ip[i].data = 0xdeadbeef;103 for ( i; 0 ~ dim) passed = passed && (ip[i].data == 0xdeadbeef);104 if ( ! 
passed) {105 printf( "failed test %3d: use T1\n", tests_total);102 for (i; 0 ~ dim) ip[i].data = 0xdeadbeef; 103 for (i; 0 ~ dim) passed = passed && (ip[i].data == 0xdeadbeef); 104 if (!passed) { 105 printf("failed test %3d: use T1\n", tests_total); 106 106 tests_failed += 1; 107 } // if108 // printf( "DEBUG: done test %d\n", tests_total);107 } 108 // printf("DEBUG: done test %d\n", tests_total); 109 109 } 110 110 111 111 int main( void ) { 112 enum { dim = 8, align = 64, libAlign = libAlign() };113 112 size_t elemSize = sizeof(int); 113 size_t dim = 8; 114 114 size_t size = dim * elemSize; 115 116 int FillT = 9; 117 char FillC = 'a'; 118 int * FillA = calloc( dim / 4 ); 119 T1 FillT1 = { FillT }; 120 T1 * FillT1A = (T1 *)(void *) malloc( (dim / 4) * sizeof(T1) ); 121 for ( i; 0 ~ (dim / 4) ) FillT1A[i] = FillT1; 122 123 int * ip; 124 int * op; 125 double * dp; 126 T1 * t1p; 127 T1 * t1op; 115 size_t align = 64; 116 const size_t libAlign = libAlign(); 117 118 int FillT = 9; 119 char FillC = 'a'; 120 int * FillA = calloc(dim / 4); 121 T1 FillT1 = { FillT }; 122 T1 * FillT1A = (T1 *)(void *) malloc( (dim / 4) * sizeof(T1) ); 123 for (i; 0 ~ (dim / 4) ) FillT1A[i] = FillT1; 124 125 int * ip; 126 int * op; 127 double * dp; 128 T1 * t1p; 129 T1 * t1op; 128 130 129 131 // testing alloc … … 134 136 135 137 ip = alloc(); 136 test_base( ip, elemSize, libAlign);137 test_use( ip, elemSize / elemSize);138 free( ip);138 test_base(ip, elemSize, libAlign); 139 test_use(ip, elemSize / elemSize); 140 free(ip); 139 141 140 142 ip = alloc( dim ); 141 test_base( ip, size, libAlign);142 test_use( ip, size / elemSize);143 free( ip);143 test_base(ip, size, libAlign); 144 test_use(ip, size / elemSize); 145 free(ip); 144 146 145 147 ip = alloc( 0 ); 146 test_base( ip, 0, libAlign);147 free( ip);148 test_base(ip, 0, libAlign); 149 free(ip); 148 150 149 151 dp = alloc( dim ); 150 152 ip = alloc( dp`resize ); 151 test_base( ip, elemSize, libAlign);152 test_use( ip, elemSize / elemSize);153 
free( ip);154 155 ip = alloc( ((double *)0p)`resize );156 test_base( ip, elemSize, libAlign);157 test_use( ip, elemSize / elemSize);158 free( ip);153 test_base(ip, elemSize, libAlign); 154 test_use(ip, elemSize / elemSize); 155 free(ip); 156 157 ip = alloc( ((double*)0p)`resize ); 158 test_base(ip, elemSize, libAlign); 159 test_use(ip, elemSize / elemSize); 160 free(ip); 159 161 160 162 dp = alloc( dim ); 161 163 ip = alloc( dim, dp`resize ); 162 test_base( ip, size, libAlign);163 test_use( ip, size / elemSize);164 free( ip);164 test_base(ip, size, libAlign); 165 test_use(ip, size / elemSize); 166 free(ip); 165 167 166 168 dp = alloc( dim ); 167 169 ip = alloc( 0, dp`resize ); 168 test_base( ip, 0, libAlign);169 free( ip);170 171 ip = alloc( dim, 0p`resize );172 test_base( ip, size, libAlign);173 test_use( ip, size / elemSize);174 free( ip);175 176 ip = alloc( 0, 0p`resize );177 test_base( ip, 0, libAlign);178 free( ip);179 180 op = alloc( dim, 0xdeadbeefN`fill );170 test_base(ip, 0, libAlign); 171 free(ip); 172 173 ip = alloc( dim, ((double*)0p)`resize ); 174 test_base(ip, size, libAlign); 175 test_use(ip, size / elemSize); 176 free(ip); 177 178 ip = alloc( 0, ((double*)0p)`resize ); 179 test_base(ip, 0, libAlign); 180 free(ip); 181 182 op = alloc( dim, ((int)0xdeadbeef)`fill ); 181 183 ip = alloc( dim, op`realloc ); 182 test_base( ip, size, libAlign);183 test_fill( ip, 0, dim, 0xdeadbeefN);184 test_use( ip, size / elemSize);185 free( ip);186 187 op = alloc( dim, 0xdeadbeefN`fill );184 test_base(ip, size, libAlign); 185 test_fill(ip, 0, dim, (int)0xdeadbeef); 186 test_use(ip, size / elemSize); 187 free(ip); 188 189 op = alloc( dim, ((int)0xdeadbeef)`fill ); 188 190 ip = alloc( 0, op`realloc ); 189 test_base( ip, 0, libAlign);190 free( ip);191 192 ip = alloc( dim, 0p`realloc );193 test_base( ip, size, libAlign);194 test_use( ip, size / elemSize);195 free( ip);196 197 ip = alloc( 0, 0p`realloc );198 test_base( ip, 0, libAlign);199 free( ip);200 201 op = alloc( dim, 
0xdeadbeefN`fill );191 test_base(ip, 0, libAlign); 192 free(ip); 193 194 ip = alloc( dim, ((int*)0p)`realloc ); 195 test_base(ip, size, libAlign); 196 test_use(ip, size / elemSize); 197 free(ip); 198 199 ip = alloc( 0, ((int*)0p)`realloc ); 200 test_base(ip, 0, libAlign); 201 free(ip); 202 203 op = alloc( dim, ((int)0xdeadbeef)`fill ); 202 204 ip = alloc( dim, op`resize ); 203 test_base( ip, size, libAlign);204 test_use( ip, size / elemSize);205 free( ip);205 test_base(ip, size, libAlign); 206 test_use(ip, size / elemSize); 207 free(ip); 206 208 207 209 ip = alloc( FillC`fill ); 208 test_base( ip, elemSize, libAlign);209 test_fill( ip, 0, elemSize, FillC);210 test_use( ip, elemSize / elemSize);211 free( ip);210 test_base(ip, elemSize, libAlign); 211 test_fill(ip, 0, elemSize, FillC); 212 test_use(ip, elemSize / elemSize); 213 free(ip); 212 214 213 215 ip = alloc( FillT`fill ); 214 test_base( ip, elemSize, libAlign);215 test_fill( ip, 0, 1, FillT);216 test_use( ip, elemSize / elemSize);217 free( ip);216 test_base(ip, elemSize, libAlign); 217 test_fill(ip, 0, 1, FillT); 218 test_use(ip, elemSize / elemSize); 219 free(ip); 218 220 219 221 ip = alloc( dim, FillC`fill ); 220 test_base( ip, size, libAlign);221 test_fill( ip, 0, size, FillC);222 test_use( ip, size / elemSize);223 free( ip);222 test_base(ip, size, libAlign); 223 test_fill(ip, 0, size, FillC); 224 test_use(ip, size / elemSize); 225 free(ip); 224 226 225 227 ip = alloc( 0, FillC`fill ); 226 test_base( ip, 0, libAlign);227 free( ip);228 test_base(ip, 0, libAlign); 229 free(ip); 228 230 229 231 ip = alloc( dim, FillT`fill ); 230 test_base( ip, size, libAlign);231 test_fill( ip, 0, dim, FillT);232 test_use( ip, size / elemSize);233 free( ip);232 test_base(ip, size, libAlign); 233 test_fill(ip, 0, dim, FillT); 234 test_use(ip, size / elemSize); 235 free(ip); 234 236 235 237 ip = alloc( 0, FillT`fill ); 236 test_base( ip, 0, libAlign);237 free( ip);238 test_base(ip, 0, libAlign); 239 free(ip); 238 240 239 241 ip 
= alloc( dim, [FillA, dim/4]`fill ); 240 test_base( ip, size, libAlign);241 test_fill( ip, 0, size/4, FillA);242 test_use( ip, size / elemSize);243 free( ip);242 test_base(ip, size, libAlign); 243 test_fill(ip, 0, size/4, FillA); 244 test_use(ip, size / elemSize); 245 free(ip); 244 246 245 247 ip = alloc( 0, [FillA, dim/4]`fill ); 246 test_base( ip, 0, libAlign);247 free( ip);248 249 op = alloc( dim, 0xdeadbeefN`fill );248 test_base(ip, 0, libAlign); 249 free(ip); 250 251 op = alloc( dim, ((int)0xdeadbeef)`fill ); 250 252 ip = alloc( dim, op`realloc, FillC`fill ); 251 test_base( ip, size, libAlign);252 test_fill( ip, 0, dim, 0xdeadbeefN);253 test_use( ip, size / elemSize);254 free( ip);255 256 op = alloc( dim, 0xdeadbeefN`fill );253 test_base(ip, size, libAlign); 254 test_fill(ip, 0, dim, (int)0xdeadbeef); 255 test_use(ip, size / elemSize); 256 free(ip); 257 258 op = alloc( dim, ((int)0xdeadbeef)`fill ); 257 259 ip = alloc( dim / 4, op`realloc, FillC`fill ); 258 test_base( ip, size / 4, libAlign);259 test_fill( ip, 0, dim / 4, 0xdeadbeefN);260 test_use( ip, size / 4 / elemSize);261 free( ip);262 263 op = alloc( dim, 0xdeadbeefN`fill );260 test_base(ip, size / 4, libAlign); 261 test_fill(ip, 0, dim / 4, (int)0xdeadbeef); 262 test_use(ip, size / 4 / elemSize); 263 free(ip); 264 265 op = alloc( dim, ((int)0xdeadbeef)`fill ); 264 266 ip = alloc( dim * 4, op`realloc, FillC`fill ); 265 test_base( ip, size * 4, libAlign);266 test_fill( ip, 0, dim, 0xdeadbeefN);267 test_fill( ip, size, size * 4, FillC);268 test_use( ip, size * 4 / elemSize);269 free( ip);270 271 op = alloc( dim, 0xdeadbeefN`fill );267 test_base(ip, size * 4, libAlign); 268 test_fill(ip, 0, dim, (int)0xdeadbeef); 269 test_fill(ip, size, size * 4, FillC); 270 test_use(ip, size * 4 / elemSize); 271 free(ip); 272 273 op = alloc( dim, ((int)0xdeadbeef)`fill ); 272 274 ip = alloc( 0, op`realloc, FillC`fill ); 273 test_base( ip, 0, libAlign);274 free( ip);275 276 ip = alloc( dim, 0p`realloc, FillC`fill );277 
test_base( ip, size, libAlign);278 test_fill( ip, 0, size, FillC);279 test_use( ip, size / elemSize);280 free( ip);281 282 ip = alloc( 0, 0p`realloc, FillC`fill );283 test_base( ip, 0, libAlign);284 free( ip);285 286 op = alloc( dim, 0xdeadbeefN`fill );275 test_base(ip, 0, libAlign); 276 free(ip); 277 278 ip = alloc( dim, ((int*)0p)`realloc, FillC`fill ); 279 test_base(ip, size, libAlign); 280 test_fill(ip, 0, size, FillC); 281 test_use(ip, size / elemSize); 282 free(ip); 283 284 ip = alloc( 0, ((int*)0p)`realloc, FillC`fill ); 285 test_base(ip, 0, libAlign); 286 free(ip); 287 288 op = alloc( dim, ((int)0xdeadbeef)`fill ); 287 289 ip = alloc( dim, op`realloc, FillT`fill ); 288 test_base( ip, size, libAlign);289 test_fill( ip, 0, dim, 0xdeadbeefN);290 test_use( ip, size / elemSize);291 free( ip);292 293 op = alloc( dim, 0xdeadbeefN`fill );290 test_base(ip, size, libAlign); 291 test_fill(ip, 0, dim, (int)0xdeadbeef); 292 test_use(ip, size / elemSize); 293 free(ip); 294 295 op = alloc( dim, ((int)0xdeadbeef)`fill ); 294 296 ip = alloc( dim / 4, op`realloc, FillT`fill ); 295 test_base( ip, size / 4, libAlign);296 test_fill( ip, 0, dim / 4, 0xdeadbeefN);297 test_use( ip, size / 4 / elemSize);298 free( ip);299 300 op = alloc( dim, 0xdeadbeefN`fill );297 test_base(ip, size / 4, libAlign); 298 test_fill(ip, 0, dim / 4, (int)0xdeadbeef); 299 test_use(ip, size / 4 / elemSize); 300 free(ip); 301 302 op = alloc( dim, ((int)0xdeadbeef)`fill ); 301 303 ip = alloc( dim * 4, op`realloc, FillT`fill ); 302 test_base( ip, size * 4, libAlign);303 test_fill( ip, 0, dim, 0xdeadbeefN);304 test_fill( ip, dim, dim * 4, FillT);305 test_use( ip, size * 4 / elemSize);306 free( ip);307 308 op = alloc( dim, 0xdeadbeefN`fill );304 test_base(ip, size * 4, libAlign); 305 test_fill(ip, 0, dim, (int)0xdeadbeef); 306 test_fill(ip, dim, dim * 4, FillT); 307 test_use(ip, size * 4 / elemSize); 308 free(ip); 309 310 op = alloc( dim, ((int)0xdeadbeef)`fill ); 309 311 ip = alloc( 0, op`realloc, FillT`fill 
); 310 test_base( ip, 0, libAlign);311 free( ip);312 313 ip = alloc( dim, 0p`realloc, FillT`fill );314 test_base( ip, size, libAlign);315 test_fill( ip, 0, dim, FillT);316 test_use( ip, size / elemSize);317 free( ip);318 319 ip = alloc( 0, 0p`realloc, FillT`fill );320 test_base( ip, 0, libAlign);321 free( ip);312 test_base(ip, 0, libAlign); 313 free(ip); 314 315 ip = alloc( dim, ((int*)0p)`realloc, FillT`fill ); 316 test_base(ip, size, libAlign); 317 test_fill(ip, 0, dim, FillT); 318 test_use(ip, size / elemSize); 319 free(ip); 320 321 ip = alloc( 0, ((int*)0p)`realloc, FillT`fill ); 322 test_base(ip, 0, libAlign); 323 free(ip); 322 324 323 325 ip = alloc( align`align ); 324 test_base( ip, elemSize, align);325 test_use( ip, elemSize / elemSize);326 free( ip);326 test_base(ip, elemSize, align); 327 test_use(ip, elemSize / elemSize); 328 free(ip); 327 329 328 330 ip = alloc( dim, align`align ); 329 test_base( ip, size, align);330 test_use( ip, size / elemSize);331 free( ip);331 test_base(ip, size, align); 332 test_use(ip, size / elemSize); 333 free(ip); 332 334 333 335 ip = alloc( 0, align`align ); 334 test_base( ip, 0, libAlign);335 free( ip);336 337 op = alloc( dim, 0xdeadbeefN`fill );336 test_base(ip, 0, libAlign); 337 free(ip); 338 339 op = alloc( dim, ((int)0xdeadbeef)`fill ); 338 340 ip = alloc( op`realloc, align`align ); 339 test_base( ip, elemSize, align);340 test_fill( ip, 0, 1, 0xdeadbeefN);341 test_use( ip, elemSize / elemSize);342 free( ip);343 344 ip = alloc( 0p`realloc, align`align );345 test_base( ip, elemSize, align);346 test_use( ip, elemSize / elemSize);347 free( ip);341 test_base(ip, elemSize, align); 342 test_fill(ip, 0, 1, (int)0xdeadbeef); 343 test_use(ip, elemSize / elemSize); 344 free(ip); 345 346 ip = alloc( ((int*)0p)`realloc, align`align ); 347 test_base(ip, elemSize, align); 348 test_use(ip, elemSize / elemSize); 349 free(ip); 348 350 349 351 dp = alloc( dim ); 350 352 ip = alloc( dp`resize, align`align ); 351 test_base( ip, elemSize, 
align);352 test_use( ip, elemSize / elemSize);353 free( ip);354 355 ip = alloc( 0p`resize, align`align );356 test_base( ip, elemSize, align);357 test_use( ip, elemSize / elemSize);358 free( ip);359 360 op = alloc( dim, 0xdeadbeefN`fill);353 test_base(ip, elemSize, align); 354 test_use(ip, elemSize / elemSize); 355 free(ip); 356 357 ip = alloc( ((double*)0p)`resize, align`align ); 358 test_base(ip, elemSize, align); 359 test_use(ip, elemSize / elemSize); 360 free(ip); 361 362 op = alloc( dim, ((int)0xdeadbeef)`fill); 361 363 ip = alloc( dim, op`realloc, align`align ); 362 test_base( ip, size, align);363 test_fill( ip, 0, dim, 0xdeadbeefN);364 test_use( ip, size / elemSize);365 free( ip);366 367 op = alloc( dim, 0xdeadbeefN`fill );364 test_base(ip, size, align); 365 test_fill(ip, 0, dim, (int)0xdeadbeef); 366 test_use(ip, size / elemSize); 367 free(ip); 368 369 op = alloc( dim, ((int)0xdeadbeef)`fill ); 368 370 ip = alloc( 0, op`realloc, align`align ); 369 test_base( ip, 0, libAlign);370 free( ip);371 372 ip = alloc( dim, 0p`realloc, align`align );373 test_base( ip, size, align);374 test_use( ip, size / elemSize);375 free( ip);376 377 ip = alloc( 0, 0p`realloc, align`align );378 test_base( ip, 0, libAlign);379 free( ip);371 test_base(ip, 0, libAlign); 372 free(ip); 373 374 ip = alloc( dim, ((int*)0p)`realloc, align`align ); 375 test_base(ip, size, align); 376 test_use(ip, size / elemSize); 377 free(ip); 378 379 ip = alloc( 0, ((int*)0p)`realloc, align`align ); 380 test_base(ip, 0, libAlign); 381 free(ip); 380 382 381 383 ip = alloc( align`align, FillC`fill ); 382 test_base( ip, elemSize, align);383 test_fill( ip, 0, elemSize, FillC);384 test_use( ip, elemSize / elemSize);385 free( ip);384 test_base(ip, elemSize, align); 385 test_fill(ip, 0, elemSize, FillC); 386 test_use(ip, elemSize / elemSize); 387 free(ip); 386 388 387 389 ip = alloc( align`align, FillT`fill ); 388 test_base( ip, elemSize, align);389 test_fill( ip, 0, 1, FillT);390 test_use( ip, elemSize / 
elemSize);391 free( ip);390 test_base(ip, elemSize, align); 391 test_fill(ip, 0, 1, FillT); 392 test_use(ip, elemSize / elemSize); 393 free(ip); 392 394 393 395 ip = alloc( dim, align`align, FillC`fill ); 394 test_base( ip, size, align);395 test_fill( ip, 0, size, FillC);396 test_use( ip, size / elemSize);397 free( ip);396 test_base(ip, size, align); 397 test_fill(ip, 0, size, FillC); 398 test_use(ip, size / elemSize); 399 free(ip); 398 400 399 401 ip = alloc( 0, align`align, FillC`fill ); 400 test_base( ip, 0, libAlign);401 free( ip);402 test_base(ip, 0, libAlign); 403 free(ip); 402 404 403 405 ip = alloc( dim, align`align, FillT`fill ); 404 test_base( ip, size, align);405 test_fill( ip, 0, dim, FillT);406 test_use( ip, size / elemSize);407 free( ip);406 test_base(ip, size, align); 407 test_fill(ip, 0, dim, FillT); 408 test_use(ip, size / elemSize); 409 free(ip); 408 410 409 411 ip = alloc( 0, align`align, FillT`fill ); 410 test_base( ip, 0, libAlign);411 free( ip);412 test_base(ip, 0, libAlign); 413 free(ip); 412 414 413 415 ip = alloc( dim, align`align, [FillA, dim/4]`fill ); 414 test_base( ip, size, align);415 test_fill( ip, 0, size/4, FillA);416 test_use( ip, size / elemSize);417 free( ip);416 test_base(ip, size, align); 417 test_fill(ip, 0, size/4, FillA); 418 test_use(ip, size / elemSize); 419 free(ip); 418 420 419 421 ip = alloc( 0, align`align, [FillA, dim/4]`fill ); 420 test_base( ip, 0, libAlign);421 free( ip);422 423 op = alloc( dim, 0xdeadbeefN`fill );422 test_base(ip, 0, libAlign); 423 free(ip); 424 425 op = alloc( dim, ((int)0xdeadbeef)`fill ); 424 426 ip = alloc( dim, op`realloc, align`align, FillC`fill ); 425 test_base( ip, size, align);426 test_fill( ip, 0, dim, 0xdeadbeefN);427 test_use( ip, size / elemSize);428 free( ip);429 430 op = alloc( dim, 0xdeadbeefN`fill );427 test_base(ip, size, align); 428 test_fill(ip, 0, dim, (int)0xdeadbeef); 429 test_use(ip, size / elemSize); 430 free(ip); 431 432 op = alloc( dim, ((int)0xdeadbeef)`fill ); 431 433 
ip = alloc( dim / 4, op`realloc, align`align, FillC`fill ); 432 test_base( ip, size / 4, align);433 test_fill( ip, 0, dim / 4, 0xdeadbeefN);434 test_use( ip, size / 4 / elemSize);435 free( ip);436 437 op = alloc( dim, 0xdeadbeefN`fill );434 test_base(ip, size / 4, align); 435 test_fill(ip, 0, dim / 4, (int)0xdeadbeef); 436 test_use(ip, size / 4 / elemSize); 437 free(ip); 438 439 op = alloc( dim, ((int)0xdeadbeef)`fill ); 438 440 ip = alloc( dim * 4, op`realloc, align`align, FillC`fill ); 439 test_base( ip, size * 4, align);440 test_fill( ip, 0, dim, 0xdeadbeefN);441 test_fill( ip, size, size * 4, FillC);442 test_use( ip, size * 4 / elemSize);443 free( ip);444 445 op = alloc( dim, 0xdeadbeefN`fill );441 test_base(ip, size * 4, align); 442 test_fill(ip, 0, dim, (int)0xdeadbeef); 443 test_fill(ip, size, size * 4, FillC); 444 test_use(ip, size * 4 / elemSize); 445 free(ip); 446 447 op = alloc( dim, ((int)0xdeadbeef)`fill ); 446 448 ip = alloc( 0, op`realloc, align`align, FillC`fill ); 447 test_base( ip, 0, libAlign);448 free( ip);449 450 ip = alloc( dim, 0p`realloc, align`align, FillC`fill );451 test_base( ip, size, align);452 test_fill( ip, 0, size, FillC);453 test_use( ip, size / elemSize);454 free( ip);455 456 ip = alloc( 0, 0p`realloc, align`align, FillC`fill );457 test_base( ip, 0, libAlign);458 free( ip);459 460 op = alloc( dim, 0xdeadbeefN`fill );449 test_base(ip, 0, libAlign); 450 free(ip); 451 452 ip = alloc( dim, ((int*)0p)`realloc, align`align, FillC`fill ); 453 test_base(ip, size, align); 454 test_fill(ip, 0, size, FillC); 455 test_use(ip, size / elemSize); 456 free(ip); 457 458 ip = alloc( 0, ((int*)0p)`realloc, align`align, FillC`fill ); 459 test_base(ip, 0, libAlign); 460 free(ip); 461 462 op = alloc( dim, ((int)0xdeadbeef)`fill ); 461 463 ip = alloc( dim, op`realloc, align`align, FillT`fill ); 462 test_base( ip, size, align);463 test_fill( ip, 0, dim, 0xdeadbeefN);464 test_use( ip, size / elemSize);465 free( ip);466 467 op = alloc( dim, 0xdeadbeefN`fill 
);464 test_base(ip, size, align); 465 test_fill(ip, 0, dim, (int)0xdeadbeef); 466 test_use(ip, size / elemSize); 467 free(ip); 468 469 op = alloc( dim, ((int)0xdeadbeef)`fill ); 468 470 ip = alloc( dim / 4, op`realloc, align`align, FillT`fill ); 469 test_base( ip, size / 4, align);470 test_fill( ip, 0, dim / 4, 0xdeadbeefN);471 test_use( ip, size / 4 / elemSize);472 free( ip);473 474 op = alloc( dim, 0xdeadbeefN`fill );471 test_base(ip, size / 4, align); 472 test_fill(ip, 0, dim / 4, (int)0xdeadbeef); 473 test_use(ip, size / 4 / elemSize); 474 free(ip); 475 476 op = alloc( dim, ((int)0xdeadbeef)`fill ); 475 477 ip = alloc( dim * 4, op`realloc, align`align, FillT`fill ); 476 test_base( ip, size * 4, align);477 test_fill( ip, 0, dim, 0xdeadbeefN);478 test_fill( ip, dim, dim * 4, FillT);479 test_use( ip, size * 4 / elemSize);480 free( ip);481 482 op = alloc( dim, 0xdeadbeefN`fill );478 test_base(ip, size * 4, align); 479 test_fill(ip, 0, dim, (int)0xdeadbeef); 480 test_fill(ip, dim, dim * 4, FillT); 481 test_use(ip, size * 4 / elemSize); 482 free(ip); 483 484 op = alloc( dim, ((int)0xdeadbeef)`fill ); 483 485 ip = alloc( 0, op`realloc, align`align, FillT`fill ); 484 test_base( ip, 0, libAlign);485 free( ip);486 487 ip = alloc( dim, 0p`realloc, align`align, FillT`fill );488 test_base( ip, size, align);489 test_fill( ip, 0, dim, FillT);490 test_use( ip, size / elemSize);491 free( ip);492 493 ip = alloc( 0, 0p`realloc, align`align, FillT`fill );494 test_base( ip, 0, libAlign);495 free( ip);496 497 if ( tests_failed == 0 ) printf( "PASSED alloc tests\n\n");498 else printf( "failed alloc tests : %d/%d\n\n", tests_failed, tests_total);499 500 // testing alloc ( aligned struct)486 test_base(ip, 0, libAlign); 487 free(ip); 488 489 ip = alloc( dim, ((int*)0p)`realloc, align`align, FillT`fill ); 490 test_base(ip, size, align); 491 test_fill(ip, 0, dim, FillT); 492 test_use(ip, size / elemSize); 493 free(ip); 494 495 ip = alloc( 0, ((int*)0p)`realloc, align`align, FillT`fill ); 
496 test_base(ip, 0, libAlign); 497 free(ip); 498 499 if (tests_failed == 0) printf("PASSED alloc tests\n\n"); 500 else printf("failed alloc tests : %d/%d\n\n", tests_failed, tests_total); 501 502 // testing alloc (aligned struct) 501 503 502 504 elemSize = sizeof(T1); … … 507 509 508 510 t1p = alloc(); 509 test_base( t1p, elemSize, tAlign);510 test_use( t1p, elemSize / elemSize);511 free( t1p);511 test_base(t1p, elemSize, tAlign); 512 test_use(t1p, elemSize / elemSize); 513 free(t1p); 512 514 513 515 t1p = alloc( dim ); 514 test_base( t1p, size, tAlign);515 test_use( t1p, size / elemSize);516 free( t1p);516 test_base(t1p, size, tAlign); 517 test_use(t1p, size / elemSize); 518 free(t1p); 517 519 518 520 t1p = alloc( 0 ); 519 test_base( t1p, 0, libAlign);520 free( t1p);521 test_base(t1p, 0, libAlign); 522 free(t1p); 521 523 522 524 dp = alloc( dim ); 523 525 t1p = alloc( dp`resize ); 524 test_base( t1p, elemSize, tAlign);525 test_use( t1p, elemSize / elemSize);526 free( t1p);527 528 t1p = alloc( 0p`resize );529 test_base( t1p, elemSize, tAlign);530 test_use( t1p, elemSize / elemSize);531 free( t1p);526 test_base(t1p, elemSize, tAlign); 527 test_use(t1p, elemSize / elemSize); 528 free(t1p); 529 530 t1p = alloc( ((double*)0p)`resize ); 531 test_base(t1p, elemSize, tAlign); 532 test_use(t1p, elemSize / elemSize); 533 free(t1p); 532 534 533 535 dp = alloc( dim ); 534 536 t1p = alloc( dim, dp`resize ); 535 test_base( t1p, size, tAlign);536 test_use( t1p, size / elemSize);537 free( t1p);537 test_base(t1p, size, tAlign); 538 test_use(t1p, size / elemSize); 539 free(t1p); 538 540 539 541 dp = alloc( dim ); 540 542 t1p = alloc( 0, dp`resize ); 541 test_base( t1p, 0, libAlign);542 free( t1p);543 544 t1p = alloc( dim, 0p`resize );545 test_base( t1p, size, tAlign);546 test_use( t1p, size / elemSize);547 free( t1p);548 549 t1p = alloc( 0, 0p`resize );550 test_base( t1p, 0, libAlign);551 free( t1p);543 test_base(t1p, 0, libAlign); 544 free(t1p); 545 546 t1p = alloc( dim, 
((double*)0p)`resize ); 547 test_base(t1p, size, tAlign); 548 test_use(t1p, size / elemSize); 549 free(t1p); 550 551 t1p = alloc( 0, ((double*)0p)`resize ); 552 test_base(t1p, 0, libAlign); 553 free(t1p); 552 554 553 555 t1op = alloc( dim, ((T1){0xdeadbeef})`fill ); 554 556 t1p = alloc( dim, t1op`realloc ); 555 test_base( t1p, size, tAlign);556 test_fill( t1p, 0, dim, (T1){0xdeadbeef});557 test_use( t1p, size / elemSize);558 free( t1p);557 test_base(t1p, size, tAlign); 558 test_fill(t1p, 0, dim, (T1){0xdeadbeef}); 559 test_use(t1p, size / elemSize); 560 free(t1p); 559 561 560 562 t1op = alloc( dim, ((T1){0xdeadbeef})`fill ); 561 563 t1p = alloc( 0, t1op`realloc ); 562 test_base( t1p, 0, libAlign);563 free( t1p);564 565 t1p = alloc( dim, 0p`realloc );566 test_base( t1p, size, tAlign);567 test_use( t1p, size / elemSize);568 free( t1p);569 570 t1p = alloc( 0, 0p`realloc );571 test_base( t1p, 0, libAlign);572 free( t1p);564 test_base(t1p, 0, libAlign); 565 free(t1p); 566 567 t1p = alloc( dim, ((T1*)0p)`realloc ); 568 test_base(t1p, size, tAlign); 569 test_use(t1p, size / elemSize); 570 free(t1p); 571 572 t1p = alloc( 0, ((T1*)0p)`realloc ); 573 test_base(t1p, 0, libAlign); 574 free(t1p); 573 575 574 576 t1op = alloc( dim, ((T1){0xdeadbeef})`fill ); 575 577 t1p = alloc( dim, t1op`resize ); 576 test_base( t1p, size, tAlign);577 test_use( t1p, size / elemSize);578 free( t1p);578 test_base(t1p, size, tAlign); 579 test_use(t1p, size / elemSize); 580 free(t1p); 579 581 580 582 t1p = alloc( FillC`fill ); 581 test_base( t1p, elemSize, tAlign);582 test_fill( t1p, 0, elemSize, FillC);583 test_use( t1p, elemSize / elemSize);584 free( t1p);583 test_base(t1p, elemSize, tAlign); 584 test_fill(t1p, 0, elemSize, FillC); 585 test_use(t1p, elemSize / elemSize); 586 free(t1p); 585 587 586 588 t1p = alloc( FillT1`fill ); 587 test_base( t1p, elemSize, tAlign);588 test_fill( t1p, 0, 1, FillT1);589 test_use( t1p, elemSize / elemSize);590 free( t1p);589 test_base(t1p, elemSize, tAlign); 590 
test_fill(t1p, 0, 1, FillT1); 591 test_use(t1p, elemSize / elemSize); 592 free(t1p); 591 593 592 594 t1p = alloc( dim, FillC`fill ); 593 test_base( t1p, size, tAlign);594 test_fill( t1p, 0, size, FillC);595 test_use( t1p, size / elemSize);596 free( t1p);595 test_base(t1p, size, tAlign); 596 test_fill(t1p, 0, size, FillC); 597 test_use(t1p, size / elemSize); 598 free(t1p); 597 599 598 600 t1p = alloc( 0, FillC`fill ); 599 test_base( t1p, 0, libAlign);600 free( t1p);601 test_base(t1p, 0, libAlign); 602 free(t1p); 601 603 602 604 t1p = alloc( dim, FillT1`fill ); 603 test_base( t1p, size, tAlign);604 test_fill( t1p, 0, dim, FillT1);605 test_use( t1p, size / elemSize);606 free( t1p);605 test_base(t1p, size, tAlign); 606 test_fill(t1p, 0, dim, FillT1); 607 test_use(t1p, size / elemSize); 608 free(t1p); 607 609 608 610 t1p = alloc( 0, FillT1`fill ); 609 test_base( t1p, 0, libAlign);610 free( t1p);611 test_base(t1p, 0, libAlign); 612 free(t1p); 611 613 612 614 t1p = alloc( dim, [FillT1A, dim / 4]`fill ); 613 test_base( t1p, size, tAlign);614 test_fill( t1p, 0, size/4, FillT1A);615 test_use( t1p, size / elemSize);616 free( t1p);615 test_base(t1p, size, tAlign); 616 test_fill(t1p, 0, size/4, FillT1A); 617 test_use(t1p, size / elemSize); 618 free(t1p); 617 619 618 620 t1p = alloc( 0, [FillT1A, dim / 4]`fill ); 619 test_base( t1p, 0, libAlign);620 free( t1p);621 test_base(t1p, 0, libAlign); 622 free(t1p); 621 623 622 624 t1op = alloc( dim, ((T1){0xdeadbeef})`fill ); 623 625 t1p = alloc( dim, t1op`realloc, FillC`fill ); 624 test_base( t1p, size, tAlign);625 test_fill( t1p, 0, dim, (T1){0xdeadbeef});626 test_use( t1p, size / elemSize);627 free( t1p);626 test_base(t1p, size, tAlign); 627 test_fill(t1p, 0, dim, (T1){0xdeadbeef}); 628 test_use(t1p, size / elemSize); 629 free(t1p); 628 630 629 631 t1op = alloc( dim, ((T1){0xdeadbeef})`fill ); 630 632 t1p = alloc( dim / 4, t1op`realloc, FillC`fill ); 631 test_base( t1p, size / 4, tAlign);632 test_fill( t1p, 0, dim / 4, 
(T1){0xdeadbeef});633 test_use( t1p, size / 4 / elemSize);634 free( t1p);633 test_base(t1p, size / 4, tAlign); 634 test_fill(t1p, 0, dim / 4, (T1){0xdeadbeef}); 635 test_use(t1p, size / 4 / elemSize); 636 free(t1p); 635 637 636 638 t1op = alloc( dim, ((T1){0xdeadbeef})`fill ); 637 639 t1p = alloc( dim * 4, t1op`realloc, FillC`fill ); 638 test_base( t1p, size * 4, tAlign);639 test_fill( t1p, 0, dim, (T1){0xdeadbeef});640 test_fill( t1p, size, size * 4, FillC);641 test_use( t1p, size * 4 / elemSize);642 free( t1p);640 test_base(t1p, size * 4, tAlign); 641 test_fill(t1p, 0, dim, (T1){0xdeadbeef}); 642 test_fill(t1p, size, size * 4, FillC); 643 test_use(t1p, size * 4 / elemSize); 644 free(t1p); 643 645 644 646 t1op = alloc( dim, ((T1){0xdeadbeef})`fill ); 645 647 t1p = alloc( 0, t1op`realloc, FillC`fill ); 646 test_base( t1p, 0, libAlign);647 free( t1p);648 649 t1p = alloc( dim, 0p`realloc, FillC`fill );650 test_base( t1p, size, tAlign);651 test_fill( t1p, 0, size, FillC);652 test_use( t1p, size / elemSize);653 free( t1p);654 655 t1p = alloc( 0, 0p`realloc, FillC`fill );656 test_base( t1p, 0, libAlign);657 free( t1p);648 test_base(t1p, 0, libAlign); 649 free(t1p); 650 651 t1p = alloc( dim, ((T1*)0p)`realloc, FillC`fill ); 652 test_base(t1p, size, tAlign); 653 test_fill(t1p, 0, size, FillC); 654 test_use(t1p, size / elemSize); 655 free(t1p); 656 657 t1p = alloc( 0, ((T1*)0p)`realloc, FillC`fill ); 658 test_base(t1p, 0, libAlign); 659 free(t1p); 658 660 659 661 t1op = alloc( dim, ((T1){0xdeadbeef})`fill ); 660 662 t1p = alloc( dim, t1op`realloc, FillT1`fill ); 661 test_base( t1p, size, tAlign);662 test_fill( t1p, 0, dim, (T1){0xdeadbeef});663 test_use( t1p, size / elemSize);664 free( t1p);663 test_base(t1p, size, tAlign); 664 test_fill(t1p, 0, dim, (T1){0xdeadbeef}); 665 test_use(t1p, size / elemSize); 666 free(t1p); 665 667 666 668 t1op = alloc( dim, ((T1){0xdeadbeef})`fill ); 667 669 t1p = alloc( dim / 4, t1op`realloc, FillT1`fill ); 668 test_base( t1p, size / 4, 
tAlign);669 test_fill( t1p, 0, dim / 4, (T1){0xdeadbeef});670 test_use( t1p, size / 4 / elemSize);671 free( t1p);670 test_base(t1p, size / 4, tAlign); 671 test_fill(t1p, 0, dim / 4, (T1){0xdeadbeef}); 672 test_use(t1p, size / 4 / elemSize); 673 free(t1p); 672 674 673 675 t1op = alloc( dim, ((T1){0xdeadbeef})`fill ); 674 676 t1p = alloc( dim * 4, t1op`realloc, FillT1`fill ); 675 test_base( t1p, size * 4, tAlign);676 test_fill( t1p, 0, dim, (T1){0xdeadbeef});677 test_fill( t1p, dim, dim * 4, FillT1);678 test_use( t1p, size * 4 / elemSize);679 free( t1p);677 test_base(t1p, size * 4, tAlign); 678 test_fill(t1p, 0, dim, (T1){0xdeadbeef}); 679 test_fill(t1p, dim, dim * 4, FillT1); 680 test_use(t1p, size * 4 / elemSize); 681 free(t1p); 680 682 681 683 t1op = alloc( dim, ((T1){0xdeadbeef})`fill ); 682 684 t1p = alloc( 0, t1op`realloc, FillT1`fill ); 683 test_base( t1p, 0, libAlign);684 free( t1p);685 686 t1p = alloc( dim, 0p`realloc, FillT1`fill );687 test_base( t1p, size, tAlign);688 test_fill( t1p, 0, dim, FillT1);689 test_use( t1p, size / elemSize);690 free( t1p);691 692 t1p = alloc( 0, 0p`realloc, FillT1`fill );693 test_base( t1p, 0, libAlign);694 free( t1p);685 test_base(t1p, 0, libAlign); 686 free(t1p); 687 688 t1p = alloc( dim, ((T1*)0p)`realloc, FillT1`fill ); 689 test_base(t1p, size, tAlign); 690 test_fill(t1p, 0, dim, FillT1); 691 test_use(t1p, size / elemSize); 692 free(t1p); 693 694 t1p = alloc( 0, ((T1*)0p)`realloc, FillT1`fill ); 695 test_base(t1p, 0, libAlign); 696 free(t1p); 695 697 696 698 t1p = alloc( align`align ); 697 test_base( t1p, elemSize, align);698 test_use( t1p, elemSize / elemSize);699 free( t1p);699 test_base(t1p, elemSize, align); 700 test_use(t1p, elemSize / elemSize); 701 free(t1p); 700 702 701 703 t1p = alloc( dim, align`align ); 702 test_base( t1p, size, align);703 test_use( t1p, size / elemSize);704 free( t1p);704 test_base(t1p, size, align); 705 test_use(t1p, size / elemSize); 706 free(t1p); 705 707 706 708 t1p = alloc( 0, align`align ); 
707 test_base( t1p, 0, libAlign);708 free( t1p);709 test_base(t1p, 0, libAlign); 710 free(t1p); 709 711 710 712 t1op = alloc( dim, ((T1){0xdeadbeef})`fill ); 711 713 t1p = alloc( t1op`realloc, align`align ); 712 test_base( t1p, elemSize, align);713 test_fill( t1p, 0, 1, (T1){0xdeadbeef});714 test_use( t1p, elemSize / elemSize);715 free( t1p);716 717 t1p = alloc( 0p`realloc, align`align );718 test_base( t1p, elemSize, align);719 test_use( t1p, elemSize / elemSize);720 free( t1p);714 test_base(t1p, elemSize, align); 715 test_fill(t1p, 0, 1, (T1){0xdeadbeef}); 716 test_use(t1p, elemSize / elemSize); 717 free(t1p); 718 719 t1p = alloc( ((T1*)0p)`realloc, align`align ); 720 test_base(t1p, elemSize, align); 721 test_use(t1p, elemSize / elemSize); 722 free(t1p); 721 723 722 724 dp = alloc( dim ); 723 725 t1p = alloc( dp`resize, align`align ); 724 test_base( t1p, elemSize, align);725 test_use( t1p, elemSize / elemSize);726 free( t1p);727 728 t1p = alloc( 0p`resize, align`align );729 test_base( t1p, elemSize, align);730 test_use( t1p, elemSize / elemSize);731 free( t1p);726 test_base(t1p, elemSize, align); 727 test_use(t1p, elemSize / elemSize); 728 free(t1p); 729 730 t1p = alloc( ((double*)0p)`resize, align`align ); 731 test_base(t1p, elemSize, align); 732 test_use(t1p, elemSize / elemSize); 733 free(t1p); 732 734 733 735 t1op = alloc( dim, ((T1){0xdeadbeef})`fill ); 734 736 t1p = alloc( dim, t1op`realloc, align`align ); 735 test_base( t1p, size, align);736 test_fill( t1p, 0, dim, (T1){0xdeadbeef});737 test_use( t1p, size / elemSize);738 free( t1p);737 test_base(t1p, size, align); 738 test_fill(t1p, 0, dim, (T1){0xdeadbeef}); 739 test_use(t1p, size / elemSize); 740 free(t1p); 739 741 740 742 t1op = alloc( dim, ((T1){0xdeadbeef})`fill ); 741 743 t1p = alloc( 0, t1op`realloc, align`align ); 742 test_base( t1p, 0, libAlign);743 free( t1p);744 745 t1p = alloc( dim, 0p`realloc, align`align );746 test_base( t1p, size, align);747 test_use( t1p, size / elemSize);748 free( t1p);749 
750 t1p = alloc( 0, 0p`realloc, align`align );751 test_base( t1p, 0, libAlign);752 free( t1p);744 test_base(t1p, 0, libAlign); 745 free(t1p); 746 747 t1p = alloc( dim, ((T1*)0p)`realloc, align`align ); 748 test_base(t1p, size, align); 749 test_use(t1p, size / elemSize); 750 free(t1p); 751 752 t1p = alloc( 0, ((T1*)0p)`realloc, align`align ); 753 test_base(t1p, 0, libAlign); 754 free(t1p); 753 755 754 756 t1p = alloc( align`align, FillC`fill ); 755 test_base( t1p, elemSize, align);756 test_fill( t1p, 0, elemSize, FillC);757 test_use( t1p, elemSize / elemSize);758 free( t1p);757 test_base(t1p, elemSize, align); 758 test_fill(t1p, 0, elemSize, FillC); 759 test_use(t1p, elemSize / elemSize); 760 free(t1p); 759 761 760 762 t1p = alloc( align`align, FillT1`fill ); 761 test_base( t1p, elemSize, align);762 test_fill( t1p, 0, 1, FillT1);763 test_use( t1p, elemSize / elemSize);764 free( t1p);763 test_base(t1p, elemSize, align); 764 test_fill(t1p, 0, 1, FillT1); 765 test_use(t1p, elemSize / elemSize); 766 free(t1p); 765 767 766 768 t1p = alloc( dim, align`align, FillC`fill ); 767 test_base( t1p, size, align);768 test_fill( t1p, 0, size, FillC);769 test_use( t1p, size / elemSize);770 free( t1p);769 test_base(t1p, size, align); 770 test_fill(t1p, 0, size, FillC); 771 test_use(t1p, size / elemSize); 772 free(t1p); 771 773 772 774 t1p = alloc( 0, align`align, FillC`fill ); 773 test_base( t1p, 0, libAlign);774 free( t1p);775 test_base(t1p, 0, libAlign); 776 free(t1p); 775 777 776 778 t1p = alloc( dim, align`align, FillT1`fill ); 777 test_base( t1p, size, align);778 test_fill( t1p, 0, dim, FillT1);779 test_use( t1p, size / elemSize);780 free( t1p);779 test_base(t1p, size, align); 780 test_fill(t1p, 0, dim, FillT1); 781 test_use(t1p, size / elemSize); 782 free(t1p); 781 783 782 784 t1p = alloc( 0, align`align, FillT1`fill ); 783 test_base( t1p, 0, libAlign);784 free( t1p);785 test_base(t1p, 0, libAlign); 786 free(t1p); 785 787 786 788 t1p = alloc( dim, align`align, [FillT1A, dim / 
4]`fill ); 787 test_base( t1p, size, align);788 test_fill( t1p, 0, size/4, FillT1A);789 test_use( t1p, size / elemSize);790 free( t1p);789 test_base(t1p, size, align); 790 test_fill(t1p, 0, size/4, FillT1A); 791 test_use(t1p, size / elemSize); 792 free(t1p); 791 793 792 794 t1p = alloc( 0, align`align, [FillT1A, dim / 4]`fill ); 793 test_base( t1p, 0, libAlign);794 free( t1p);795 test_base(t1p, 0, libAlign); 796 free(t1p); 795 797 796 798 t1op = alloc( dim, ((T1){0xdeadbeef})`fill ); 797 799 t1p = alloc( dim, t1op`realloc, align`align, FillC`fill ); 798 test_base( t1p, size, align);799 test_fill( t1p, 0, dim, (T1){0xdeadbeef});800 test_use( t1p, size / elemSize);801 free( t1p);800 test_base(t1p, size, align); 801 test_fill(t1p, 0, dim, (T1){0xdeadbeef}); 802 test_use(t1p, size / elemSize); 803 free(t1p); 802 804 803 805 t1op = alloc( dim, ((T1){0xdeadbeef})`fill ); 804 806 t1p = alloc( dim / 4, t1op`realloc, align`align, FillC`fill ); 805 test_base( t1p, size / 4, align);806 test_fill( t1p, 0, dim / 4, (T1){0xdeadbeef});807 test_use( t1p, size / 4 / elemSize);808 free( t1p);807 test_base(t1p, size / 4, align); 808 test_fill(t1p, 0, dim / 4, (T1){0xdeadbeef}); 809 test_use(t1p, size / 4 / elemSize); 810 free(t1p); 809 811 810 812 t1op = alloc( dim, ((T1){0xdeadbeef})`fill ); 811 813 t1p = alloc( dim * 4, t1op`realloc, align`align, FillC`fill ); 812 test_base( t1p, size * 4, align);813 test_fill( t1p, 0, dim, (T1){0xdeadbeef});814 test_fill( t1p, size, size * 4, FillC);815 test_use( t1p, size * 4 / elemSize);816 free( t1p);814 test_base(t1p, size * 4, align); 815 test_fill(t1p, 0, dim, (T1){0xdeadbeef}); 816 test_fill(t1p, size, size * 4, FillC); 817 test_use(t1p, size * 4 / elemSize); 818 free(t1p); 817 819 818 820 t1op = alloc( dim, ((T1){0xdeadbeef})`fill ); 819 821 t1p = alloc( 0, t1op`realloc, align`align, FillC`fill ); 820 test_base( t1p, 0, libAlign);821 free( t1p);822 823 t1p = alloc( dim, 0p`realloc, align`align, FillC`fill );824 test_base( t1p, size, 
align);825 test_fill( t1p, 0, size, FillC);826 test_use( t1p, size / elemSize);827 free( t1p);828 829 t1p = alloc( 0, 0p`realloc, align`align, FillC`fill );830 test_base( t1p, 0, libAlign);831 free( t1p);832 833 t1op = alloc( dim, ((T1){0xdeadbeef})`fill );822 test_base(t1p, 0, libAlign); 823 free(t1p); 824 825 t1p = alloc( dim, ((T1*)0p)`realloc, align`align, FillC`fill ); 826 test_base(t1p, size, align); 827 test_fill(t1p, 0, size, FillC); 828 test_use(t1p, size / elemSize); 829 free(t1p); 830 831 t1p = alloc( 0, ((T1*)0p)`realloc, align`align, FillC`fill ); 832 test_base(t1p, 0, libAlign); 833 free(t1p); 834 835 t1op = alloc( dim, ((T1){0xdeadbeef})`fill); 834 836 t1p = alloc( dim, t1op`realloc, align`align, FillT1`fill ); 835 test_base( t1p, size, align);836 test_fill( t1p, 0, dim, (T1){0xdeadbeef});837 test_use( t1p, size / elemSize);838 free( t1p);837 test_base(t1p, size, align); 838 test_fill(t1p, 0, dim, (T1){0xdeadbeef}); 839 test_use(t1p, size / elemSize); 840 free(t1p); 839 841 840 842 t1op = alloc( dim, ((T1){0xdeadbeef})`fill ); 841 843 t1p = alloc( dim / 4, t1op`realloc, align`align, FillT1`fill ); 842 test_base( t1p, size / 4, align);843 test_fill( t1p, 0, dim / 4, (T1){0xdeadbeef});844 test_use( t1p, size / 4 / elemSize);845 free( t1p);844 test_base(t1p, size / 4, align); 845 test_fill(t1p, 0, dim / 4, (T1){0xdeadbeef}); 846 test_use(t1p, size / 4 / elemSize); 847 free(t1p); 846 848 847 849 t1op = alloc( dim, ((T1){0xdeadbeef})`fill ); 848 850 t1p = alloc( dim * 4, t1op`realloc, align`align, FillT1`fill ); 849 test_base( t1p, size * 4, align);850 test_fill( t1p, 0, dim, (T1){0xdeadbeef});851 test_fill( t1p, dim, dim * 4, FillT1);852 test_use( t1p, size * 4 / elemSize);853 free( t1p);851 test_base(t1p, size * 4, align); 852 test_fill(t1p, 0, dim, (T1){0xdeadbeef}); 853 test_fill(t1p, dim, dim * 4, FillT1); 854 test_use(t1p, size * 4 / elemSize); 855 free(t1p); 854 856 855 857 t1op = alloc( dim, ((T1){0xdeadbeef})`fill ); 856 858 t1p = alloc( 0, 
t1op`realloc, align`align, FillT1`fill ); 857 test_base( t1p, 0, libAlign ); 858 free( t1p ); 859 860 t1p = alloc( dim, 0p`realloc, align`align, FillT1`fill ); 861 test_base( t1p, size, align ); 862 test_fill( t1p, 0, dim, FillT1); 863 test_use( t1p, size / elemSize ); 864 free( t1p ); 865 866 t1p = alloc( 0, 0p`realloc, align`align, FillT1`fill ); 867 test_base( t1p, 0, libAlign ); 868 free( t1p ); 869 870 if ( tests_failed == 0) printf( "PASSED alloc tests (aligned struct)\n\n"); 871 else printf( "failed alloc tests ( aligned struct ) : %d/%d\n\n", tests_failed, tests_total ); 872 873 printf( "(if applicable) alignment error below indicates memory trashing caused by test_use.\n\n"); 874 free( FillA ); 875 free( FillT1A ); 859 test_base(t1p, 0, libAlign); 860 free(t1p); 861 862 t1p = alloc( dim, ((T1*)0p)`realloc, align`align, FillT1`fill ); 863 test_base(t1p, size, align); 864 test_fill(t1p, 0, dim, FillT1); 865 test_use(t1p, size / elemSize); 866 free(t1p); 867 868 t1p = alloc( 0, ((T1*)0p)`realloc, align`align, FillT1`fill ); 869 test_base(t1p, 0, libAlign); 870 free(t1p); 871 872 if (tests_failed == 0) printf("PASSED alloc tests (aligned struct)\n\n"); 873 else printf("failed alloc tests (aligned struct) : %d/%d\n\n", tests_failed, tests_total); 874 875 printf("(if applicable) alignment error below indicates memory trashing caused by test_use.\n\n"); 876 free(FillA); 877 free(FillT1A); 878 return 0; 876 879 } // main -
tests/enum.cfa
rffec1bf r9e23b446 24 24 } 25 25 26 // test constant-expressions27 28 struct S {29 int i;30 };31 enum K { P = 3 + 4 };32 enum Y { W = 9 + (3 && 4 || 7)};33 int p[W];34 enum { X = W + -3 + ~1 / 2 * (int)4 + sizeof(struct S) + _Alignof(struct S) || 3 && 5 + (3 ? 1 : 2 ) + __builtin_offsetof(struct S, i ) };35 int x[X];36 enum { B = 3 + 4 - 7 * 20 / 34 << 3 >> 4 > 8 < 9 <= 23 >= 42 == 12 != 13 & 4 ^ 2 | 8 + sizeof(struct S) + _Alignof(struct S) };37 int y[B];38 enum { J = +3 + -4 / ~20 * ! 0 };39 int z[J] = { 1, 2, 3 };40 int aa[41] @= { [3] : 3, [1] : 6 };41 42 26 //Dummy main 43 27 int main(int argc, char const *argv[]) { -
tests/enum_tests/structEnum.cfa
rffec1bf r9e23b446 24 24 int main() { 25 25 printf("%d %c\n", apple.x, apple.y); 26 // Failed; enumInstType is now not a real type and not instantiated. 26 // Failed; enumInstType is now not a real type and not instantiated. 27 27 // Not sure if we want that 28 28 // printf("%d %c\n", second.x, second.y); 29 29 return 0; 30 30 } 31 32 33 -
tests/pybin/tools.py
rffec1bf r9e23b446 46 46 47 47 print(cmd) 48 return 0, None , None48 return 0, None 49 49 50 50 with contextlib.ExitStack() as onexit: … … 291 291 ################################################################################ 292 292 def jobserver_version(): 293 make_ret, out, err = sh('make', '.test_makeflags', '-j2', ignore_dry_run = True,output_file=subprocess.PIPE, error=subprocess.PIPE)293 make_ret, out, err = sh('make', '.test_makeflags', '-j2', output_file=subprocess.PIPE, error=subprocess.PIPE) 294 294 if make_ret != 0: 295 295 print("ERROR: cannot find Makefile jobserver version", file=sys.stderr) -
tests/unified_locking/.expect/pthread_locks.txt
rffec1bf r9e23b446 5 5 Start Test 3: lock and condition variable multiple acquire and wait/notify 6 6 Done Test 3 7 Start Test 4: lock and condition variable single timed wait/notify8 Done Test 4 -
tests/unified_locking/mutex_test.hfa
rffec1bf r9e23b446 22 22 } 23 23 24 uint32_t cs( uint32_t & entries) {24 uint32_t cs() { 25 25 thread$ * me = active_thread(); 26 26 uint32_t value; 27 27 lock(mo.l); 28 28 { 29 entries++;30 29 uint32_t tsum = mo.sum; 31 30 uint32_t cnt = mo.cnt; … … 43 42 thread LockCheck { 44 43 uint32_t sum; 45 uint32_t entries;46 44 }; 47 45 48 46 void main(LockCheck & this) { 49 47 this.sum = 0; 50 this.entries = 0;51 48 for(num_times) { 52 49 trash(); 53 this.sum += cs( this.entries);50 this.sum += cs(); 54 51 trash(); 55 52 yield(random(10)); … … 61 58 mo.sum = -32; 62 59 mo.cnt = 0; 63 uint32_t real_entries = 0;64 60 processor p[2]; 65 61 sout | "Starting"; … … 67 63 LockCheck checkers[13]; 68 64 for(i;13) { 69 LockCheck & curr = join(checkers[i]); 70 sum += curr.sum; 71 real_entries += curr.entries; 65 sum += join(checkers[i]).sum; 72 66 } 73 67 } 74 68 sout | "Done!"; 75 if(real_entries != (13 * num_times)) sout | "Invalid real cs count!" | mo.cnt | "vs "| (13 * num_times) | "(13 *" | num_times | ')'; 76 if(mo.cnt != (13 * num_times)) sout | "Invalid concurrent cs count!" | mo.cnt | "vs "| (13 * num_times) | "(13 *" | num_times | ')'; 69 if(mo.cnt != (13 * num_times)) sout | "Invalid cs count!" | mo.cnt | "vs "| (13 * num_times) | "(13 *" | num_times | ')'; 77 70 if(sum == mo.sum) sout | "Match!"; 78 71 else sout | "No Match!" | sum | "vs" | mo.sum; -
tests/unified_locking/pthread_locks.cfa
rffec1bf r9e23b446 3 3 #include <stdlib.hfa> 4 4 #include <thread.hfa> 5 #include <time.h>6 #include <stdlib.hfa>7 5 8 const unsigned int num_times = 50 ;6 const unsigned int num_times = 50000; 9 7 10 8 simple_owner_lock l; 11 9 pthread_cond_var( simple_owner_lock ) c; 12 13 owner_lock l2;14 condition_variable( owner_lock ) c2;15 10 16 11 volatile int counter = 0; … … 64 59 } 65 60 66 thread Wait_Time_Signal_1 {};67 68 void main( Wait_Time_Signal_1 & this ) {69 for (unsigned int i = 0; i < num_times; i++) {70 lock(l);71 if(empty(c) || random(10) >= 9 ) {72 timespec t;73 clock_gettime(CLOCK_REALTIME, &t);74 timespec waitTime{0,1};75 bool woken = wait(c,l, t + waitTime);76 }else{77 notify_one(c);78 }79 unlock(l);80 }81 }82 83 61 int main() { 84 processor p[ 1];62 processor p[3]; 85 63 printf("Start Test 1: lock and condition variable single wait/notify\n"); 86 64 { … … 100 78 } 101 79 printf("Done Test 3\n"); 102 103 printf("Start Test 4: lock and condition variable single timed wait/notify\n");104 {105 Wait_Time_Signal_1 t1[2];106 }107 printf("Done Test 4\n");108 80 } -
tools/gdb/utils-gdb.py
rffec1bf r9e23b446 89 89 return argv 90 90 91 class ClusterIter: 92 def __init__(self, root): 93 self.curr = None 94 self.root = root 95 96 def __iter__(self): 97 return self 98 99 def __next__(self): 100 # Clusters form a cycle 101 # If we haven't seen the root yet, then the root is the first 102 if not self.curr: 103 self.curr = self.root 104 return self.curr 105 106 # if we already saw the root, then go forward 107 self.curr = self.curr['_X4nodeS26__cluster____dbg_node_cltr_1']['_X4nextPS7cluster_1'] 108 109 # if we reached the root again, then we are done 110 if self.curr == self.root: 111 raise StopIteration 112 113 # otherwise return the next 114 return self.curr 115 116 def all_clusters(): 117 """ 118 Return: a list of all the clusters as an iterator. 119 obtained from gdb.Value of globalClusters.root (is an address) 120 """ 121 if not is_cforall(): 122 return [] 123 91 def get_cluster_root(): 92 """ 93 Return: gdb.Value of globalClusters.root (is an address) 94 """ 124 95 cluster_root = gdb.parse_and_eval('_X11mainClusterPS7cluster_1') 125 96 if cluster_root.address == 0x0: 126 97 print('No clusters, program terminated') 127 return [] 128 129 return ClusterIter(cluster_root) 130 131 class ProcIter: 132 def __init__(self, root): 133 self.curr = None 134 self.root = root 135 136 def __iter__(self): 137 return self 138 139 def check(self): 140 # check if this is the last value 141 addr = int(self.curr) 142 mask = 1 << ((8 * int(gdb.parse_and_eval('sizeof(void*)'))) - 1) 143 if 0 != (mask & addr): 144 raise StopIteration 145 146 def __next__(self): 147 cfa_t = get_cfa_types() 148 149 # Processors form a cycle 150 # If we haven't seen the root yet, then the root is the first 151 if not self.curr: 152 my_next = self.root 153 self.curr = my_next.cast(cfa_t.processor_ptr) 154 155 #check if this is an empty list 156 self.check() 157 158 return self.curr 159 160 # if we already saw the root, then go forward 161 my_next = 
self.curr['__anonymous_object2225']['_X4nextPY13__tE_generic__1'] 162 self.curr = my_next.cast(cfa_t.processor_ptr) 163 164 #check if we reached the end 165 self.check() 166 167 # otherwise return the next 168 return self.curr 169 170 def proc_list(cluster): 171 """ 172 Return: for a given processor, return the active and idle processors, as 2 iterators 173 """ 98 return cluster_root 99 100 def get_sched_lock(): 101 """ 102 Return: gdb.Value of __scheduler_lock 103 """ 104 lock = gdb.parse_and_eval('_X16__scheduler_lockPS20__scheduler_RWLock_t_1') 105 if lock.address == 0x0: 106 print('No scheduler lock, program terminated') 107 return lock 108 109 def all_clusters(): 110 if not is_cforall(): 111 return None 112 113 cluster_root = get_cluster_root() 114 if cluster_root.address == 0x0: 115 return 116 117 curr = cluster_root 118 ret = [curr] 119 120 while True: 121 curr = curr['_X4nodeS26__cluster____dbg_node_cltr_1']['_X4nextPS7cluster_1'] 122 if curr == cluster_root: 123 break 124 125 ret.append(curr) 126 127 return ret 128 129 def all_processors(): 130 if not is_cforall(): 131 return None 132 174 133 cfa_t = get_cfa_types() 175 proclist = cluster['_X5procsS19__cluster_proc_list_1'] 176 idle = proclist['_X5idlesS5dlist_S9processorS5dlink_S9processor___1']['__anonymous_object2167']['_X4nextPY13__tE_generic__1'] 177 active = proclist['_X7activesS5dlist_S9processorS5dlink_S9processor___1']['__anonymous_object2167']['_X4nextPY13__tE_generic__1'] 178 return ProcIter(active.cast(cfa_t.processor_ptr)), ProcIter(idle.cast(cfa_t.processor_ptr)) 179 180 def all_processors(): 181 procs = [] 182 for c in all_clusters(): 183 active, idle = proc_list(c) 184 for p in active: 185 procs.append(p) 186 187 for p in idle: 188 procs.append(p) 189 190 print(procs) 191 return procs 134 135 # get processors from registration to the RWlock 136 lock = get_sched_lock() 137 138 #get number of elements 139 count = lock['_X5readyVj_1'] 140 141 #find all the procs 142 raw_procs = 
[lock['_X4dataPS21__scheduler_lock_id_t_1'][i]['_X6handleVPS16__processor_id_t_1'] for i in range(count)] 143 144 # pre cast full procs 145 procs = [p.cast(cfa_t.processor_ptr) for p in raw_procs if p['_X9full_procb_1']] 146 147 # sort procs by clusters 148 return sorted(procs, key=lambda p: p['_X4cltrPS7cluster_1']) 192 149 193 150 def tls_for_pthread(pthrd): … … 203 160 204 161 def tls_for_proc(proc): 205 return proc['_X10local_dataPS16KernelThreadData_1']162 return tls_for_pthread(proc['_X13kernel_threadm_1']) 206 163 207 164 def thread_for_pthread(pthrd): … … 223 180 def lookup_cluster(name = None): 224 181 """ 225 Look up one or more cluster given a name182 Look up a cluster given its ID 226 183 @name: str 227 184 Return: gdb.Value … … 230 187 return None 231 188 232 clusters = all_clusters()233 if not clusters:189 root = get_cluster_root() 190 if root.address == 0x0: 234 191 return None 235 192 236 193 if not name: 237 return clusters.root194 return root 238 195 239 196 # lookup for the task associated with the id 240 found = [c for c in clusters if c['_X4namePKc_1'].string() == name] 241 242 if not found: 197 cluster = None 198 curr = root 199 while True: 200 if curr['_X4namePKc_1'].string() == name: 201 cluster = curr.address 202 break 203 curr = curr['_X4nodeS26__cluster____dbg_node_cltr_1']['_X4nextPS7cluster_1'] 204 if curr == root or curr == 0x0: 205 break 206 207 if not cluster: 243 208 print("Cannot find a cluster with the name: {}.".format(name)) 244 209 return None 245 210 246 return found 247 211 return cluster 248 212 249 213 def lookup_threads_by_cluster(cluster): … … 330 294 super(Processors, self).__init__('info processors', gdb.COMMAND_USER) 331 295 332 def print_processor(self, processor , in_stats):296 def print_processor(self, processor): 333 297 should_stop = processor['_X12do_terminateVb_1'] 334 298 if not should_stop: 335 status = in_stats 299 midle = 
processor['_X6$linksS7$dlinks_S9processor__1']['_X4nextS9$mgd_link_Y13__tE_generic___1']['_X4elemPY13__tE_generic__1'] != 0x0 300 end = processor['_X6$linksS7$dlinks_S9processor__1']['_X4nextS9$mgd_link_Y13__tE_generic___1']['_X10terminatorPv_1'] != 0x0 301 302 status = 'Idle' if midle or end else 'Active' 336 303 else: 337 304 stop_count = processor['_X10terminatedS9semaphore_1']['_X5counti_1'] … … 369 336 return 370 337 338 procs = all_processors() 339 371 340 print('{:>20} {:>11} {:<7} {}'.format('Processor', '', 'Pending', 'Object')) 372 341 print('{:>20} {:>11} {:<7} {}'.format('Name', 'Status', 'Yield', 'Address')) 373 for c in clusters: 374 print('Cluster {}'.format(c['_X4namePKc_1'].string())) 375 376 active, idle = proc_list(c) 342 cl = None 343 for p in procs: 344 # if this is a different cluster print it 345 if cl != p['_X4cltrPS7cluster_1']: 346 if cl: 347 print() 348 cl = p['_X4cltrPS7cluster_1'] 349 print('Cluster {}'.format(cl['_X4namePKc_1'].string())) 350 377 351 # print the processor information 378 for p in active: 379 self.print_processor(p, 'Active') 380 381 for p in idle: 382 self.print_processor(p, 'Idle') 383 384 print() 352 self.print_processor(p) 385 353 386 354 print() … … 465 433 cluster = lookup_cluster(arg) 466 434 if not cluster: 467 print(" No matching cluster")435 print("Could not find cluster '{}'".format(arg)) 468 436 return 469 437
Note:
See TracChangeset
for help on using the changeset viewer.