Changeset def751f
- Timestamp: Jul 25, 2022, 3:17:25 PM (3 years ago)
- Branches: ADT, ast-experimental, master, pthread-emulation, qualifiedEnum
- Children: b0d9ff7
- Parents: 4e2befe3 (diff), ffec1bf (diff)
Note: this is a merge changeset; the changes displayed below correspond to the merge itself. Use the (diff) links above to see all the changes relative to each parent.
- Files: 2 added, 76 edited
Jenkins/FullBuild
r4e2befe3 → rdef751f
      <p>${result}</p>

-     <p>- Performance ---------------------------------------------------------</p>
-
-     <img src="https://cforall.uwaterloo.ca/jenkins/job/Cforall/job/master/plot/Compilation/getPlot?index=0" >
-     <img src="https://cforall.uwaterloo.ca/jenkins/job/Cforall/job/master/plot/Compilation/getPlot?index=1" >
-
      <p>- Logs ----------------------------------------------------------------</p>
      """
Jenkinsfile
r4e2befe3 → rdef751f

      if( Settings.Publish && !Settings.RunBenchmark ) { echo 'No results to publish!!!' }

-     def groupCompile = new PlotGroup('Compilation', 'duration (s) - lower is better', true)
-     def groupConcurrency = new PlotGroup('Concurrency', 'duration (n) - lower is better', false)
-
-     //Then publish the results
-     do_plot(Settings.RunBenchmark && Settings.Publish, 'compile'        , groupCompile    , false, 'Compilation')
-     do_plot(Settings.RunBenchmark && Settings.Publish, 'compile.diff'   , groupCompile    , true , 'Compilation (relative)')
-     do_plot(Settings.RunBenchmark && Settings.Publish, 'ctxswitch'      , groupConcurrency, false, 'Context Switching')
-     do_plot(Settings.RunBenchmark && Settings.Publish, 'ctxswitch.diff' , groupConcurrency, true , 'Context Switching (relative)')
-     do_plot(Settings.RunBenchmark && Settings.Publish, 'mutex'          , groupConcurrency, false, 'Mutual Exclusion')
-     do_plot(Settings.RunBenchmark && Settings.Publish, 'mutex.diff'     , groupConcurrency, true , 'Mutual Exclusion (relative)')
-     do_plot(Settings.RunBenchmark && Settings.Publish, 'scheduling'     , groupConcurrency, false, 'Internal and External Scheduling')
-     do_plot(Settings.RunBenchmark && Settings.Publish, 'scheduling.diff', groupConcurrency, true , 'Internal and External Scheduling (relative)')
  }
}
...
      this.GitNewRef = ''
      this.GitOldRef = ''
-   }
- }
-
- class PlotGroup implements Serializable {
-     public String name
-     public String unit
-     public boolean log
-
-     PlotGroup(String name, String unit, boolean log) {
-         this.name = name
-         this.unit = unit
-         this.log = log
    }
}
...
  }
}
-
- def do_plot(boolean new_data, String file, PlotGroup group, boolean relative, String title) {
-
-     if(new_data) {
-         echo "Publishing new data"
-     }
-
-     def series = new_data ? [[
-         file: "${file}.csv",
-         exclusionValues: '',
-         displayTableFlag: false,
-         inclusionFlag: 'OFF',
-         url: ''
-     ]] : [];
-
-     echo "file is ${BuildDir}/benchmark/${file}.csv, group ${group}, title ${title}"
-     dir("${BuildDir}/benchmark/") {
-         plot csvFileName: "cforall-${env.BRANCH_NAME}-${file}.csv",
-             csvSeries: series,
-             group: "${group.name}",
-             title: "${title}",
-             style: 'lineSimple',
-             exclZero: false,
-             keepRecords: false,
-             logarithmic: !relative && group.log,
-             numBuilds: '120',
-             useDescr: true,
-             yaxis: group.unit,
-             yaxisMaximum: '',
-             yaxisMinimum: ''
-     }
- }
Makefile.am
r4e2befe3 → rdef751f
      @find libcfa -name config.status -printf "\n%h\n\t" -exec {} --config \; | sed "s/ /\n\t/g; s/\t'/\t/g; s/'\n/\n/g; s/^'//g; s/'$$//g"

- mostlyclean-local: @LIBCFA_TARGET_MAKEFILES@
-     for dir in @LIBCFA_TARGET_DIRS@; do \
-         $(MAKE) -C $${dir} mostlyclean; \
-     done
+ @LIBCFA_TARGET_DIRS@::
+     $(MAKE) -C $@ $(MAKECMDGOALS)

- clean-local: @LIBCFA_TARGET_MAKEFILES@
-     for dir in @LIBCFA_TARGET_DIRS@; do \
-         $(MAKE) -C $${dir} clean; \
-     done
-
- distclean-local: @LIBCFA_TARGET_MAKEFILES@
-     for dir in @LIBCFA_TARGET_DIRS@; do \
-         $(MAKE) -C $${dir} distclean; \
-         rm $${dir}/config.data; \
-     done
+ mostlyclean clean distclean maintainer-clean: @LIBCFA_TARGET_DIRS@
benchmark/readyQ/churn.cfa
r4e2befe3 → rdef751f

      threads_left = nthreads;
-     BThrd * threads[nthreads];
+     BThrd ** threads = alloc(nthreads);
      for(i; nthreads ) {
          BThrd & t = *(threads[i] = malloc());
...

      free(spots);
+     free(threads);
  }
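The churn.cfa hunk above, like the benchmark hunks that follow, applies one recurring pattern: a variable-length array of thread handles on the stack is replaced by a heap allocation that is explicitly freed once the threads are joined. The following is only a minimal C sketch of that pattern, using plain malloc/free in place of CFA's alloc and a hypothetical Worker type standing in for BThrd/Fibre:

    #include <stdlib.h>

    struct Worker { int id; };   /* hypothetical stand-in for the benchmark's thread type */

    static void run_benchmark( unsigned nthreads ) {
        /* Before: struct Worker * threads[nthreads];  -- a VLA on the stack.       */
        /* After:  one heap block holding nthreads pointers, released at the end.   */
        struct Worker ** threads = malloc( nthreads * sizeof( threads[0] ) );
        if ( ! threads ) return;

        for ( unsigned i = 0; i < nthreads; i += 1 ) {
            threads[i] = malloc( sizeof( struct Worker ) );  /* one handle per thread */
            threads[i]->id = (int)i;
        }

        /* ... start and join the threads here ... */

        for ( unsigned i = 0; i < nthreads; i += 1 ) {
            free( threads[i] );     /* matches the existing per-thread delete/free   */
        }
        free( threads );            /* matches the free(threads) added by this diff  */
    }

    int main( void ) {
        run_benchmark( 4 );
        return 0;
    }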
benchmark/readyQ/cycle.cfa
r4e2befe3 → rdef751f
  {
      threads_left = tthreads;
-     BThrd * threads[tthreads];
-     Partner thddata[tthreads];
+     BThrd ** threads = alloc(tthreads);
+     Partner * thddata = alloc(tthreads);
      for(i; tthreads) {
+         (thddata[i]){};
          unsigned pi = (i + nthreads) % tthreads;
          thddata[i].next = &thddata[pi].self;
...
          delete(threads[i]);
      }
+     free(threads);
+     free(thddata);
  }
benchmark/readyQ/cycle.cpp
r4e2befe3 → rdef751f
  {
      threads_left = tthreads;
-     Fibre * threads[tthreads];
-     Partner thddata[tthreads];
+     Fibre ** threads = new Fibre *[tthreads]();
+     Partner* thddata = new Partner[tthreads]();
      for(unsigned i = 0; i < tthreads; i++) {
          unsigned pi = (i + nthreads) % tthreads;
...
          global_blocks += thddata[i].blocks;
      }
+
+     delete[](threads);
+     delete[](thddata);
  }
benchmark/readyQ/locality.cfa
r4e2befe3 → rdef751f
      threads_left = nprocs;
      {
-         MyThread * threads[nthreads];
+         MyThread ** threads = alloc(nthreads);
          for(i; nthreads) {
              threads[i] = malloc();
...
              free( threads[i] );
          }
+         free( threads );
      }
benchmark/readyQ/locality.cpp
r4e2befe3 → rdef751f
  {
      FibreInit(1, nprocs);
-     MyData * data_arrays[nthreads];
+     MyData ** data_arrays = new MyData *[nthreads]();
      for(size_t i = 0; i < nthreads; i++) {
          data_arrays[i] = new MyData( i, wsize );
...
      threads_left = nthreads - nspots;
-     Fibre * threads[nthreads];
-     MyCtx * thddata[nthreads];
+     Fibre ** threads = new Fibre *[nthreads]();
+     MyCtx ** thddata = new MyCtx *[nthreads]();
      {
          for(size_t i = 0; i < nthreads; i++) {
...
                  i
              );
-             threads[i] = new Fibre( reinterpret_cast<void (*)(void *)>(thread_main), thddata[i] );
+             threads[i] = new Fibre();
+             threads[i]->run( reinterpret_cast<void (*)(MyCtx*)>(thread_main), thddata[i] );
          }
...
          delete( data_arrays[i] );
      }
+     delete[](data_arrays);

      for(size_t i = 0; i < nspots; i++) {
          delete( spots[i] );
      }
+
+     delete[](threads);
+     delete[](thddata);
  }
benchmark/readyQ/yield.cfa
r4e2befe3 → rdef751f
  {
      threads_left = nthreads;
-     Yielder threads[nthreads];
+     Yielder * threads = alloc(nthreads);
+     for(i; nthreads) {
+         (threads[i]){};
+     }
+
      printf("Starting\n");
...
          Yielder & y = join( threads[i] );
          global_counter += y.count;
+         ^(threads[i]){};
      }
+     free(threads);
  }
benchmark/readyQ/yield.cpp
r4e2befe3 → rdef751f
  {
      threads_left = nthreads;
-     Fibre * threads[nthreads];
+     Fibre ** threads = new Fibre *[nthreads]();
      for(unsigned i = 0; i < nthreads; i++) {
          threads[i] = new Fibre();
...
          fibre_join( threads[i], nullptr );
      }
+     delete[] threads;
  }
doc/bibliography/pl.bib
r4e2befe3 → rdef751f
  @manual{C++20Coroutine19,
      keywords    = {coroutine},
+     key         = {Coroutines},
      contributer = {pabuhr@plg},
      title       = {Coroutines (C++20)},
      organization= {cppreference.com},
-     month       = apr,
-     year        = 2019,
+     month       = jun,
+     year        = 2022,
      note        = {\href{https://en.cppreference.com/w/cpp/language/coroutines}{https://\-en.cppreference.com/\-w/\-cpp/\-language/\-coroutines}},
  }
...
  % S

+ @inproceedings{Imam14,
+     keywords    = {actor model, performance comparison, java actor libraries, benchmark suite},
+     contributer = {pabuhr@plg},
+     author      = {Shams M. Imam and Vivek Sarkar},
+     title       = {Savina - An Actor Benchmark Suite: Enabling Empirical Evaluation of Actor Libraries},
+     year        = {2014},
+     publisher   = {ACM},
+     address     = {New York, NY, USA},
+     booktitle   = {Proceedings of the 4th International Workshop on Programming Based on Actors Agents \& Decentralized Control},
+     pages       = {67-80},
+     numpages    = {14},
+     location    = {Portland, Oregon, USA},
+     series      = {AGERE! '14}
+ }

  @manual{Scala,
      keywords = {Scala programming language},
doc/theses/mike_brooks_MMath/array.tex
r4e2befe3 → rdef751f
      \CFA's array is also the first extension of C to use its tracked bounds to generate the pointer arithmetic implied by advanced allocation patterns. Other bound-tracked extensions of C either forbid certain C patterns entirely, or address the problem of \emph{verifying} that the user's provided pointer arithmetic is self-consistent. The \CFA array, applied to accordion structures [TOD: cross-reference] \emph{implies} the necessary pointer arithmetic, generated automatically, and not appearing at all in a user's program.

-     \subsction{Safety in a padded room}
+     \subsection{Safety in a padded room}

      Java's array [todo:cite] is a straightforward example of assuring safety against undefined behaviour, at a cost of expressiveness for more applied properties. Consider the array parameter declarations in:
doc/theses/thierry_delisle_PhD/thesis/fig/cycle.fig
r4e2befe3 → rdef751f
  (xfig drawing source: the five thread circles and connecting arcs are redrawn at new coordinates; the text labels change from "Thread 1" ... "Thread 5" to "Thread$_1$" ... "Thread$_5$", and the five "Unpark" labels are repositioned accordingly.)
doc/theses/thierry_delisle_PhD/thesis/fig/idle.fig
r4e2befe3 → rdef751f
  (xfig drawing source: the idle-sleep figure is redrawn at new coordinates; the recoverable labels are "Idle List", "Lock", "Atomic Pointer", three "Idle Processor" boxes, and their "Benaphore" and "Event FD" annotations.)
doc/theses/thierry_delisle_PhD/thesis/fig/idle1.fig
r4e2befe3 → rdef751f
  (xfig drawing source: the figure is redrawn at new coordinates; the recoverable labels are "Idle List", "Lock", three "Idle Processor" boxes, and their "Event FD" annotations.)
doc/theses/thierry_delisle_PhD/thesis/fig/idle2.fig
r4e2befe3 → rdef751f
  (xfig drawing source: the figure is redrawn at new coordinates; the recoverable labels are "Idle List", "Lock", "Atomic Pointer", three "Idle Processor" boxes, and their "Event FD" annotations.)
doc/theses/thierry_delisle_PhD/thesis/fig/idle_state.fig
r4e2befe3 → rdef751f
  (xfig drawing source: the idle-state diagram is redrawn at new coordinates; the recoverable state labels are "AWAKE", "SLEEP", "SEARCH", with transition labels "WAKE", "CANCEL", "CONFIRM".)
doc/theses/thierry_delisle_PhD/thesis/fig/io_uring.fig
r4e2befe3 → rdef751f
  (xfig drawing source: the io_uring figure is redrawn at new coordinates; the recoverable labels are "Submission Ring", "Completion Ring", "Application", "Kernel", "Kernel Line", "Head", "Tail", "Push", "Pop", and the slot labels S0-S3 and C0-C2.)
doc/theses/thierry_delisle_PhD/thesis/local.bib
r4e2befe3 → rdef751f
  Throughout the file, plain url = {...} fields (and one \url{...}) are converted to howpublished = {\href{...}{...}} entries, keeping the same URLs but adding discretionary line breaks for typesetting. Affected entries include: cfa:frontpage, MAN:linux/cfs, the opensource.com CFS article, MAN:linux/cfs/balancing, the Mach scheduling manual, the Windows-scheduling reference, GITHUB:go, the Hydra and Erlang User Conference talks, MAN:tbb/scheduler, MAN:quasar, MAN:project-loom, MAN:java/fork-join, the io_uring manual, and the Wikipedia entries for control theory, task parallelism, implicit and explicit parallelism, linear congruential generators, futures and promises, read-copy-update, readers-writer locks, and bin packing.

  Other changes:
  - MAN:linux/cfs/balancing changes from @article to @misc, drops the inline URL from its journal field, and corrects its year from 2013 to 2019.
  - GITHUB:go changes from @online to @misc.
  - The readers-writer-lock URL is rewritten without the percent-encoded dash.

  New entries:
+ @misc{AIORant,
+     author       = "Linus Torvalds",
+     title        = "Re: [PATCH 09/13] aio: add support for async openat()",
+     year         = "2016",
+     month        = jan,
+     howpublished = "\href{https://lwn.net/Articles/671657}{https://\-lwn.net/\-Articles/671657}",
+     note         = "[Online; accessed 6-June-2022]"
+ }
+ @misc{apache,
+     key          = {Apache Software Foundation},
+     title        = {{T}he {A}pache Web Server},
+     howpublished = {\href{http://httpd.apache.org}{http://\-httpd.apache.org}},
+     note         = "[Online; accessed 6-June-2022]"
+ }
+ @misc{SeriallyReusable,
+     author       = {IBM},
+     title        = {Serially reusable programs},
+     month        = mar,
+     year         = 2021,
+     howpublished = {\href{https://www.ibm.com/docs/en/ztpf/1.1.0.15?topic=structures-serially-reusable-programs}{https://www.ibm.com/\-docs/\-en/\-ztpf/\-1.1.0.15?\-topic=structures\--serially\--reusable-programs}},
+ }
+ @inproceedings{Albers12,
+     author    = {Susanne Albers and Antonios Antoniadis},
+     title     = {Race to Idle: New Algorithms for Speed Scaling with a Sleep State},
+     booktitle = {Proceedings of the 2012 Annual ACM-SIAM Symposium on Discrete Algorithms (SODA)},
+     doi       = {10.1137/1.9781611973099.100},
+     URL       = {https://epubs.siam.org/doi/abs/10.1137/1.9781611973099.100},
+     year      = 2012,
+     month     = jan,
+     pages     = {1266-1285},
+ }
doc/theses/thierry_delisle_PhD/thesis/text/core.tex
r4e2befe3 → rdef751f
  Building a scheduler that is cache aware poses two main challenges: discovering the cache topology and matching \procs to this cache structure.
  Unfortunately, there is no portable way to discover cache topology, and it is outside the scope of this thesis to solve this problem.
- This work uses the cache topology information from Linux's \texttt{/sys/devices/system/cpu} directory.
+ This work uses the cache topology information from Linux's @/sys/devices/system/cpu@ directory.
  This leaves the challenge of matching \procs to cache structure, or more precisely identifying which subqueues of the ready queue are local to which subcomponents of the cache structure.
  Once a matching is generated, the helping algorithm is changed to add bias so that \procs more often help subqueues local to the same cache substructure.\footnote{ ...
  Instead of having each subqueue local to a specific \proc, the system is initialized with subqueues for each hardware hyperthread/core up front.
  Then \procs dequeue and enqueue by first asking which CPU id they are executing on, in order to identify which subqueues are the local ones.
- \Glspl{proc} can get the CPU id from \texttt{sched\_getcpu} or \texttt{librseq}.
+ \Glspl{proc} can get the CPU id from @sched_getcpu@ or @librseq@.

  This approach solves the performance problems on systems with topologies with narrow L3 caches, similar to Figure \ref{fig:cache-noshare}.
...

  \subsection{Topological Work Stealing}
+ \label{s:TopologicalWorkStealing}
  Therefore, the approach used in the \CFA scheduler is to have per-\proc subqueues, but have an explicit data-structure track which cache substructure each subqueue is tied to.
  This tracking requires some finesse because reading this data structure must lead to fewer cache misses than not having the data structure in the first place.
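The hunk above describes processors asking which CPU they are executing on (via sched_getcpu or librseq) in order to pick the matching local subqueue. The following is only a small C sketch of that lookup, with a hypothetical fixed-size array of per-CPU subqueues standing in for the CFA runtime's actual ready-queue structure:

    #define _GNU_SOURCE
    #include <sched.h>     /* sched_getcpu(), a glibc extension */
    #include <stdio.h>

    #define MAX_CPUS 256

    /* Hypothetical per-CPU ready subqueue; the real structure lives in the CFA runtime. */
    struct Subqueue { int placeholder; };
    static struct Subqueue subqueues[MAX_CPUS];

    /* Map the executing hardware thread to its local subqueue before enqueue/dequeue. */
    static struct Subqueue * local_subqueue( void ) {
        int cpu = sched_getcpu();                    /* which CPU id is this thread on? */
        if ( cpu < 0 || cpu >= MAX_CPUS ) cpu = 0;   /* fall back if the call fails */
        return &subqueues[cpu];
    }

    int main( void ) {
        struct Subqueue * q = local_subqueue();
        printf( "using subqueue %td\n", q - subqueues );
        return 0;
    }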
doc/theses/thierry_delisle_PhD/thesis/text/eval_micro.tex
r4e2befe3 rdef751f 1 1 \chapter{Micro-Benchmarks}\label{microbench} 2 2 3 The first step of evaluation is always to test-out small controlled cases, to ensure that the basics are workingproperly.4 This sectionspresents five different experimental setup, evaluating some of the basic features of \CFA's scheduler.3 The first step in evaluating this work is to test-out small controlled cases to ensure the basics work properly. 4 This chapter presents five different experimental setup, evaluating some of the basic features of \CFA's scheduler. 5 5 6 6 \section{Benchmark Environment} 7 All of these benchmarks are run on two distinct hardware environment, an AMD and an INTEL machine. 8 9 For all benchmarks, \texttt{taskset} is used to limit the experiment to 1 NUMA Node with no hyper threading. 7 All benchmarks are run on two distinct hardware platforms. 8 \begin{description} 9 \item[AMD] is a server with two AMD EPYC 7662 CPUs and 256GB of DDR4 RAM. 10 The EPYC CPU has 64 cores with 2 \glspl{hthrd} per core, for 128 \glspl{hthrd} per socket with 2 sockets for a total of 256 \glspl{hthrd}. 11 Each CPU has 4 MB, 64 MB and 512 MB of L1, L2 and L3 caches, respectively. 12 Each L1 and L2 instance are only shared by \glspl{hthrd} on a given core, but each L3 instance is shared by 4 cores, therefore 8 \glspl{hthrd}. 13 The server runs Ubuntu 20.04.2 LTS on top of Linux Kernel 5.8.0-55. 14 15 \item[Intel] is a server with four Intel Xeon Platinum 8160 CPUs and 384GB of DDR4 RAM. 16 The Xeon CPU has 24 cores with 2 \glspl{hthrd} per core, for 48 \glspl{hthrd} per socket with 4 sockets for a total of 196 \glspl{hthrd}. 17 Each CPU has 3 MB, 96 MB and 132 MB of L1, L2 and L3 caches respectively. 18 Each L1 and L2 instance are only shared by \glspl{hthrd} on a given core, but each L3 instance is shared across the entire CPU, therefore 48 \glspl{hthrd}. 19 The server runs Ubuntu 20.04.2 LTS on top of Linux Kernel 5.8.0-55. 20 \end{description} 21 22 For all benchmarks, @taskset@ is used to limit the experiment to 1 NUMA Node with no hyper threading. 10 23 If more \glspl{hthrd} are needed, then 1 NUMA Node with hyperthreading is used. 11 If still more \glspl{hthrd} are needed then the experiment is limited to as few NUMA Nodes as needed. 12 13 14 \paragraph{AMD} The AMD machine is a server with two AMD EPYC 7662 CPUs and 256GB of DDR4 RAM. 15 The server runs Ubuntu 20.04.2 LTS on top of Linux Kernel 5.8.0-55. 16 These EPYCs have 64 cores per CPUs and 2 \glspl{hthrd} per core, for a total of 256 \glspl{hthrd}. 17 The cpus each have 4 MB, 64 MB and 512 MB of L1, L2 and L3 caches respectively. 18 Each L1 and L2 instance are only shared by \glspl{hthrd} on a given core, but each L3 instance is shared by 4 cores, therefore 8 \glspl{hthrd}. 19 20 \paragraph{Intel} The Intel machine is a server with four Intel Xeon Platinum 8160 CPUs and 384GB of DDR4 RAM. 21 The server runs Ubuntu 20.04.2 LTS on top of Linux Kernel 5.8.0-55. 22 These Xeon Platinums have 24 cores per CPUs and 2 \glspl{hthrd} per core, for a total of 192 \glspl{hthrd}. 23 The cpus each have 3 MB, 96 MB and 132 MB of L1, L2 and L3 caches respectively. 24 Each L1 and L2 instance are only shared by \glspl{hthrd} on a given core, but each L3 instance is shared across the entire CPU, therefore 48 \glspl{hthrd}. 25 26 This limited sharing of the last level cache on the AMD machine is markedly different than the Intel machine. 
Indeed, while on both architectures L2 cache misses that are served by L3 caches on a different cpu incurr a significant latency, on AMD it is also the case that cache misses served by a different L3 instance on the same cpu still incur high latency. 24 If still more \glspl{hthrd} are needed, then the experiment is limited to as few NUMA Nodes as needed. 25 26 The limited sharing of the last-level cache on the AMD machine is markedly different than the Intel machine. 27 Indeed, while on both architectures L2 cache misses that are served by L3 caches on a different CPU incur a significant latency, on the AMD it is also the case that cache misses served by a different L3 instance on the same CPU still incur high latency. 27 28 28 29 … … 34 35 \label{fig:cycle} 35 36 \end{figure} 36 The most basic evaluation of any ready queue is to evaluate the latency needed to push and pop one element from the ready-queue. 37 Since these two operation also describe a \texttt{yield} operation, many systems use this as the most basic benchmark. 38 However, yielding can be treated as a special case, since it also carries the information that the number of the ready \glspl{at} will not change. 39 Not all systems use this information, but those which do may appear to have better performance than they would for disconnected push/pop pairs. 40 For this reason, I chose a different first benchmark, which I call the Cycle Benchmark. 41 This benchmark arranges many \glspl{at} into multiple rings of \glspl{at}. 42 Each ring is effectively a circular singly-linked list. 37 The most basic evaluation of any ready queue is to evaluate the latency needed to push and pop one element from the ready queue. 38 Since these two operation also describe a @yield@ operation, many systems use this operation as the most basic benchmark. 39 However, yielding can be treated as a special case by optimizing it away (dead code) since the number of ready \glspl{at} does not change. 40 Not all systems perform this optimization, but those that do have an artificial performance benefit because the yield becomes a \emph{nop}. 41 For this reason, I chose a different first benchmark, called \newterm{Cycle Benchmark}. 42 This benchmark arranges a number of \glspl{at} into a ring, as seen in Figure~\ref{fig:cycle}, where the ring is a circular singly-linked list. 43 43 At runtime, each \gls{at} unparks the next \gls{at} before parking itself. 44 This corresponds to the desired pair of ready queue operations. 45 Unparking the next \gls{at} requires pushing that \gls{at} onto the ready queue and the ensuing park will cause the runtime to pop a \gls{at} from the ready-queue. 46 Figure~\ref{fig:cycle} shows a visual representation of this arrangement. 47 48 The goal of this ring is that the underlying runtime cannot rely on the guarantee that the number of ready \glspl{at} will stay constant over the duration of the experiment. 44 Unparking the next \gls{at} pushes that \gls{at} onto the ready queue as does the ensuing park. 45 46 Hence, the underlying runtime cannot rely on the number of ready \glspl{at} staying constant over the duration of the experiment. 49 47 In fact, the total number of \glspl{at} waiting on the ready queue is expected to vary because of the race between the next \gls{at} unparking and the current \gls{at} parking. 50 The size of the cycle is also decided based on this race: cycles that are too small may see the chain of unparks go full circle before the first \gls{at} can park. 
51 While this would not be a correctness problem, every runtime system must handle that race, it could lead to pushes and pops being optimized away. 52 Since silently omitting ready-queue operations would throw off the measuring of these operations, the ring of \glspl{at} must be big enough so the \glspl{at} have the time to fully park before they are unparked. 53 Note that this problem is only present on SMP machines and is significantly mitigated by the fact that there are multiple rings in the system. 54 55 To avoid this benchmark from being dominated by the idle sleep handling, the number of rings is kept at least as high as the number of \glspl{proc} available. 56 Beyond this point, adding more rings serves to mitigate even more the idle sleep handling. 57 This is to avoid the case where one of the \glspl{proc} runs out of work because of the variation on the number of ready \glspl{at} mentionned above. 58 59 The actual benchmark is more complicated to handle termination, but that simply requires using a binary semphore or a channel instead of raw \texttt{park}/\texttt{unpark} and carefully picking the order of the \texttt{P} and \texttt{V} with respect to the loop condition. 60 Figure~\ref{fig:cycle:code} shows pseudo code for this benchmark. 61 62 \begin{figure} 63 \begin{lstlisting} 64 Thread.main() { 65 count := 0 66 for { 67 wait() 68 this.next.wake() 69 count ++ 70 if must_stop() { break } 71 } 72 global.count += count 73 } 74 \end{lstlisting} 75 \caption[Cycle Benchmark : Pseudo Code]{Cycle Benchmark : Pseudo Code} 76 \label{fig:cycle:code} 77 \end{figure} 78 79 48 That is, the runtime cannot anticipate that the current task will immediately park. 49 As well, the size of the cycle is also decided based on this race, \eg a small cycle may see the chain of unparks go full circle before the first \gls{at} parks because of time-slicing or multiple \procs. 50 Every runtime system must handle this race and cannot optimize away the ready-queue pushes and pops. 51 To prevent any attempt at silently omitting ready-queue operations, the ring of \glspl{at} is made big enough so the \glspl{at} have time to fully park before being unparked again. 52 (Note, an unpark is like a V on a semaphore, so the subsequent park (P) may not block.) 53 Finally, to further mitigate any underlying push/pop optimizations, especially on SMP machines, multiple rings are created in the experiment. 54 55 To avoid this benchmark being affected by idle-sleep handling, the number of rings is multiple times greater than the number of \glspl{proc}. 56 This design avoids the case where one of the \glspl{proc} runs out of work because of the variation in the number of ready \glspl{at} mentioned above. 57 58 Figure~\ref{fig:cycle:code} shows the pseudo code for this benchmark. 59 There is additional complexity to handle termination (not shown), which requires a binary semaphore or a channel instead of raw @park@/@unpark@ and carefully picking the order of the @P@ and @V@ with respect to the loop condition. 60 61 \begin{figure} 62 \begin{cfa} 63 Thread.main() { 64 count := 0 65 for { 66 @wait()@ 67 @this.next.wake()@ 68 count ++ 69 if must_stop() { break } 70 } 71 global.count += count 72 } 73 \end{cfa} 74 \caption[Cycle Benchmark : Pseudo Code]{Cycle Benchmark : Pseudo Code} 75 \label{fig:cycle:code} 76 \end{figure} 80 77 81 78 \subsection{Results} 79 Figure~\ref{fig:cycle:jax} shows the throughput as a function of \proc count, where each run uses 100 cycles per \proc and 5 \ats per cycle. 
80 82 81 \begin{figure} 83 82 \subfloat[][Throughput, 100 \ats per \proc]{ … … 106 105 \label{fig:cycle:jax:low:ns} 107 106 } 108 \caption[Cycle Benchmark on Intel]{Cycle Benchmark on Intel\smallskip\newline Throughput as a function of \proc count , using 100 cycles per \proc,5 \ats per cycle.}107 \caption[Cycle Benchmark on Intel]{Cycle Benchmark on Intel\smallskip\newline Throughput as a function of \proc count with 100 cycles per \proc and 5 \ats per cycle.} 109 108 \label{fig:cycle:jax} 110 109 \end{figure} 111 Figure~\ref{fig:cycle:jax} shows the throughput as a function of \proc count, with the following constants:112 Each run uses 100 cycles per \proc, 5 \ats per cycle.113 110 114 111 \todo{results discussion} 115 112 116 113 \section{Yield} 117 For completion, I also include the yield benchmark.118 This benchmark is much simpler than the cycle tests, it simply creates many \glspl{at} that call \texttt{yield}.119 As mention ned in the previous section, this benchmark may be less representative of usages that only make limited use of \texttt{yield}, due to potential shortcuts in the routine.120 Its only interesting variable is the number of \glspl{at} per \glspl{proc}, where ratios close to 1 means the ready queue(s) couldbe empty.121 This s ometimes puts more strain on the idle sleep handling, compared to scenarios where there is clearly plenty of work to be done.122 Figure~\ref{fig:yield:code} shows pseudo code for this benchmark, the ``wait/wake-next'' is simply replaced by a yield.123 124 \begin{figure} 125 \begin{lstlisting}126 127 128 129 yield()130 131 132 133 134 135 \end{lstlisting}136 137 114 For completion, the classic yield benchmark is included. 115 This benchmark is simpler than the cycle test: it creates many \glspl{at} that call @yield@. 116 As mentioned, this benchmark may not be representative because of optimization shortcuts in @yield@. 117 The only interesting variable in this benchmark is the number of \glspl{at} per \glspl{proc}, where ratios close to 1 means the ready queue(s) can be empty. 118 This scenario can put a strain on the idle-sleep handling compared to scenarios where there is plenty of work. 119 Figure~\ref{fig:yield:code} shows pseudo code for this benchmark, where the @wait/next.wake@ is replaced by @yield@. 120 121 \begin{figure} 122 \begin{cfa} 123 Thread.main() { 124 count := 0 125 for { 126 @yield()@ 127 count ++ 128 if must_stop() { break } 129 } 130 global.count += count 131 } 132 \end{cfa} 133 \caption[Yield Benchmark : Pseudo Code]{Yield Benchmark : Pseudo Code} 134 \label{fig:yield:code} 138 135 \end{figure} 139 136 140 137 \subsection{Results} 138 139 Figure~\ref{fig:yield:jax} shows the throughput as a function of \proc count, where each run uses 100 \ats per \proc. 
140 141 141 \begin{figure} 142 142 \subfloat[][Throughput, 100 \ats per \proc]{ … 168 168 \label{fig:yield:jax} 169 169 \end{figure} 170 Figure~\ref{fig:yield:ops:jax} shows the throughput as a function of \proc count, with the following constants: 171 Each run uses 100 \ats per \proc. 172 170 173 171 \todo{results discussion} 174 172 175 176 173 \section{Churn} 177 The Cycle and Yield benchmark represent s an ``easy'' scenario for a scheduler, \eg,an embarrassingly parallel application. 178 In these benchmarks, \glspl{at} can be easily partitioned over the different \glspl{proc} up -front and none of the \glspl{at} communicate with each other. 179 180 The Churn benchmark represents more chaotic usages, where there is no relation between the last \gls{proc} on which a \gls{at} ran and the \gls{proc} that unblockedit. 181 W hen a \gls{at} is unblocked from a different \gls{proc} than the one on which it last ran, the unblocking \gls{proc} must either ``steal'' the \gls{at} or place it on a remotequeue. 182 This results can result in either contention on the remote queueor \glspl{rmr} on \gls{at} data structure. 183 In either case, this benchmark aims to highlight how each scheduler handles these cases, since both cases can lead to performance degradation if they arenot handled correctly. 184 185 T o achieve this the benchmark uses a fixed size array ofsemaphores. 186 Each \gls{at} picks a random semaphore, \texttt{V}s it to unblock a \at waiting and then \texttt{P}s on the semaphore. 174 The Cycle and Yield benchmarks represent an \emph{easy} scenario for a scheduler, \eg an embarrassingly parallel application. 175 In these benchmarks, \glspl{at} can be easily partitioned over the different \glspl{proc} upfront and none of the \glspl{at} communicate with each other. 176 177 The Churn benchmark represents more chaotic execution, where there is no relation between the last \gls{proc} on which a \gls{at} ran and blocked and the \gls{proc} that subsequently unblocks it. 178 With processor-specific ready-queues, when a \gls{at} is unblocked by a different \gls{proc}, the unblocking \gls{proc} must either ``steal'' the \gls{at} from another processor or find it on a global queue. 179 This dequeuing results in either contention on the remote queue and/or \glspl{rmr} on the \gls{at} data structure. 180 In either case, this benchmark aims to highlight how each scheduler handles these cases, since both cases can lead to performance degradation if not handled correctly. 181 182 This benchmark uses a fixed-size array of counting semaphores. 183 Each \gls{at} picks a random semaphore, @V@s it to unblock any \at waiting, and then @P@s on the semaphore. 187 184 This creates a flow where \glspl{at} push each other out of the semaphores before being pushed out themselves. 188 For this benchmark to work however, the number of \glspl{at} must be equal or greater to the number of semaphores plus the number of \glspl{proc}. 189 Note that the nature of these semaphores mean the counter can go beyond 1, which could lead to calls to \texttt{P} not blocking. 185 For this benchmark to work, the number of \glspl{at} must be equal to or greater than the number of semaphores plus the number of \glspl{proc}. 186 Note, the nature of these semaphores means the counter can go beyond 1, which can lead to nonblocking calls to @P@. 187 Figure~\ref{fig:churn:code} shows pseudo code for this benchmark, where the @yield@ is replaced by @V@ and @P@. 
188 189 \begin{figure} 190 \begin{cfa} 191 Thread.main() { 192 count := 0 193 for { 194 r := random() % len(spots) 195 @spots[r].V()@ 196 @spots[r].P()@ 197 count ++ 198 if must_stop() { break } 199 } 200 global.count += count 201 } 202 \end{cfa} 203 \caption[Churn Benchmark : Pseudo Code]{Churn Benchmark : Pseudo Code} 204 \label{fig:churn:code} 205 \end{figure} 206 207 \subsection{Results} 208 Figure~\ref{fig:churn:jax} shows the throughput as a function of \proc count, where each run uses 100 cycles per \proc and 5 \ats per cycle. 209 210 \begin{figure} 211 \subfloat[][Throughput, 100 \ats per \proc]{ 212 \resizebox{0.5\linewidth}{!}{ 213 \input{result.churn.jax.ops.pstex_t} 214 } 215 \label{fig:churn:jax:ops} 216 } 217 \subfloat[][Throughput, 1 \ats per \proc]{ 218 \resizebox{0.5\linewidth}{!}{ 219 \input{result.churn.low.jax.ops.pstex_t} 220 } 221 \label{fig:churn:jax:low:ops} 222 } 223 224 \subfloat[][Latency, 100 \ats per \proc]{ 225 \resizebox{0.5\linewidth}{!}{ 226 \input{result.churn.jax.ns.pstex_t} 227 } 228 229 } 230 \subfloat[][Latency, 1 \ats per \proc]{ 231 \resizebox{0.5\linewidth}{!}{ 232 \input{result.churn.low.jax.ns.pstex_t} 233 } 234 \label{fig:churn:jax:low:ns} 235 } 236 \caption[Churn Benchmark on Intel]{\centering Churn Benchmark on Intel\smallskip\newline Throughput and latency of the Churn benchmark on the Intel machine. 237 Throughput is the total operations per second across all cores. Latency is the duration of each operation.} 238 \label{fig:churn:jax} 239 \end{figure} 240 241 \todo{results discussion} 242 243 \section{Locality} 190 244 191 245 \todo{code, setup, results} 192 \begin{lstlisting} 193 Thread.main() { 194 count := 0 195 for { 196 r := random() % len(spots) 197 spots[r].V() 198 spots[r].P() 199 count ++ 200 if must_stop() { break } 201 } 202 global.count += count 203 } 204 \end{lstlisting} 205 206 \begin{figure} 207 \subfloat[][Throughput, 100 \ats per \proc]{ 208 \resizebox{0.5\linewidth}{!}{ 209 \input{result.churn.jax.ops.pstex_t} 210 } 211 \label{fig:churn:jax:ops} 212 } 213 \subfloat[][Throughput, 1 \ats per \proc]{ 214 \resizebox{0.5\linewidth}{!}{ 215 \input{result.churn.low.jax.ops.pstex_t} 216 } 217 \label{fig:churn:jax:low:ops} 218 } 219 220 \subfloat[][Latency, 100 \ats per \proc]{ 221 \resizebox{0.5\linewidth}{!}{ 222 \input{result.churn.jax.ns.pstex_t} 223 } 224 225 } 226 \subfloat[][Latency, 1 \ats per \proc]{ 227 \resizebox{0.5\linewidth}{!}{ 228 \input{result.churn.low.jax.ns.pstex_t} 229 } 230 \label{fig:churn:jax:low:ns} 231 } 232 \caption[Churn Benchmark on Intel]{\centering Churn Benchmark on Intel\smallskip\newline Throughput and latency of the Churn on the benchmark on the Intel machine. Throughput is the total operation per second across all cores. Latency is the duration of each opeartion.} 233 \label{fig:churn:jax} 234 \end{figure} 235 236 \section{Locality} 237 238 \todo{code, setup, results} 239 246 240 247 \section{Transfer} 241 The last benchmark is more exactly characterize as an experiment than a benchmark. 242 It tests the behavior of the schedulers for a particularly misbehaved workload. 248 The last benchmark is more of an experiment than a benchmark. 249 It tests the behaviour of the schedulers for a misbehaved workload. 243 250 In this workload, one of the \gls{at} is selected at random to be the leader. 244 251 The leader then spins in a tight loop until it has observed that all other \glspl{at} have acknowledged its leadership. 245 252 The leader \gls{at} then picks a new \gls{at} to be the ``spinner'' and the cycle repeats. 
246 247 The benchmark comes in two flavours for the behavior of the non-leader \glspl{at}: 248 once they acknowledged the leader, they either block on a semaphore or yield repeatadly. 249 250 This experiment is designed to evaluate the short term load balancing of the scheduler. 251 Indeed, schedulers where the runnable \glspl{at} are partitioned on the \glspl{proc} may need to balance the \glspl{at} for this experient to terminate. 252 This is because the spinning \gls{at} is effectively preventing the \gls{proc} from runnning any other \glspl{thrd}. 253 In the semaphore flavour, the number of runnable \glspl{at} will eventually dwindle down to only the leader. 254 This is a simpler case to handle for schedulers since \glspl{proc} eventually run out of work. 253 The benchmark comes in two flavours for the non-leader \glspl{at}: 254 once they acknowledged the leader, they either block on a semaphore or spin yielding. 255 256 The experiment is designed to evaluate the short-term load-balancing of a scheduler. 257 Indeed, schedulers where the runnable \glspl{at} are partitioned on the \glspl{proc} may need to balance the \glspl{at} for this experiment to terminate. 258 This problem occurs because the spinning \gls{at} is effectively preventing the \gls{proc} from running any other \glspl{thrd}. 259 In the semaphore flavour, the number of runnable \glspl{at} eventually dwindles down to only the leader. 260 This scenario is a simpler case to handle for schedulers since \glspl{proc} eventually run out of work. 255 261 In the yielding flavour, the number of runnable \glspl{at} stays constant. 256 This is a harder case to handle because corrective measures must be taken even if work is stillavailable.257 Note that languages that have mandatory preemption docircumvent this problem by forcing the spinner to yield.262 This scenario is a harder case to handle because corrective measures must be taken even when work is available. 263 Note, runtime systems with preemption circumvent this problem by forcing the spinner to yield. 258 264 259 265 \todo{code, setup, results} 260 \begin{lstlisting} 261 Thread.lead() { 262 this.idx_seen = ++lead_idx 263 if lead_idx > stop_idx { 264 done := true 265 return 266 } 267 268 // Wait for everyone to acknowledge my leadership 269 start: = timeNow() 266 267 \begin{figure} 268 \begin{cfa} 269 Thread.lead() { 270 this.idx_seen = ++lead_idx 271 if lead_idx > stop_idx { 272 done := true 273 return 274 } 275 276 // Wait for everyone to acknowledge my leadership 277 start: = timeNow() 278 for t in threads { 279 while t.idx_seen != lead_idx { 280 asm pause 281 if (timeNow() - start) > 5 seconds { error() } 282 } 283 } 284 285 // pick next leader 286 leader := threads[ prng() % len(threads) ] 287 288 // wake every one 289 if ! 
exhaust { 270 290 for t in threads { 271 while t.idx_seen != lead_idx { 272 asm pause 273 if (timeNow() - start) > 5 seconds { error() } 274 } 275 } 276 277 // pick next leader 278 leader := threads[ prng() % len(threads) ] 279 280 // wake every one 281 if !exhaust { 282 for t in threads { 283 if t != me { t.wake() } 284 } 285 } 286 } 287 288 Thread.wait() { 289 this.idx_seen := lead_idx 290 if exhaust { wait() } 291 else { yield() } 292 } 293 294 Thread.main() { 295 while !done { 296 if leader == me { this.lead() } 297 else { this.wait() } 298 } 299 } 300 \end{lstlisting} 291 if t != me { t.wake() } 292 } 293 } 294 } 295 296 Thread.wait() { 297 this.idx_seen := lead_idx 298 if exhaust { wait() } 299 else { yield() } 300 } 301 302 Thread.main() { 303 while !done { 304 if leader == me { this.lead() } 305 else { this.wait() } 306 } 307 } 308 \end{cfa} 309 \caption[Transfer Benchmark : Pseudo Code]{Transfer Benchmark : Pseudo Code} 310 \label{fig:transfer:code} 311 \end{figure} 312 313 \subsection{Results} 314 Figure~\ref{fig:transfer:jax} shows the throughput as a function of \proc count, where each run uses 100 cycles per \proc and 5 \ats per cycle. 315 316 \todo{results discussion} -
doc/theses/thierry_delisle_PhD/thesis/text/existing.tex
r4e2befe3 rdef751f 14 14 15 15 \section{Naming Convention} 16 Scheduling has been studied by various communities concentrating on different incarnation of the same problems. As a result, there are no standard naming conventions for scheduling that is respected across these communities. This document uses the term \newterm{\Gls{at}} to refer to the abstract objects being scheduled and the term \newterm{\Gls{proc}} to refer to the concrete objects executing these \ats. 16 Scheduling has been studied by various communities concentrating on different incarnation of the same problems. 17 As a result, there are no standard naming conventions for scheduling that is respected across these communities. 18 This document uses the term \newterm{\Gls{at}} to refer to the abstract objects being scheduled and the term \newterm{\Gls{proc}} to refer to the concrete objects executing these \ats. 17 19 18 20 \section{Static Scheduling} … … 26 28 \section{Dynamic Scheduling} 27 29 \newterm{Dynamic schedulers} determine \ats dependencies and costs during scheduling, if at all. 28 Hence, unlike static scheduling, \ats dependencies are conditional and detected at runtime. This detection takes the form of observing new \ats(s) in the system and determining dependencies from their behaviour, including suspending or halting a \ats that dynamically detects unfulfilled dependencies. 30 Hence, unlike static scheduling, \ats dependencies are conditional and detected at runtime. 31 This detection takes the form of observing new \ats(s) in the system and determining dependencies from their behaviour, including suspending or halting a \ats that dynamically detects unfulfilled dependencies. 29 32 Furthermore, each \ats has the responsibility of adding dependent \ats back into the system once dependencies are fulfilled. 30 33 As a consequence, the scheduler often has an incomplete view of the system, seeing only \ats with no pending dependencies. … … 178 181 \begin{displayquote} 179 182 \begin{enumerate} 180 \item The task returned by \textit{t} \texttt{.execute()}183 \item The task returned by \textit{t}@.execute()@ 181 184 \item The successor of t if \textit{t} was its last completed predecessor. 182 185 \item A task popped from the end of the thread's own deque. … … 193 196 \paragraph{Quasar/Project Loom} 194 197 Java has two projects, Quasar~\cite{MAN:quasar} and Project Loom~\cite{MAN:project-loom}\footnote{It is unclear if these are distinct projects.}, that are attempting to introduce lightweight thread\-ing in the form of Fibers. 195 Both projects seem to be based on the \texttt{ForkJoinPool}in Java, which appears to be a simple incarnation of randomized work-stealing~\cite{MAN:java/fork-join}.198 Both projects seem to be based on the @ForkJoinPool@ in Java, which appears to be a simple incarnation of randomized work-stealing~\cite{MAN:java/fork-join}. 196 199 197 200 \paragraph{Grand Central Dispatch} … … 204 207 % http://web.archive.org/web/20090920043909/http://images.apple.com/macosx/technology/docs/GrandCentral_TB_brief_20090903.pdf 205 208 206 In terms of semantics, the Dispatch Queues seem to be very similar to Intel\textregistered ~TBB \texttt{execute()}and predecessor semantics.209 In terms of semantics, the Dispatch Queues seem to be very similar to Intel\textregistered ~TBB @execute()@ and predecessor semantics. 207 210 208 211 \paragraph{LibFibre} -
doc/theses/thierry_delisle_PhD/thesis/text/intro.tex
r4e2befe3 rdef751f 103 103 An algorithm for load-balancing and idle sleep of processors, including NUMA awareness. 104 104 \item 105 Support for user-level \glsxtrshort{io} capabilities based on Linux's \texttt{io\_uring}.105 Support for user-level \glsxtrshort{io} capabilities based on Linux's @io_uring@. 106 106 \end{enumerate} -
doc/theses/thierry_delisle_PhD/thesis/text/io.tex
r4e2befe3 rdef751f 1 1 \chapter{User Level \io} 2 As mentioned in Section~\ref{prev:io}, User-Level \io requires multiplexing the \io operations of many \glspl{thrd} onto fewer \glspl{proc} using asynchronous \io operations.2 As mentioned in Section~\ref{prev:io}, user-level \io requires multiplexing the \io operations of many \glspl{thrd} onto fewer \glspl{proc} using asynchronous \io operations. 3 3 Different operating systems offer various forms of asynchronous operations and, as mentioned in Chapter~\ref{intro}, this work is exclusively focused on the Linux operating-system. 4 4 5 5 \section{Kernel Interface} 6 Since this work fundamentally depends on operating-system support, the first step of any design is to discuss the available interfaces and pick one (or more) as the foundations of the non-blocking \io subsystem.6 Since this work fundamentally depends on operating-system support, the first step of this design is to discuss the available interfaces and pick one (or more) as the foundation for the non-blocking \io subsystem in this work. 7 7 8 8 \subsection{\lstinline{O_NONBLOCK}} … … 10 10 In this mode, ``Neither the @open()@ nor any subsequent \io operations on the [opened file descriptor] will cause the calling process to wait''~\cite{MAN:open}. 11 11 This feature can be used as the foundation for the non-blocking \io subsystem. 12 However, for the subsystem to know when an \io operation completes, @O_NONBLOCK@ must be use in conjunction with a system call that monitors when a file descriptor becomes ready, \ie, the next \io operation on it does not cause the process to wait13 \footnote{In this context, ready means \emph{some} operation can be performed without blocking.12 However, for the subsystem to know when an \io operation completes, @O_NONBLOCK@ must be used in conjunction with a system call that monitors when a file descriptor becomes ready, \ie, the next \io operation on it does not cause the process to wait.\footnote{ 13 In this context, ready means \emph{some} operation can be performed without blocking. 14 14 It does not mean an operation returning \lstinline{EAGAIN} succeeds on the next try. 15 For example, a ready read may only return a subset of bytes and the read must be issues again for the remaining bytes, at which point it may return \lstinline{EAGAIN}.}.15 For example, a ready read may only return a subset of requested bytes and the read must be issues again for the remaining bytes, at which point it may return \lstinline{EAGAIN}.} 16 16 This mechanism is also crucial in determining when all \glspl{thrd} are blocked and the application \glspl{kthrd} can now block. 17 17 18 There are three options to monitor file descriptors in Linux 19 \footnote{For simplicity, this section omits \lstinline{pselect} and \lstinline{ppoll}.18 There are three options to monitor file descriptors in Linux:\footnote{ 19 For simplicity, this section omits \lstinline{pselect} and \lstinline{ppoll}. 20 20 The difference between these system calls and \lstinline{select} and \lstinline{poll}, respectively, is not relevant for this discussion.}, 21 21 @select@~\cite{MAN:select}, @poll@~\cite{MAN:poll} and @epoll@~\cite{MAN:epoll}. 22 22 All three of these options offer a system call that blocks a \gls{kthrd} until at least one of many file descriptors becomes ready. 23 The group of file descriptors being waited is called the \newterm{interest set}. 
24 25 \paragraph{\lstinline{select}} is the oldest of these options, it takes as an input a contiguous array of bits, where each bits represent a file descriptor of interest. 26 On return, it modifies the set in place to identify which of the file descriptors changed status. 27 This destructive change means that calling select in a loop requires re-initializing the array each time and the number of file descriptors supported has a hard limit. 28 Another limit of @select@ is that once the call is started, the interest set can no longer be modified. 29 Monitoring a new file descriptor generally requires aborting any in progress call to @select@ 30 \footnote{Starting a new call to \lstinline{select} is possible but requires a distinct kernel thread, and as a result is not an acceptable multiplexing solution when the interest set is large and highly dynamic unless the number of parallel calls to \lstinline{select} can be strictly bounded.}. 31 32 \paragraph{\lstinline{poll}} is an improvement over select, which removes the hard limit on the number of file descriptors and the need to re-initialize the input on every call. 33 It works using an array of structures as an input rather than an array of bits, thus allowing a more compact input for small interest sets. 34 Like @select@, @poll@ suffers from the limitation that the interest set cannot be changed while the call is blocked. 35 36 \paragraph{\lstinline{epoll}} further improves these two functions by allowing the interest set to be dynamically added to and removed from while a \gls{kthrd} is blocked on an @epoll@ call. 23 The group of file descriptors being waited on is called the \newterm{interest set}. 24 25 \paragraph{\lstinline{select}} is the oldest of these options, and takes as input a contiguous array of bits, where each bit represents a file descriptor of interest. 26 Hence, the array length must be as long as the largest FD currently of interest. 27 On return, it outputs the set in place to identify which of the file descriptors changed state. 28 This destructive change means selecting in a loop requires re-initializing the array for each iteration. 29 Another limit of @select@ is that calls from different \glspl{kthrd} sharing FDs are independent. 30 Hence, if one \gls{kthrd} is managing the select calls, other threads can only add/remove to/from the manager's interest set through synchronized calls to update the interest set. 31 However, these changes are only reflected when the manager makes its next call to @select@. 32 Note, it is possible for the manager thread to never unblock if its current interest set never changes, \eg the sockets/pipes/ttys it is waiting on never get data again. 33 Often the I/O manager has a timeout, polls, or is sent a signal on changes to mitigate this problem. 
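The manager-loop structure this interface imposes can be illustrated with a short sketch; the sketch is illustrative only (not code from the \CFA runtime) and the names @fds@, @nfds@ and @done@ are assumptions of the example.
\begin{cfa}
#include <sys/select.h>
fd_set rfds;
while( ! done ) {
	FD_ZERO( &rfds );	// select is destructive: the interest set must be rebuilt every iteration
	int maxfd = -1;
	for( int i = 0; i < nfds; i += 1 ) {
		FD_SET( fds[i], &rfds );	// add each file descriptor of interest
		if( fds[i] > maxfd ) maxfd = fds[i];	// the array of bits must cover the largest FD
	}
	struct timeval tv = { .tv_sec = 1, .tv_usec = 0 };	// timeout so interest-set changes are eventually noticed
	int ready = select( maxfd + 1, &rfds, NULL, NULL, &tv );
	for( int i = 0; ready > 0 && i < nfds; i += 1 ) {
		if( FD_ISSET( fds[i], &rfds ) ) { /* unpark the thread waiting on fds[i] */ }
	}
}
\end{cfa}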
34 35 \begin{comment} 36 From: Tim Brecht <brecht@uwaterloo.ca> 37 Subject: Re: FD sets 38 Date: Wed, 6 Jul 2022 00:29:41 +0000 39 40 Large number of open files 41 -------------------------- 42 43 In order to be able to use more than the default number of open file 44 descriptors you may need to: 45 46 o increase the limit on the total number of open files /proc/sys/fs/file-max 47 (on Linux systems) 48 49 o increase the size of FD_SETSIZE 50 - the way I often do this is to figure out which include file __FD_SETSIZE 51 is defined in, copy that file into an appropriate directory in ./include, 52 and then modify it so that if you use -DBIGGER_FD_SETSIZE the larger size 53 gets used 54 55 For example on a RH 9.0 distribution I've copied 56 /usr/include/bits/typesizes.h into ./include/i386-linux/bits/typesizes.h 57 58 Then I modify typesizes.h to look something like: 59 60 #ifdef BIGGER_FD_SETSIZE 61 #define __FD_SETSIZE 32767 62 #else 63 #define __FD_SETSIZE 1024 64 #endif 65 66 Note that the since I'm moving and testing the userver on may different 67 machines the Makefiles are set up to use -I ./include/$(HOSTTYPE) 68 69 This way if you redefine the FD_SETSIZE it will get used instead of the 70 default original file. 71 \end{comment} 72 73 \paragraph{\lstinline{poll}} is the next oldest option, and takes as input an array of structures containing the FD numbers rather than their position in an array of bits, allowing a more compact input for interest sets that contain widely spaced FDs. 74 For small interest sets with densely packed FDs, the @select@ bit mask can take less storage, and hence, copy less information into the kernel. 75 Furthermore, @poll@ is non-destructive, so the array of structures does not have to be re-initialize on every call. 76 Like @select@, @poll@ suffers from the limitation that the interest set cannot be changed by other \gls{kthrd}, while a manager thread is blocked in @poll@. 77 78 \paragraph{\lstinline{epoll}} follows after @poll@, and places the interest set in the kernel rather than the application, where it is managed by an internal \gls{kthrd}. 79 There are two separate functions: one to add to the interest set and another to check for FDs with state changes. 37 80 This dynamic capability is accomplished by creating an \emph{epoll instance} with a persistent interest set, which is used across multiple calls. 38 This capability significantly reduces synchronization overhead on the part of the caller (in this case the \io subsystem), since the interest set can be modified when adding or removing file descriptors without having to synchronize with other \glspl{kthrd} potentially calling @epoll@. 39 40 However, all three of these system calls have limitations. 81 As the interest set is augmented, the changes become implicitly part of the interest set for a blocked manager \gls{kthrd}. 82 This capability significantly reduces synchronization between \glspl{kthrd} and the manager calling @epoll@. 83 84 However, all three of these I/O systems have limitations. 41 85 The @man@ page for @O_NONBLOCK@ mentions that ``[@O_NONBLOCK@] has no effect for regular files and block devices'', which means none of these three system calls are viable multiplexing strategies for these types of \io operations. 42 86 Furthermore, @epoll@ has been shown to have problems with pipes and ttys~\cit{Peter's examples in some fashion}. … … 53 97 It also supports batching multiple operations in a single system call. 
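For illustration only (this is not code from this work), the following sketch starts a single asynchronous read with AIO, assuming an open file descriptor @fd@ and a buffer @buf@; the different ways of detecting its completion are discussed next.
\begin{cfa}
#include <aio.h>
struct aiocb cb = { 0 };
cb.aio_fildes = fd;	// file descriptor to read from
cb.aio_buf = buf;	// destination buffer
cb.aio_nbytes = sizeof( buf );	// number of bytes to read
cb.aio_offset = 0;	// file offset
aio_read( &cb );	// submit the request and return immediately
// ... once the request is known to have completed ...
ssize_t ret = aio_return( &cb );	// fetch the result of the read
\end{cfa}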
54 98 55 AIO offers two different approach to polling: @aio_error@ can be used as a spinning form of polling, returning @EINPROGRESS@ until the operation is completed, and @aio_suspend@ can be used similarly to @select@, @poll@ or @epoll@, to wait until one or more requests have completed.99 AIO offers two different approaches to polling: @aio_error@ can be used as a spinning form of polling, returning @EINPROGRESS@ until the operation is completed, and @aio_suspend@ can be used similarly to @select@, @poll@ or @epoll@, to wait until one or more requests have completed. 56 100 For the purpose of \io multiplexing, @aio_suspend@ is the best interface. 57 101 However, even if AIO requests can be submitted concurrently, @aio_suspend@ suffers from the same limitation as @select@ and @poll@, \ie, the interest set cannot be dynamically changed while a call to @aio_suspend@ is in progress. … … 70 114 71 115 \begin{flushright} 72 -- Linus Torvalds \cit{https://lwn.net/Articles/671657/}116 -- Linus Torvalds~\cite{AIORant} 73 117 \end{flushright} 74 118 \end{displayquote} … … 85 129 A very recent addition to Linux, @io_uring@~\cite{MAN:io_uring}, is a framework that aims to solve many of the problems listed in the above interfaces. 86 130 Like AIO, it represents \io operations as entries added to a queue. 87 But like @epoll@, new requests can be submitted while a blocking call waiting for requests to completeis already in progress.131 But like @epoll@, new requests can be submitted, while a blocking call waiting for requests to complete, is already in progress. 88 132 The @io_uring@ interface uses two ring buffers (referred to simply as rings) at its core: a submit ring to which programmers push \io requests and a completion ring from which programmers poll for completion. 89 133 … … 97 141 In the worst case, where all \glspl{thrd} are consistently blocking on \io, it devolves into 1-to-1 threading. 98 142 However, regardless of the frequency of \io operations, it achieves the fundamental goal of not blocking \glspl{proc} when \glspl{thrd} are ready to run. 99 This approach is used by languages like Go\cit{Go} and frameworks like libuv\cit{libuv}, since it has the advantage that it can easily be used across multiple operating systems.143 This approach is used by languages like Go\cit{Go}, frameworks like libuv\cit{libuv}, and web servers like Apache~\cite{apache} and Nginx~\cite{nginx}, since it has the advantage that it can easily be used across multiple operating systems. 100 144 This advantage is especially relevant for languages like Go, which offer a homogeneous \glsxtrshort{api} across all platforms. 101 145 As opposed to C, which has a very limited standard api for \io, \eg, the C standard library has no networking. … … 111 155 \section{Event-Engine} 112 156 An event engine's responsibility is to use the kernel interface to multiplex many \io operations onto few \glspl{kthrd}. 113 In concrete terms, this means \glspl{thrd} enter the engine through an interface, the event engine s then starts theoperation and parks the calling \glspl{thrd}, returning control to the \gls{proc}.157 In concrete terms, this means \glspl{thrd} enter the engine through an interface, the event engine then starts an operation and parks the calling \glspl{thrd}, returning control to the \gls{proc}. 114 158 The parked \glspl{thrd} are then rescheduled by the event engine once the desired operation has completed. 
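For illustration only, a blocking read wrapper built on such an engine could have the following shape, where @cfa_read@, @io_future_t@, @submit@ and @wait@ are hypothetical names used for this sketch rather than the actual interface.
\begin{cfa}
ssize_t cfa_read( int fd, void * buf, size_t len ) {
	io_future_t f;	// future eventually filled in by the completion poller
	submit( f, IO_READ, fd, buf, len );	// hand the request to the event engine
	wait( f );	// park this thread; the engine unparks it when the operation completes
	return f.result;	// result copied from the completion entry
}
\end{cfa}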
115 159 … … 134 178 \begin{enumerate} 135 179 \item 136 An SQE is allocated from the pre-allocated array (denoted \emph{S} in Figure~\ref{fig:iouring}).180 An SQE is allocated from the pre-allocated array \emph{S}. 137 181 This array is created at the same time as the @io_uring@ instance, is in kernel-locked memory visible by both the kernel and the application, and has a fixed size determined at creation. 138 How these entries are allocated is not important for the functioning of @io_uring@, the only requirement is that no entry is reused before the kernel has consumed it. 182 How these entries are allocated is not important for the functioning of @io_uring@; 183 the only requirement is that no entry is reused before the kernel has consumed it. 139 184 \item 140 185 The SQE is filled according to the desired operation. 141 This step is straight forward, the only detail worth mentioning is that SQEs have a @user_data@ field that must be filled in order to match submission and completion entries. 186 This step is straight forward. 187 The only detail worth mentioning is that SQEs have a @user_data@ field that must be filled in order to match submission and completion entries. 142 188 \item 143 189 The SQE is submitted to the submission ring by appending the index of the SQE to the ring following regular ring buffer steps: \lstinline{buffer[head] = item; head++}. 144 190 Since the head is visible to the kernel, some memory barriers may be required to prevent the compiler from reordering these operations. 145 191 Since the submission ring is a regular ring buffer, more than one SQE can be added at once and the head is updated only after all entries are updated. 192 Note, SQE can be filled and submitted in any order, \eg in Figure~\ref{fig:iouring} the submission order is S0, S3, S2 and S1 has not been submitted. 146 193 \item 147 194 The kernel is notified of the change to the ring using the system call @io_uring_enter@. … … 161 208 The @io_uring_enter@ system call is protected by a lock inside the kernel. 162 209 This protection means that concurrent call to @io_uring_enter@ using the same instance are possible, but there is no performance gained from parallel calls to @io_uring_enter@. 163 It is possible to do the first three submission steps in parallel, however, doing so requires careful synchronization. 210 It is possible to do the first three submission steps in parallel; 211 however, doing so requires careful synchronization. 164 212 165 213 @io_uring@ also introduces constraints on the number of simultaneous operations that can be ``in flight''. 166 Obviously, SQEs are allocated from a fixed-size array, meaning that there is a hard limit to how many SQEs can be submitted at once.167 In addition, the @io_uring_enter@ system call can fail because ``The kernel [...] ran out of resources to handle [a request]'' or ``The application is attempting to overcommit the number of requests it can havepending.''.214 First, SQEs are allocated from a fixed-size array, meaning that there is a hard limit to how many SQEs can be submitted at once. 215 Second, the @io_uring_enter@ system call can fail because ``The kernel [...] ran out of resources to handle [a request]'' or ``The application is attempting to overcommit the number of requests it can have pending.''. 168 216 This restriction means \io request bursts may have to be subdivided and submitted in chunks at a later time. 
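For reference, the following sketch shows the same allocation, filling, submission and completion steps expressed with the @liburing@ helper library; it is illustrative only (this work does not use @liburing@) and @fd@, @buf@, @len@, @request@ and @handle@ are assumptions of the example.
\begin{cfa}
#include <liburing.h>
struct io_uring ring;
io_uring_queue_init( 256, &ring, 0 );	// create the SQE array and both rings

struct io_uring_sqe * sqe = io_uring_get_sqe( &ring );	// step 1: allocate an SQE
io_uring_prep_read( sqe, fd, buf, len, 0 );	// step 2: fill it for a read on fd
io_uring_sqe_set_data( sqe, request );	// user_data used to match the completion
io_uring_submit( &ring );	// steps 3-4: append to the submission ring and call io_uring_enter

struct io_uring_cqe * cqe;
io_uring_wait_cqe( &ring, &cqe );	// wait for a completion entry
handle( io_uring_cqe_get_data( cqe ), cqe->res );	// res holds the operation's return value
io_uring_cqe_seen( &ring, cqe );	// mark the CQE as consumed
\end{cfa}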
169 217 170 218 \subsection{Multiplexing \io: Submission} 219 171 220 The submission side is the most complicated aspect of @io_uring@ and the completion side effectively follows from the design decisions made in the submission side. 172 While it is possible to do the first steps of submission in parallel, the duration of the system call scales with number of entries submitted. 221 While there is freedom in designing the submission side, there are some realities of @io_uring@ that must be taken into account. 222 It is possible to do the first steps of submission in parallel; 223 however, the duration of the system call scales with the number of entries submitted. 173 224 The consequence is that the amount of parallelism used to prepare submissions for the next system call is limited. 174 225 Beyond this limit, the length of the system call is the throughput limiting factor. 175 I concluded from early experiments that preparing submissions seems to take at most as long as the system call itself, which means that with a single @io_uring@ instance, there is no benefit in terms of \io throughput to having more than two \glspl{hthrd}. 176 Therefore the design of the submission engine must manage multiple instances of @io_uring@ running in parallel, effectively sharding @io_uring@ instances. 177 Similarly to scheduling, this sharding can be done privately, \ie, one instance per \glspl{proc}, in decoupled pools, \ie, a pool of \glspl{proc} use a pool of @io_uring@ instances without one-to-one coupling between any given instance and any given \gls{proc}, or some mix of the two. 178 Since completions are sent to the instance where requests were submitted, all instances with pending operations must be polled continously 179 \footnote{As will be described in Chapter~\ref{practice}, this does not translate into constant cpu usage.}. 226 I concluded from early experiments that preparing submissions seems to take almost as long as the system call itself, which means that with a single @io_uring@ instance, there is no benefit in terms of \io throughput to having more than two \glspl{hthrd}. 227 Therefore, the design of the submission engine must manage multiple instances of @io_uring@ running in parallel, effectively sharding @io_uring@ instances. 228 Since completions are sent to the instance where requests were submitted, all instances with pending operations must be polled continuously\footnote{ 229 As described in Chapter~\ref{practice}, this does not translate into constant CPU usage.}. 180 230 Note that once an operation completes, there is nothing that ties it to the @io_uring@ instance that handled it. 181 There is nothing preventing a new operation with, for example,the same file descriptors to a different @io_uring@ instance.231 There is nothing preventing a new operation with, \eg the same file descriptors to a different @io_uring@ instance. 182 232 183 233 A complicating aspect of submission is @io_uring@'s support for chains of operations, where the completion of an operation triggers the submission of the next operation on the link. 184 234 SQEs forming a chain must be allocated from the same instance and must be contiguous in the Submission Ring (see Figure~\ref{fig:iouring}). 185 The consequence of this feature is that filling SQEs can be arbitrarly complex and therefore users may need to run arbitrary code between allocation and submission. 186 Supporting chains is a requirement of the \io subsystem, but it is still valuable. 
187 Support for this feature can be fulfilled simply to supporting arbitrary user code between allocation and submission. 188 189 \subsubsection{Public Instances} 190 One approach is to have multiple shared instances. 191 \Glspl{thrd} attempting \io operations pick one of the available instances and submit operations to that instance. 192 Since there is no coupling between \glspl{proc} and @io_uring@ instances in this approach, \glspl{thrd} running on more than one \gls{proc} can attempt to submit to the same instance concurrently. 193 Since @io_uring@ effectively sets the amount of sharding needed to avoid contention on its internal locks, performance in this approach is based on two aspects: the synchronization needed to submit does not induce more contention than @io_uring@ already does and the scheme to route \io requests to specific @io_uring@ instances does not introduce contention. 194 This second aspect has an oversized importance because it comes into play before the sharding of instances, and as such, all \glspl{hthrd} can contend on the routing algorithm. 195 196 Allocation in this scheme can be handled fairly easily. 197 Free SQEs, \ie, SQEs that aren't currently being used to represent a request, can be written to safely and have a field called @user_data@ which the kernel only reads to copy to @cqe@s. 198 Allocation also requires no ordering guarantee as all free SQEs are interchangeable. 199 This requires a simple concurrent bag. 200 The only added complexity is that the number of SQEs is fixed, which means allocation can fail. 201 202 Allocation failures need to be pushed up to a routing algorithm: \glspl{thrd} attempting \io operations must not be directed to @io_uring@ instances without sufficient SQEs available. 203 Furthermore, the routing algorithm should block operations up-front if none of the instances have available SQEs. 204 205 Once an SQE is allocated, \glspl{thrd} can fill them normally, they simply need to keep track of the SQE index and which instance it belongs to. 206 207 Once an SQE is filled in, what needs to happen is that the SQE must be added to the submission ring buffer, an operation that is not thread-safe on itself, and the kernel must be notified using the @io_uring_enter@ system call. 208 The submission ring buffer is the same size as the pre-allocated SQE buffer, therefore pushing to the ring buffer cannot fail 209 \footnote{This is because it is invalid to have the same \lstinline{sqe} multiple times in the ring buffer.}. 210 However, as mentioned, the system call itself can fail with the expectation that it will be retried once some of the already submitted operations complete. 211 Since multiple SQEs can be submitted to the kernel at once, it is important to strike a balance between batching and latency. 212 Operations that are ready to be submitted should be batched together in few system calls, but at the same time, operations should not be left pending for long period of times before being submitted. 213 This can be handled by either designating one of the submitting \glspl{thrd} as the being responsible for the system call for the current batch of SQEs or by having some other party regularly submitting all ready SQEs, \eg, the poller \gls{thrd} mentioned later in this section. 
214 215 In the case of designating a \gls{thrd}, ideally, when multiple \glspl{thrd} attempt to submit operations to the same @io_uring@ instance, all requests would be batched together and one of the \glspl{thrd} would do the system call on behalf of the others, referred to as the \newterm{submitter}. 216 In practice however, it is important that the \io requests are not left pending indefinitely and as such, it may be required to have a ``next submitter'' that guarentees everything that is missed by the current submitter is seen by the next one. 217 Indeed, as long as there is a ``next'' submitter, \glspl{thrd} submitting new \io requests can move on, knowing that some future system call will include their request. 218 Once the system call is done, the submitter must also free SQEs so that the allocator can reused them. 219 220 Finally, the completion side is much simpler since the @io_uring@ system call enforces a natural synchronization point. 221 Polling simply needs to regularly do the system call, go through the produced CQEs and communicate the result back to the originating \glspl{thrd}. 222 Since CQEs only own a signed 32 bit result, in addition to the copy of the @user_data@ field, all that is needed to communicate the result is a simple future~\cite{wiki:future}. 223 If the submission side does not designate submitters, polling can also submit all SQEs as it is polling events. 224 A simple approach to polling is to allocate a \gls{thrd} per @io_uring@ instance and simply let the poller \glspl{thrd} poll their respective instances when scheduled. 225 226 With this pool of instances approach, the big advantage is that it is fairly flexible. 227 It does not impose restrictions on what \glspl{thrd} submitting \io operations can and cannot do between allocations and submissions. 228 It also can gracefully handle running out of ressources, SQEs or the kernel returning @EBUSY@. 229 The down side to this is that many of the steps used for submitting need complex synchronization to work properly. 230 The routing and allocation algorithm needs to keep track of which ring instances have available SQEs, block incoming requests if no instance is available, prevent barging if \glspl{thrd} are already queued up waiting for SQEs and handle SQEs being freed. 231 The submission side needs to safely append SQEs to the ring buffer, correctly handle chains, make sure no SQE is dropped or left pending forever, notify the allocation side when SQEs can be reused and handle the kernel returning @EBUSY@. 232 All this synchronization may have a significant cost and, compared to the next approach presented, this synchronization is entirely overhead. 235 The consequence of this feature is that filling SQEs can be arbitrarily complex, and therefore, users may need to run arbitrary code between allocation and submission. 236 Supporting chains is not a requirement of the \io subsystem, but it is still valuable. 237 Support for this feature can be fulfilled simply by supporting arbitrary user code between allocation and submission. 238 239 Similar to scheduling, sharding @io_uring@ instances can be done privately, \ie, one instance per \glspl{proc}, in decoupled pools, \ie, a pool of \glspl{proc} use a pool of @io_uring@ instances without one-to-one coupling between any given instance and any given \gls{proc}, or some mix of the two. 240 These three sharding approaches are analyzed. 233 241 234 242 \subsubsection{Private Instances} 235 Another approach is to simply create one ring instance per \gls{proc}. 
236 This alleviates the need for synchronization on the submissions, requiring only that \glspl{thrd} are not interrupted in between two submission steps. 237 This is effectively the same requirement as using @thread_local@ variables. 238 Since SQEs that are allocated must be submitted to the same ring, on the same \gls{proc}, this effectively forces the application to submit SQEs in allocation order 239 \footnote{The actual requirement is that \glspl{thrd} cannot context switch between allocation and submission. 240 This requirement means that from the subsystem's point of view, the allocation and submission are sequential. 241 To remove this requirement, a \gls{thrd} would need the ability to ``yield to a specific \gls{proc}'', \ie, park with the promise that it will be run next on a specific \gls{proc}, the \gls{proc} attached to the correct ring.} 242 , greatly simplifying both allocation and submission. 243 In this design, allocation and submission form a partitionned ring buffer as shown in Figure~\ref{fig:pring}. 244 Once added to the ring buffer, the attached \gls{proc} has a significant amount of flexibility with regards to when to do the system call. 245 Possible options are: when the \gls{proc} runs out of \glspl{thrd} to run, after running a given number of \glspl{thrd}, etc. 243 The private approach creates one ring instance per \gls{proc}, \ie one-to-one coupling. 244 This alleviates the need for synchronization on the submissions, requiring only that \glspl{thrd} are not time-sliced during submission steps. 245 This requirement is the same as accessing @thread_local@ variables, where a \gls{thrd} is accessing kernel-thread data, is time-sliced, and continues execution on another kernel thread but is now accessing the wrong data. 246 This failure is the serially reusable problem~\cite{SeriallyReusable}. 247 Hence, allocated SQEs must be submitted to the same ring on the same \gls{proc}, which effectively forces the application to submit SQEs in allocation order.\footnote{ 248 To remove this requirement, a \gls{thrd} needs the ability to ``yield to a specific \gls{proc}'', \ie, park with the guarantee it unparks on a specific \gls{proc}, \ie the \gls{proc} attached to the correct ring.} 249 From the subsystem's point of view, the allocation and submission are sequential, greatly simplifying both. 250 In this design, allocation and submission form a partitioned ring buffer as shown in Figure~\ref{fig:pring}. 251 Once added to the ring buffer, the attached \gls{proc} has a significant amount of flexibility with regards to when to perform the system call. 252 Possible options are: when the \gls{proc} runs out of \glspl{thrd} to run, after running a given number of \glspl{thrd}, \etc. 246 253 247 254 \begin{figure} … … 254 261 \end{figure} 255 262 256 This approach has the advantage that it does not require much of the synchronization needed in the shared approach. 257 This comes at the cost that \glspl{thrd} submitting \io operations have less flexibility, they cannot park or yield, and several exceptional cases are handled poorly. 258 Instances running out of SQEs cannot run \glspl{thrd} wanting to do \io operations, in such a case the \gls{thrd} needs to be moved to a different \gls{proc}, the only current way of achieving this would be to @yield()@ hoping to be scheduled on a different \gls{proc}, which is not guaranteed. 259 260 A more involved version of this approach can seem to solve most of these problems, using a pattern called \newterm{helping}. 
261 \Glspl{thrd} that wish to submit \io operations but cannot do so 262 \footnote{either because of an allocation failure or because they were migrate to a different \gls{proc} between allocation and submission} 263 create an object representing what they wish to achieve and add it to a list somewhere. 264 For this particular problem, one solution would be to have a list of pending submissions per \gls{proc} and a list of pending allocations, probably per cluster. 265 The problem with these ``solutions'' is that they are still bound by the strong coupling between \glspl{proc} and @io_uring@ instances. 266 These data structures would allow moving \glspl{thrd} to a specific \gls{proc} when the current \gls{proc} cannot fulfill the \io request. 267 268 Imagine a simple case with two \glspl{thrd} on two \glspl{proc}, one \gls{thrd} submits an \io operation and then sets a flag, the other \gls{thrd} spins until the flag is set. 269 If the first \gls{thrd} is preempted between allocation and submission and moves to the other \gls{proc}, the original \gls{proc} could start running the spinning \gls{thrd}. 270 If this happens, the helping ``solution'' is for the \io \gls{thrd}to added append an item to the submission list of the \gls{proc} where the allocation was made. 263 This approach has the advantage that it does not require much of the synchronization needed in a shared approach. 264 However, this benefit means \glspl{thrd} submitting \io operations have less flexibility: they cannot park or yield, and several exceptional cases are handled poorly. 265 Instances running out of SQEs cannot run \glspl{thrd} wanting to do \io operations. 266 In this case, the \io \gls{thrd} needs to be moved to a different \gls{proc}, and the only current way of achieving this is to @yield()@ hoping to be scheduled on a different \gls{proc} with free SQEs, which is not guaranteed. 267 268 A more involved version of this approach tries to solve these problems using a pattern called \newterm{helping}. 269 \Glspl{thrd} that cannot submit \io operations, either because of an allocation failure or migration to a different \gls{proc} between allocation and submission, create an \io object and add it to a list of pending submissions per \gls{proc} and a list of pending allocations, probably per cluster. 270 While there is still the strong coupling between \glspl{proc} and @io_uring@ instances, these data structures allow moving \glspl{thrd} to a specific \gls{proc}, when the current \gls{proc} cannot fulfill the \io request. 271 272 Imagine a simple scenario with two \glspl{thrd} on two \glspl{proc}, where one \gls{thrd} submits an \io operation and then sets a flag, while the other \gls{thrd} spins until the flag is set. 273 Assume both \glspl{thrd} are running on the same \gls{proc}, and the \io \gls{thrd} is preempted between allocation and submission, moved to the second \gls{proc}, and the original \gls{proc} starts running the spinning \gls{thrd}. 274 In this case, the helping solution has the \io \gls{thrd} append an \io object to the submission list of the first \gls{proc}, where the allocation was made. 271 275 No other \gls{proc} can help the \gls{thrd} since @io_uring@ instances are strongly coupled to \glspl{proc}. 
However, in this case, the \gls{proc} is unable to help because it is executing the spinning \gls{thrd} mentioned when first expression this case 273 \footnote{This particular example is completely artificial, but in the presence of many more \glspl{thrd}, it is not impossible that this problem would arise ``in the wild''. 274 Furthermore, this pattern is difficult to reliably detect and avoid.} 275 resulting in a deadlock. 276 Once in this situation, the only escape is to interrupted the execution of the \gls{thrd}, either directly or due to regular preemption, only then can the \gls{proc} take the time to handle the pending request to help. 277 Interrupting \glspl{thrd} for this purpose is far from desireable, the cost is significant and the situation may be hard to detect. 278 However, a more subtle reason why interrupting the \gls{thrd} is not a satisfying solution is that the \gls{proc} is not actually using the instance it is tied to. 279 If it were to use it, then helping could be done as part of the usage. 276 However, the \io \gls{proc} is unable to help because it is executing the spinning \gls{thrd}, resulting in a deadlock. 277 While this example is artificial, in the presence of many \glspl{thrd}, it is possible for this problem to arise ``in the wild''. 278 Furthermore, this pattern is difficult to reliably detect and avoid. 279 Once in this situation, the only escape is to interrupt the spinning \gls{thrd}, either directly or via some regular preemption, \eg time slicing. 280 Having to interrupt \glspl{thrd} for this purpose is costly, the latency between interrupts can be large, and the situation may be hard to detect. 280 281 Interrupts are needed here entirely because the \gls{proc} is tied to an instance it is not using. 281 Therefore a more satisfying solution would be for the \gls{thrd} submitting the operation to simply notice that the instance is unused and simply go ahead and use it. 282 This is the approach presented next. 282 Therefore, a more satisfying solution is for the \gls{thrd} submitting the operation to notice that the instance is unused and simply go ahead and use it. 283 This approach is presented shortly. 284 285 \subsubsection{Public Instances} 286 The public approach creates decoupled pools of @io_uring@ instances and processors, \ie without one-to-one coupling. 287 \Glspl{thrd} attempting an \io operation pick one of the available instances and submit the operation to that instance. 288 Since there is no coupling between @io_uring@ instances and \glspl{proc} in this approach, \glspl{thrd} running on more than one \gls{proc} can attempt to submit to the same instance concurrently. 289 Because @io_uring@ effectively sets the amount of sharding needed to avoid contention on its internal locks, performance in this approach is based on two aspects: 290 \begin{itemize} 291 \item 292 The synchronization needed to submit does not induce more contention than @io_uring@ already does. 293 \item 294 The scheme to route \io requests to specific @io_uring@ instances does not introduce contention. 295 This aspect has an oversized importance because it comes into play before the sharding of instances, and as such, all \glspl{hthrd} can contend on the routing algorithm. 296 \end{itemize} 297 298 Allocation in this scheme is fairly easy. 299 Free SQEs, \ie, SQEs that are not currently being used to represent a request, can be written to safely and have a field called @user_data@ that the kernel only reads to copy to @cqe@s. 
300 Allocation also requires no ordering guarantee as all free SQEs are interchangeable. 301 The only added complexity is that the number of SQEs is fixed, which means allocation can fail. 302 303 Allocation failures need to be pushed to a routing algorithm: \glspl{thrd} attempting \io operations must not be directed to @io_uring@ instances without sufficient SQEs available. 304 Furthermore, the routing algorithm should block operations up-front, if none of the instances have available SQEs. 305 306 Once an SQE is allocated, \glspl{thrd} insert the \io request information, and keep track of the SQE index and the instance it belongs to. 307 308 Once an SQE is filled in, it is added to the submission ring buffer, an operation that is not thread-safe, and then the kernel must be notified using the @io_uring_enter@ system call. 309 The submission ring buffer is the same size as the pre-allocated SQE buffer, therefore pushing to the ring buffer cannot fail because it would mean a \lstinline{sqe} multiple times in the ring buffer, which is undefined behaviour. 310 However, as mentioned, the system call itself can fail with the expectation that it can be retried once some submitted operations complete. 311 312 Since multiple SQEs can be submitted to the kernel at once, it is important to strike a balance between batching and latency. 313 Operations that are ready to be submitted should be batched together in few system calls, but at the same time, operations should not be left pending for long period of times before being submitted. 314 Balancing submission can be handled by either designating one of the submitting \glspl{thrd} as the being responsible for the system call for the current batch of SQEs or by having some other party regularly submitting all ready SQEs, \eg, the poller \gls{thrd} mentioned later in this section. 315 316 Ideally, when multiple \glspl{thrd} attempt to submit operations to the same @io_uring@ instance, all requests should be batched together and one of the \glspl{thrd} is designated to do the system call on behalf of the others, called the \newterm{submitter}. 317 However, in practice, \io requests must be handed promptly so there is a need to guarantee everything missed by the current submitter is seen by the next one. 318 Indeed, as long as there is a ``next'' submitter, \glspl{thrd} submitting new \io requests can move on, knowing that some future system call includes their request. 319 Once the system call is done, the submitter must also free SQEs so that the allocator can reused them. 320 321 Finally, the completion side is much simpler since the @io_uring@ system-call enforces a natural synchronization point. 322 Polling simply needs to regularly do the system call, go through the produced CQEs and communicate the result back to the originating \glspl{thrd}. 323 Since CQEs only own a signed 32 bit result, in addition to the copy of the @user_data@ field, all that is needed to communicate the result is a simple future~\cite{wiki:future}. 324 If the submission side does not designate submitters, polling can also submit all SQEs as it is polling events. 325 A simple approach to polling is to allocate a \gls{thrd} per @io_uring@ instance and simply let the poller \glspl{thrd} poll their respective instances when scheduled. 326 327 With the pool of SEQ instances approach, the big advantage is that it is fairly flexible. 328 It does not impose restrictions on what \glspl{thrd} submitting \io operations can and cannot do between allocations and submissions. 
329 It also can gracefully handle running out of resources, SQEs or the kernel returning @EBUSY@. 330 The down side to this approach is that many of the steps used for submitting need complex synchronization to work properly. 331 The routing and allocation algorithm needs to keep track of which ring instances have available SQEs, block incoming requests if no instance is available, prevent barging if \glspl{thrd} are already queued up waiting for SQEs and handle SQEs being freed. 332 The submission side needs to safely append SQEs to the ring buffer, correctly handle chains, make sure no SQE is dropped or left pending forever, notify the allocation side when SQEs can be reused, and handle the kernel returning @EBUSY@. 333 All this synchronization has a significant cost, and compared to the private-instance approach, this synchronization is entirely overhead. 283 334 284 335 \subsubsection{Instance borrowing} 285 Both of the approaches presented above have undesirable aspects that stem from too loose or too tight coupling between @io_uring@ and \glspl{proc}. 286 In the first approach, loose coupling meant that all operations have synchronization overhead that a tighter coupling can avoid. 287 The second approach on the other hand suffers from tight coupling causing problems when the \gls{proc} do not benefit from the coupling. 288 While \glspl{proc} are continously issuing \io operations tight coupling is valuable since it avoids synchronization costs. 289 However, in unlikely failure cases or when \glspl{proc} are not making use of their instance, tight coupling is no longer advantageous. 290 A compromise between these approaches would be to allow tight coupling but have the option to revoke this coupling dynamically when failure cases arise. 291 I call this approach ``instance borrowing''\footnote{While it looks similar to work-sharing and work-stealing, I think it is different enough from either to warrant a different verb to avoid confusion.}. 292 293 In this approach, each cluster owns a pool of @io_uring@ instances managed by an arbiter. 336 Both of the prior approaches have undesirable aspects that stem from tight or loose coupling between @io_uring@ and \glspl{proc}. 337 The first approach suffers from tight coupling causing problems when a \gls{proc} does not benefit from the coupling. 338 The second approach suffers from loose coupling causing operations to have synchronization overhead, which tighter coupling avoids. 339 When \glspl{proc} are continuously issuing \io operations, tight coupling is valuable since it avoids synchronization costs. 340 However, in unlikely failure cases or when \glspl{proc} are not using their instances, tight coupling is no longer advantageous. 341 A compromise between these approaches is to allow tight coupling but have the option to revoke the coupling dynamically when failure cases arise. 342 I call this approach \newterm{instance borrowing}.\footnote{ 343 While instance borrowing looks similar to work sharing and stealing, I think it is different enough to warrant a different verb to avoid confusion.} 344 345 In this approach, each cluster, see Figure~\ref{fig:system}, owns a pool of @io_uring@ instances managed by an \newterm{arbiter}. 294 346 When a \gls{thrd} attempts to issue an \io operation, it ask for an instance from the arbiter and issues requests to that instance. 
295 However, in doing so it ties to the instance to the \gls{proc} it is currentlyrunning on.296 This coupling is kept until the arbiter decides to revoke it, taking back the instance and reverting the \gls{proc} to its initial state with respect to \io.297 This tight coupling means that synchronization can be minimal since only one \gls{proc} can use the instance at a ny giventime, akin to the private instances approach.298 However, where it differs is that revocation fromthe arbiter means this approach does not suffer from the deadlock scenario described above.347 This instance is now bound to the \gls{proc} the \gls{thrd} is running on. 348 This binding is kept until the arbiter decides to revoke it, taking back the instance and reverting the \gls{proc} to its initial state with respect to \io. 349 This tight coupling means that synchronization can be minimal since only one \gls{proc} can use the instance at a time, akin to the private instances approach. 350 However, it differs in that revocation by the arbiter means this approach does not suffer from the deadlock scenario described above. 299 351 300 352 Arbitration is needed in the following cases: 301 353 \begin{enumerate} 302 \item The current \gls{proc} does not currentlyhold an instance.354 \item The current \gls{proc} does not hold an instance. 303 355 \item The current instance does not have sufficient SQEs to satisfy the request. 304 \item The current \gls{proc} has the wrong instance, this happens if the submitting \gls{thrd} context-switched between allocation and submission. 305 I will refer to these as \newterm{External Submissions}. 356 \item The current \gls{proc} has a wrong instance, this happens if the submitting \gls{thrd} context-switched between allocation and submission, called \newterm{external submissions}. 306 357 \end{enumerate} 307 However, even when the arbiter is not directly needed, \glspl{proc} need to make sure that their ownership of the instance is not being revoked.308 This can be accomplished by a lock-less handshake\footnote{Note that the handshake is not Lock-\emph{Free} since it lacks the proper progress guarantee.}. 358 However, even when the arbiter is not directly needed, \glspl{proc} need to make sure that their instance ownership is not being revoked, which is accomplished by a lock-\emph{less} handshake.\footnote{ 359 Note the handshake is not lock \emph{free} since it lacks the proper progress guarantee.} 309 360 A \gls{proc} raises a local flag before using its borrowed instance and checks if the instance is marked as revoked or if the arbiter has raised its flag. 310 If not it proceeds, otherwise it delegates the operation to the arbiter.361 If not, it proceeds, otherwise it delegates the operation to the arbiter. 311 362 Once the operation is completed, the \gls{proc} lowers its local flag. 312 363 313 Correspondingly, before revoking an instance the arbiter marks the instance and then waits for the \gls{proc} using it to lower its local flag.364 Correspondingly, before revoking an instance, the arbiter marks the instance and then waits for the \gls{proc} using it to lower its local flag. 314 365 Only then does it reclaim the instance and potentially assign it to an other \gls{proc}. 315 366 … … 323 374 324 375 \paragraph{External Submissions} are handled by the arbiter by revoking the appropriate instance and adding the submission to the submission ring. 325 There is no need to immediately revoke the instance however.376 However, there is no need to immediately revoke the instance. 
326 377 External submissions must simply be added to the ring before the next system call, \ie, when the submission ring is flushed. 327 This means that whoever is responsible for the system call first checks if the instance has any external submissions. 328 If it is the case, it asks the arbiter to revoke the instance and add the external submissions to the ring. 329 330 \paragraph{Pending Allocations} can be more complicated to handle. 331 If the arbiter has available instances, the arbiter can attempt to directly hand over the instance and satisfy the request. 332 Otherwise it must hold onto the list of threads until SQEs are made available again. 333 This handling becomes that much more complex if pending allocation require more than one SQE, since the arbiter must make a decision between statisfying requests in FIFO ordering or satisfy requests for fewer SQEs first. 334 335 While this arbiter has the potential to solve many of the problems mentionned in above, it also introduces a significant amount of complexity. 378 This means whoever is responsible for the system call, first checks if the instance has any external submissions. 379 If so, it asks the arbiter to revoke the instance and add the external submissions to the ring. 380 381 \paragraph{Pending Allocations} are handled by the arbiter when it has available instances and can directly hand over the instance and satisfy the request. 382 Otherwise, it must hold onto the list of threads until SQEs are made available again. 383 This handling is more complex when an allocation requires multiple SQEs, since the arbiter must make a decision between satisfying requests in FIFO ordering or for fewer SQEs. 384 385 While an arbiter has the potential to solve many of the problems mentioned above, it also introduces a significant amount of complexity. 336 386 Tracking which processors are borrowing which instances and which instances have SQEs available ends-up adding a significant synchronization prelude to any I/O operation. 337 387 Any submission must start with a handshake that pins the currently borrowed instance, if available. 338 388 An attempt to allocate is then made, but the arbiter can concurrently be attempting to allocate from the same instance from a different \gls{hthrd}. 339 Once the allocation is completed, the submission must still check that the instance is still burrowed before attemptto flush.340 These extra synchronization steps end-up having a similar cost to the multiple sharedinstances approach.389 Once the allocation is completed, the submission must check that the instance is still burrowed before attempting to flush. 390 These synchronization steps turn out to have a similar cost to the multiple shared-instances approach. 341 391 Furthermore, if the number of instances does not match the number of processors actively submitting I/O, the system can fall into a state where instances are constantly being revoked and end-up cycling the processors, which leads to significant cache deterioration. 342 Because ofthese reasons, this approach, which sounds promising on paper, does not improve on the private instance approach in practice.392 For these reasons, this approach, which sounds promising on paper, does not improve on the private instance approach in practice. 
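To make the pinning handshake concrete, the following is a minimal sketch of the \proc-side pin/unpin and the arbiter-side revocation described above, using C11 atomics; the @instance@ structure and helper names are illustrative only and are not the actual \CFA runtime code.
\begin{cfa}
#include <stdatomic.h>
#include <stdbool.h>

// hypothetical per-instance state for the revocation handshake (sketch only)
struct instance {
	atomic_bool in_use;    // raised by the owning proc while it uses the instance
	atomic_bool revoked;   // raised by the arbiter to take the instance back
};

// proc side: pin the borrowed instance before allocating or flushing
static bool pin( struct instance * inst ) {
	atomic_store( &inst->in_use, true );
	if ( atomic_load( &inst->revoked ) ) {     // arbiter wants the instance back
		atomic_store( &inst->in_use, false );  // back off and delegate to the arbiter
		return false;
	}
	return true;                               // instance is safe to use until unpin
}
static void unpin( struct instance * inst ) {
	atomic_store( &inst->in_use, false );
}

// arbiter side: revoke the instance from its current proc
static void revoke( struct instance * inst ) {
	atomic_store( &inst->revoked, true );
	while ( atomic_load( &inst->in_use ) ) {}  // wait for the proc to finish its operation
	// the instance can now be reassigned to another proc
}
\end{cfa}
As noted above, the handshake is not lock free: a \proc that never lowers its flag can delay the arbiter indefinitely.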
343 393 344 394 \subsubsection{Private Instances V2} 345 395 346 347 348 396 % Verbs of this design 349 397 350 398 % Allocation: obtaining an sqe from which to fill in the io request, enforces the io instance to use since it must be the one which provided the sqe. Must interact with the arbiter if the instance does not have enough sqe for the allocation. (Typical allocation will ask for only one sqe, but chained sqe must be allocated from the same context so chains of sqe must be allocated in bulks) 351 399 352 % Submi tion: simply adds the sqe(s) to some data structure to communicate that they are ready to go. This operation can't fail because there are as many spots in the submit buffer than there are sqes. Must interact with the arbiter only if the thread was moved between the allocation and the submission.400 % Submission: simply adds the sqe(s) to some data structure to communicate that they are ready to go. This operation can't fail because there are as many spots in the submit buffer than there are sqes. Must interact with the arbiter only if the thread was moved between the allocation and the submission. 353 401 354 402 % Flushing: Taking all the sqes that were submitted and making them visible to the kernel, also counting them in order to figure out what to_submit should be. Must be thread-safe with submission. Has to interact with the Arbiter if there are external submissions. Can't simply use a protected queue because adding to the array is not safe if the ring is still available for submitters. Flushing must therefore: check if there are external pending requests if so, ask the arbiter to flush otherwise use the fast flush operation. … … 357 405 358 406 % Handle: process all the produced cqe. No need to interact with any of the submission operations or the arbiter. 359 360 361 407 362 408 … … 404 450 405 451 \section{Interface} 406 Finally, the last important part of the \io subsystem is it's interface. There are multiple approaches that can be offered to programmers, each with advantages and disadvantages. The new \io subsystem can replace the C runtime's API or extend it. And in the later case the interface can go from very similar to vastly different. The following sections discuss some useful options using @read@ as an example. The standard Linux interface for C is : 407 408 @ssize_t read(int fd, void *buf, size_t count);@ 452 The last important part of the \io subsystem is its interface. 453 There are multiple approaches that can be offered to programmers, each with advantages and disadvantages. 454 The new \io subsystem can replace the C runtime API or extend it, and in the later case, the interface can go from very similar to vastly different. 455 The following sections discuss some useful options using @read@ as an example. 456 The standard Linux interface for C is : 457 \begin{cfa} 458 ssize_t read(int fd, void *buf, size_t count); 459 \end{cfa} 409 460 410 461 \subsection{Replacement} 411 462 Replacing the C \glsxtrshort{api} is the more intrusive and draconian approach. 412 463 The goal is to convince the compiler and linker to replace any calls to @read@ to direct them to the \CFA implementation instead of glibc's. 413 This has the advantage of potentiallyworking transparently and supporting existing binaries without needing recompilation.464 This rerouting has the advantage of working transparently and supporting existing binaries without needing recompilation. 
414 465 It also offers a, presumably, well known and familiar API that C programmers can simply continue to work with. 415 However, this approach also entails a plethora of subtle technical challenges which generally boils down to making a perfect replacement.466 However, this approach also entails a plethora of subtle technical challenges, which generally boils down to making a perfect replacement. 416 467 If the \CFA interface replaces only \emph{some} of the calls to glibc, then this can easily lead to esoteric concurrency bugs. 417 Since the gcc ecosystems does not offer a scheme for suchperfect replacement, this approach was rejected as being laudable but infeasible.468 Since the gcc ecosystems does not offer a scheme for perfect replacement, this approach was rejected as being laudable but infeasible. 418 469 419 470 \subsection{Synchronous Extension} 420 An other interface option is to simply offer an interface that is different in name only. For example: 421 422 @ssize_t cfa_read(int fd, void *buf, size_t count);@ 423 424 \noindent This is much more feasible but still familiar to C programmers. 425 It comes with the caveat that any code attempting to use it must be recompiled, which can be a big problem considering the amount of existing legacy C binaries. 471 Another interface option is to offer an interface different in name only. 472 For example: 473 \begin{cfa} 474 ssize_t cfa_read(int fd, void *buf, size_t count); 475 \end{cfa} 476 This approach is feasible and still familiar to C programmers. 477 It comes with the caveat that any code attempting to use it must be recompiled, which is a problem considering the amount of existing legacy C binaries. 426 478 However, it has the advantage of implementation simplicity. 479 Finally, there is a certain irony to using a blocking synchronous interfaces for a feature often referred to as ``non-blocking'' \io. 427 480 428 481 \subsection{Asynchronous Extension} 429 It is important to mention that there is a certain irony to using only synchronous, therefore blocking, interfaces for a feature often referred to as ``non-blocking'' \io. 430 A fairly traditional way of doing this is using futures\cit{wikipedia futures}. 431 As simple way of doing so is as follows: 432 433 @future(ssize_t) read(int fd, void *buf, size_t count);@ 434 435 \noindent Note that this approach is not necessarily the most idiomatic usage of futures. 436 The definition of read above ``returns'' the read content through an output parameter which cannot be synchronized on. 437 A more classical asynchronous API could look more like: 438 439 @future([ssize_t, void *]) read(int fd, size_t count);@ 440 441 \noindent However, this interface immediately introduces memory lifetime challenges since the call must effectively allocate a buffer to be returned. 442 Because of the performance implications of this, the first approach is considered preferable as it is more familiar to C programmers. 443 444 \subsection{Interface directly to \lstinline{io_uring}} 445 Finally, an other interface that can be relevant is to simply expose directly the underlying \texttt{io\_uring} interface. For example: 446 447 @array(SQE, want) cfa_io_allocate(int want);@ 448 449 @void cfa_io_submit( const array(SQE, have) & );@ 450 451 \noindent This offers more flexibility to users wanting to fully use all of the \texttt{io\_uring} features. 
482 A fairly traditional way of providing asynchronous interactions is using a future mechanism~\cite{multilisp}, \eg: 483 \begin{cfa} 484 future(ssize_t) read(int fd, void *buf, size_t count); 485 \end{cfa} 486 where the generic @future@ is fulfilled when the read completes and it contains the number of bytes read, which may be less than the number of bytes requested. 487 The data read is placed in @buf@. 488 The problem is that both the bytes read and data form the synchronization object, not just the bytes read. 489 Hence, the buffer cannot be reused until the operation completes but the synchronization does not cover the buffer. 490 A classical asynchronous API is: 491 \begin{cfa} 492 future([ssize_t, void *]) read(int fd, size_t count); 493 \end{cfa} 494 where the future tuple covers the components that require synchronization. 495 However, this interface immediately introduces memory lifetime challenges since the call must effectively allocate a buffer to be returned. 496 Because of the performance implications of this API, the first approach is considered preferable as it is more familiar to C programmers. 497 498 \subsection{Direct \lstinline{io_uring} Interface} 499 The last interface directly exposes the underlying @io_uring@ interface, \eg: 500 \begin{cfa} 501 array(SQE, want) cfa_io_allocate(int want); 502 void cfa_io_submit( const array(SQE, have) & ); 503 \end{cfa} 504 where the generic @array@ contains an array of SQEs with a size that may be less than the request. 505 This offers more flexibility to users wanting to fully utilize all of the @io_uring@ features. 452 506 However, it is not the most user-friendly option. 453 It obviously imposes a strong dependency between user code and \texttt{io\_uring} but at the same time restricting users to usages that are compatible with how \CFA internally uses \texttt{io\_uring}. 454 455 507 It obviously imposes a strong dependency between user code and @io_uring@ but at the same time restricting users to usages that are compatible with how \CFA internally uses @io_uring@. -
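To make the ``simple future'' used to communicate \io results concrete, the following is a minimal, hand-rolled sketch in plain C; it is illustrative only, does not show the \CFA library's future type, and a real runtime would park the waiting \gls{thrd} instead of spinning.
\begin{cfa}
#include <stdatomic.h>
#include <stdbool.h>
#include <sys/types.h>

// minimal single-use future carrying the signed result of an io operation
typedef struct {
	atomic_bool done;
	ssize_t result;      // copy of the CQE result for the matching user_data
} io_future;

// completion side: called by the poller when the matching CQE is reaped
static void fulfil( io_future * f, ssize_t res ) {
	f->result = res;
	atomic_store( &f->done, true );            // publish the result before raising the flag
}

// caller side: wait until the operation completes, then return the result
static ssize_t wait_result( io_future * f ) {
	while ( ! atomic_load( &f->done ) ) {}     // a real runtime parks the thread here
	return f->result;
}
\end{cfa}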
doc/theses/thierry_delisle_PhD/thesis/text/practice.tex
r4e2befe3 rdef751f 1 1 \chapter{Scheduling in practice}\label{practice} 2 The scheduling algorithm d iscribed in Chapter~\ref{core} addresses scheduling in a stable state.3 However, it does not address problems that occur when the system changes state.2 The scheduling algorithm described in Chapter~\ref{core} addresses scheduling in a stable state. 3 This chapter addresses problems that occur when the system state changes. 4 4 Indeed the \CFA runtime, supports expanding and shrinking the number of \procs, both manually and, to some extent, automatically. 5 This entails that the scheduling algorithm must support these transitions. 6 7 More precise \CFA supports adding \procs using the RAII object @processor@. 8 These objects can be created at any time and can be destroyed at any time. 9 They are normally created as automatic stack variables, but this is not a requirement. 10 11 The consequence is that the scheduler and \io subsystems must support \procs comming in and out of existence. 5 These changes affect the scheduling algorithm, which must dynamically alter its behaviour. 6 7 In detail, \CFA supports adding \procs using the type @processor@, in both RAII and heap coding scenarios. 8 \begin{cfa} 9 { 10 processor p[4]; // 4 new kernel threads 11 ... // execute on 4 processors 12 processor * dp = new( processor, 6 ); // 6 new kernel threads 13 ... // execute on 10 processors 14 delete( dp ); // delete 6 kernel threads 15 ... // execute on 4 processors 16 } // delete 4 kernel threads 17 \end{cfa} 18 Dynamically allocated processors can be deleted an any time, \ie their lifetime exceeds the block of creation. 19 The consequence is that the scheduler and \io subsystems must know when these \procs come in and out of existence and roll them into the appropriate scheduling algorithms. 12 20 13 21 \section{Manual Resizing} 14 22 Manual resizing is expected to be a rare operation. 15 Programmers are mostly expected to resize clusters on startup orteardown.16 Therefore dynamically changing the number of \procs is an appropriate moment to allocate or free resources to match the new state.17 As such all internal arrays that are sized based on the number of \procs need to be \texttt{realloc}ed.18 This also means that any references into these arrays, pointers or indexes, may need to be fixed when shrinking\footnote{Indexes may still need fixing when shrinkingbecause some indexes are expected to refer to dense contiguous resources and there is no guarantee the resource being removed has the highest index.}.23 Programmers normally create/delete processors on a clusters at startup/teardown. 24 Therefore, dynamically changing the number of \procs is an appropriate moment to allocate or free resources to match the new state. 25 As such, all internal scheduling arrays that are sized based on the number of \procs need to be @realloc@ed. 26 This requirement also means any references into these arrays, \eg pointers or indexes, may need to be updated if elements are moved for compaction or for any other reason. 19 27 20 28 There are no performance requirements, within reason, for resizing since it is expected to be rare. 21 However, this operation has strict correctness requirements since shrinking and idle sleep can easily lead to deadlocks.29 However, this operation has strict correctness requirements since updating and idle sleep can easily lead to deadlocks. 22 30 It should also avoid as much as possible any effect on performance when the number of \procs remain constant. 
23 31 This later requirement prohibits naive solutions, like simply adding a global lock to the ready-queue arrays. 24 32 25 33 \subsection{Read-Copy-Update} 26 One solution is to use the Read-Copy-Update\cite{wiki:rcu} pattern. 27 In this pattern, resizing is done by creating a copy of the internal data strucures, updating the copy with the desired changes, and then attempt an Idiana Jones Switch to replace the original witht the copy. 28 This approach potentially has the advantage that it may not need any synchronization to do the switch. 29 However, there is a race where \procs could still use the previous, original, data structure after the copy was switched in. 30 This race not only requires some added memory reclamation scheme, it also requires that operations made on the stale original version be eventually moved to the copy. 31 32 For linked-lists, enqueing is only somewhat problematic, \ats enqueued to the original queues need to be transferred to the new, which might not preserve ordering. 33 Dequeing is more challenging. 34 Dequeing from the original will not necessarily update the copy which could lead to multiple \procs dequeing the same \at. 35 Fixing this requires more synchronization or more indirection on the queues. 36 37 Another challenge is that the original must be kept until all \procs have witnessed the change. 38 This is a straight forward memory reclamation challenge but it does mean that every operation will need \emph{some} form of synchronization. 39 If each of these operation does need synchronization then it is possible a simpler solution achieves the same performance. 40 Because in addition to the classic challenge of memory reclamation, transferring the original data to the copy before reclaiming it poses additional challenges. 34 One solution is to use the Read-Copy-Update pattern~\cite{wiki:rcu}. 35 In this pattern, resizing is done by creating a copy of the internal data structures, \eg see Figure~\ref{fig:base-ts2}, updating the copy with the desired changes, and then attempt an Indiana Jones Switch to replace the original with the copy. 36 This approach has the advantage that it may not need any synchronization to do the switch. 37 However, there is a race where \procs still use the original data structure after the copy is switched. 38 This race not only requires adding a memory-reclamation scheme, it also requires that operations made on the stale original version are eventually moved to the copy. 39 40 Specifically, the original data structure must be kept until all \procs have witnessed the change. 41 This requirement is the \newterm{memory reclamation challenge} and means every operation needs \emph{some} form of synchronization. 42 If all operations need synchronization, then the overall cost of this technique is likely to be similar to an uncontended lock approach. 43 In addition to the classic challenge of memory reclamation, transferring the original data to the copy before reclaiming it poses additional challenges. 41 44 Especially merging subqueues while having a minimal impact on fairness and locality. 42 45 43 \subsection{Read-Writer Lock} 44 A simpler approach would be to use a \newterm{Readers-Writer Lock}\cite{wiki:rwlock} where the resizing requires acquiring the lock as a writer while simply enqueing/dequeing \ats requires acquiring the lock as a reader. 46 For example, given a linked-list, having a node enqueued onto the original and new list is not necessarily a problem depending on the chosen list structure. 
47 If the list supports arbitrary insertions, then inconsistencies in the tail pointer do not break the list; 48 however, ordering may not be preserved. 49 Furthermore, nodes enqueued to the original queues eventually need to be uniquely transferred to the new queues, which may further perturb ordering. 50 Dequeuing is more challenging when nodes appear on both lists because of pending reclamation: dequeuing a node from one list does not remove it from the other nor is that node in the same place on the other list. 51 This situation can lead to multiple \procs dequeuing the same \at. 52 Fixing these challenges requires more synchronization or more indirection to the queues, plus coordinated searching to ensure unique elements. 53 54 \subsection{Readers-Writer Lock} 55 A simpler approach is to use a \newterm{Readers-Writer Lock}~\cite{wiki:rwlock}, where the resizing requires acquiring the lock as a writer while simply enqueueing/dequeuing \ats requires acquiring the lock as a reader. 45 56 Using a Readers-Writer lock solves the problem of dynamically resizing and leaves the challenge of finding or building a lock with sufficient good read-side performance. 46 Since this is not a very complex challenge and an ad-hoc solution is perfectly acceptable, building a Readers-Writer lock was the path taken. 47 48 To maximize reader scalability, the readers should not contend with eachother when attempting to acquire and release the critical sections. 49 This effectively requires that each reader have its own piece of memory to mark as locked and unlocked. 50 Reades then acquire the lock wait for writers to finish the critical section and then acquire their local spinlocks. 51 Writers acquire the global lock, so writers have mutual exclusion among themselves, and then acquires each of the local reader locks. 52 Acquiring all the local locks guarantees mutual exclusion between the readers and the writer, while the wait on the read side prevents readers from continously starving the writer. 53 \todo{reference listings} 54 55 \begin{lstlisting} 57 Since this approach is not a very complex challenge and an ad-hoc solution is perfectly acceptable, building a Readers-Writer lock was the path taken. 58 59 To maximize reader scalability, readers should not contend with each other when attempting to acquire and release a critical section. 60 To achieve this goal requires each reader to have its own memory to mark as locked and unlocked. 61 The read acquire possibly waits for a writer to finish the critical section and then acquires a reader's local spinlock. 62 The write acquire acquires the global lock, guaranteeing mutual exclusion among writers, and then acquires each of the local reader locks. 63 Acquiring all the local read locks guarantees mutual exclusion among the readers and the writer, while the wait on the read side prevents readers from continuously starving the writer. 64 65 Figure~\ref{f:SpecializedReadersWriterLock} shows the outline for this specialized readers-writer lock. 66 The lock in nonblocking, so both readers and writers spin while the lock is held. 
67 \todo{finish explanation} 68 69 \begin{figure} 70 \begin{cfa} 56 71 void read_lock() { 57 72 // Step 1 : make sure no writers in 58 73 while write_lock { Pause(); } 59 60 // May need fence here61 62 74 // Step 2 : acquire our local lock 63 while atomic_xchg( tls.lock ) { 64 Pause(); 65 } 66 } 67 75 while atomic_xchg( tls.lock ) { Pause(); } 76 } 68 77 void read_unlock() { 69 78 tls.lock = false; 70 79 } 71 \end{lstlisting}72 73 \begin{lstlisting}74 80 void write_lock() { 75 81 // Step 1 : lock global lock 76 while atomic_xchg( write_lock ) { 77 Pause(); 78 } 79 82 while atomic_xchg( write_lock ) { Pause(); } 80 83 // Step 2 : lock per-proc locks 81 84 for t in all_tls { 82 while atomic_xchg( t.lock ) { 83 Pause(); 84 } 85 while atomic_xchg( t.lock ) { Pause(); } 85 86 } 86 87 } 87 88 88 void write_unlock() { 89 89 // Step 1 : release local locks 90 for t in all_tls { 91 t.lock = false; 92 } 93 90 for t in all_tls { t.lock = false; } 94 91 // Step 2 : release global lock 95 92 write_lock = false; 96 93 } 97 \end{lstlisting} 98 99 \section{Idle-Sleep} 100 In addition to users manually changing the number of \procs, it is desireable to support ``removing'' \procs when there is not enough \ats for all the \procs to be useful. 101 While manual resizing is expected to be rare, the number of \ats is expected to vary much more which means \procs may need to be ``removed'' for only short periods of time. 102 Furthermore, race conditions that spuriously lead to the impression that no \ats are ready are actually common in practice. 103 Therefore resources associated with \procs should not be freed but \procs simply put into an idle state where the \gls{kthrd} is blocked until more \ats become ready. 104 This state is referred to as \newterm{Idle-Sleep}. 94 \end{cfa} 95 \caption{Specialized Readers-Writer Lock} 96 \label{f:SpecializedReadersWriterLock} 97 \end{figure} 98 99 \section{Idle-Sleep}\label{idlesleep} 100 While manual resizing of \procs is expected to be rare, the number of \ats can vary significantly over an application's lifetime, which means there are times when there are too few or too many \procs. 101 For this work, it is the programer's responsibility to manually create \procs, so if there are too few \procs, the application must address this issue. 102 This leaves too many \procs when there are not enough \ats for all the \procs to be useful. 103 These idle \procs cannot be removed because their lifetime is controlled by the application, and only the application knows when the number of \ats may increase or decrease. 104 While idle \procs can spin until work appears, this approach wastes energy, unnecessarily produces heat and prevents other applications from using the processor. 105 Therefore, idle \procs are put into an idle state, called \newterm{Idle-Sleep}, where the \gls{kthrd} is blocked until the scheduler deems it is needed. 105 106 106 107 Idle sleep effectively encompasses several challenges. 107 First some data structure needs to keep track of all \procs that are in idle sleep. 108 Because of idle sleep can be spurious, this data structure has strict performance requirements in addition to the strict correctness requirements. 109 Next, some tool must be used to block kernel threads \glspl{kthrd}, \eg \texttt{pthread\_cond\_wait}, pthread semaphores. 110 The complexity here is to support \at parking and unparking, timers, \io operations and all other \CFA features with minimal complexity. 
111 Finally, idle sleep also includes a heuristic to determine the appropriate number of \procs to be in idle sleep an any given time. 112 This third challenge is however outside the scope of this thesis because developping a general heuristic is involved enough to justify its own work. 113 The \CFA scheduler simply follows the ``Race-to-Idle'\cit{https://doi.org/10.1137/1.9781611973099.100}' approach where a sleeping \proc is woken any time an \at becomes ready and \procs go to idle sleep anytime they run out of work. 108 First, a data structure needs to keep track of all \procs that are in idle sleep. 109 Because idle sleep is spurious, this data structure has strict performance requirements, in addition to strict correctness requirements. 110 Next, some mechanism is needed to block \glspl{kthrd}, \eg @pthread_cond_wait@ on a pthread semaphore. 111 The complexity here is to support \at parking and unparking, user-level locking, timers, \io operations, and all other \CFA features with minimal complexity. 112 Finally, the scheduler needs a heuristic to determine when to block and unblock an appropriate number of \procs. 113 However, this third challenge is outside the scope of this thesis because developing a general heuristic is complex enough to justify its own work. 114 Therefore, the \CFA scheduler simply follows the ``Race-to-Idle''~\cite{Albers12} approach where a sleeping \proc is woken any time a \at becomes ready and \procs go to idle sleep anytime they run out of work. 115 An interesting sub-part of this heuristic is what to do with bursts of \ats that become ready. 116 Since waking up a sleeping \proc can have notable latency, it is possible multiple \ats become ready while a single \proc is waking up. 117 This facts begs the question, if many \procs are available, how many should be woken? 118 If the ready \ats will run longer than the wake-up latency, waking one \proc per \at will offer maximum parallelisation. 119 If the ready \ats will run for a short very short time, waking many \procs may be wasteful. 120 As mentioned, a heuristic to handle these complex cases is outside the scope of this thesis, the behaviour of the scheduler in this particular case is left unspecified. 114 121 115 122 \section{Sleeping} 116 123 As usual, the corner-stone of any feature related to the kernel is the choice of system call. 117 In terms of blocking a \gls{kthrd} until some event occurs the linux kernel has many available options: 118 119 \paragraph{\texttt{pthread\_mutex}/\texttt{pthread\_cond}} 120 The most classic option is to use some combination of \texttt{pthread\_mutex} and \texttt{pthread\_cond}. 121 These serve as straight forward mutual exclusion and synchronization tools and allow a \gls{kthrd} to wait on a \texttt{pthread\_cond} until signalled. 122 While this approach is generally perfectly appropriate for \glspl{kthrd} waiting after eachother, \io operations do not signal \texttt{pthread\_cond}s. 123 For \io results to wake a \proc waiting on a \texttt{pthread\_cond} means that a different \glspl{kthrd} must be woken up first, and then the \proc can be signalled. 124 125 \subsection{\texttt{io\_uring} and Epoll} 126 An alternative is to flip the problem on its head and block waiting for \io, using \texttt{io\_uring} or even \texttt{epoll}. 127 This creates the inverse situation, where \io operations directly wake sleeping \procs but waking \proc from a running \gls{kthrd} must use an indirect scheme. 
128 This generally takes the form of creating a file descriptor, \eg, a dummy file, a pipe or an event fd, and using that file descriptor when \procs need to wake eachother. 129 This leads to additional complexity because there can be a race between these artificial \io operations and genuine \io operations. 130 If not handled correctly, this can lead to the artificial files going out of sync. 124 In terms of blocking a \gls{kthrd} until some event occurs, the Linux kernel has many available options. 125 126 \subsection{\lstinline{pthread_mutex}/\lstinline{pthread_cond}} 127 The classic option is to use some combination of the pthread mutual exclusion and synchronization locks, allowing a safe park/unpark of a \gls{kthrd} to/from a @pthread_cond@. 128 While this approach works for \glspl{kthrd} waiting among themselves, \io operations do not provide a mechanism to signal @pthread_cond@s. 129 For \io results to wake a \proc waiting on a @pthread_cond@ means a different \glspl{kthrd} must be woken up first, which then signals the \proc. 130 131 \subsection{\lstinline{io_uring} and Epoll} 132 An alternative is to flip the problem on its head and block waiting for \io, using @io_uring@ or @epoll@. 133 This creates the inverse situation, where \io operations directly wake sleeping \procs but waking blocked \procs must use an indirect scheme. 134 This generally takes the form of creating a file descriptor, \eg, dummy file, pipe, or event fd, and using that file descriptor when \procs need to wake each other. 135 This leads to additional complexity because there can be a race between these artificial \io and genuine \io operations. 136 If not handled correctly, this can lead to artificial files getting delayed too long behind genuine files, resulting in longer latency. 131 137 132 138 \subsection{Event FDs} 133 139 Another interesting approach is to use an event file descriptor\cit{eventfd}. 134 This is a Linux feature that is a file descriptor that behaves like \io, \ie, uses \texttt{read} and \texttt{write}, but also behaves like a semaphore. 135 Indeed, all read and writes must use 64bits large values\footnote{On 64-bit Linux, a 32-bit Linux would use 32 bits values.}. 136 Writes add their values to the buffer, that is arithmetic addition and not buffer append, and reads zero out the buffer and return the buffer values so far\footnote{This is without the \texttt{EFD\_SEMAPHORE} flag. This flags changes the behavior of \texttt{read} but is not needed for this work.}. 140 This Linux feature is a file descriptor that behaves like \io, \ie, uses @read@ and @write@, but also behaves like a semaphore. 141 Indeed, all reads and writes must use a word-sized values, \ie 64 or 32 bits. 142 Writes \emph{add} their values to a buffer using arithmetic addition versus buffer append, and reads zero out the buffer and return the buffer values so far.\footnote{ 143 This behaviour is without the \lstinline{EFD_SEMAPHORE} flag, which changes the behaviour of \lstinline{read} but is not needed for this work.} 137 144 If a read is made while the buffer is already 0, the read blocks until a non-0 value is added. 138 What makes this feature particularly interesting is that \texttt{io\_uring} supports the \texttt{IORING\_REGISTER\_EVENTFD} command, to register an event fd to a particular instance. 139 Once that instance is registered, any \io completion will result in \texttt{io\_uring} writing to the event FD. 
140 This means that a \proc waiting on the event FD can be \emph{directly} woken up by either other \procs or incomming \io. 145 What makes this feature particularly interesting is that @io_uring@ supports the @IORING_REGISTER_EVENTFD@ command to register an event @fd@ to a particular instance. 146 Once that instance is registered, any \io completion results in @io_uring@ writing to the event @fd@. 147 This means that a \proc waiting on the event @fd@ can be \emph{directly} woken up by either other \procs or incoming \io. 148 149 \section{Tracking Sleepers} 150 Tracking which \procs are in idle sleep requires a data structure holding all the sleeping \procs, but more importantly it requires a concurrent \emph{handshake} so that no \at is stranded on a ready-queue with no active \proc. 151 The classic challenge occurs when a \at is made ready while a \proc is going to sleep: there is a race where the new \at may not see the sleeping \proc and the sleeping \proc may not see the ready \at. 152 Since \ats can be made ready by timers, \io operations, or other events outside a cluster, this race can occur even if the \proc going to sleep is the only \proc awake. 153 As a result, improper handling of this race leads to all \procs going to sleep when there are ready \ats and the system deadlocks. 154 155 The handshake closing the race is done with both the notifier and the idle \proc executing two ordered steps. 156 The notifier first makes sure the newly ready \at is visible to \procs searching for \ats, and then attempts to notify an idle \proc. 157 On the other side, \procs make themselves visible as idle \procs and then search for any \ats they may have missed. 158 Unlike regular work-stealing, this search must be exhaustive to make sure that no pre-existing \at is missed. 159 These steps from both sides guarantee that if the search misses a newly ready \at, then the notifier is guaranteed to see at least one idle \proc. 160 Conversely, if the notifier does not see any idle \proc, then a \proc is guaranteed to find the new \at in its exhaustive search. 161 162 Furthermore, the ``Race-to-Idle'' approach means that there may be contention on the data structure tracking sleepers. 163 Contention can be tolerated for \procs attempting to sleep or wake-up because these \procs are not doing useful work, and therefore, not contributing to overall performance. 164 However, notifying, checking if a \proc must be woken-up, and doing so if needed, can significantly affect overall performance and must be low cost. 165 166 \subsection{Sleepers List} 167 Each cluster maintains a list of idle \procs, organized as a stack. 168 This ordering allows \procs at the tail to stay in idle sleep for extended periods of time while those at the head of the list wake-up for bursts of activity. 169 Because of unbalanced performance requirements, the algorithm tracking sleepers is designed to have idle \procs handle as much of the work as possible. 170 The idle \procs maintain the stack of sleepers among themselves, and notifying a sleeping \proc takes as little work as possible. 171 This approach means that maintaining the list is fairly straightforward. 172 The list can simply use a single lock per cluster and only \procs that are getting in and out of the idle state contend for that lock. 173 174 This approach also simplifies notification. 175 Indeed, \procs not only need to be notified when a new \at is readied, but also must be notified during manual resizing, so the \gls{kthrd} can be joined.
176 These requirements mean whichever entity removes idle \procs from the sleeper list must be able to do so in any order. 177 Using a simple lock over this data structure makes the removal much simpler than using a lock-free data structure. 178 The single lock also means the notification process simply needs to wake-up the desired idle \proc, using @pthread_cond_signal@, @write@ on an @fd@, \etc, and the \proc handles the rest. 179 180 \subsection{Reducing Latency} 181 As mentioned in this section, \procs going to sleep for extremely short periods of time is likely in certain scenarios. 182 Therefore, the latency of doing a system call to read from and writing to an event @fd@ can negatively affect overall performance in a notable way. 183 Hence, it is important to reduce latency and contention of the notification as much as possible. 184 Figure~\ref{fig:idle1} shows the basic idle-sleep data structure. 185 For the notifiers, this data structure can cause contention on the lock and the event @fd@ syscall can cause notable latency. 141 186 142 187 \begin{figure} … … 144 189 \input{idle1.pstex_t} 145 190 \caption[Basic Idle Sleep Data Structure]{Basic Idle Sleep Data Structure \smallskip\newline Each idle \proc is put unto a doubly-linked stack protected by a lock. 146 Each \proc has a private event FD.}191 Each \proc has a private event \lstinline{fd}.} 147 192 \label{fig:idle1} 148 193 \end{figure} 149 194 150 151 \section{Tracking Sleepers} 152 Tracking which \procs are in idle sleep requires a data structure holding all the sleeping \procs, but more importantly it requires a concurrent \emph{handshake} so that no \at is stranded on a ready-queue with no active \proc. 153 The classic challenge is when a \at is made ready while a \proc is going to sleep, there is a race where the new \at may not see the sleeping \proc and the sleeping \proc may not see the ready \at. 154 Since \ats can be made ready by timers, \io operations or other events outside a clusre, this race can occur even if the \proc going to sleep is the only \proc awake. 155 As a result, improper handling of this race can lead to all \procs going to sleep and the system deadlocking. 156 157 Furthermore, the ``Race-to-Idle'' approach means that there may be contention on the data structure tracking sleepers. 158 Contention slowing down \procs attempting to sleep or wake-up can be tolerated. 159 These \procs are not doing useful work and therefore not contributing to overall performance. 160 However, notifying, checking if a \proc must be woken-up and doing so if needed, can significantly affect overall performance and must be low cost. 161 162 \subsection{Sleepers List} 163 Each cluster maintains a list of idle \procs, organized as a stack. 164 This ordering hopefully allows \proc at the tail to stay in idle sleep for extended period of times. 165 Because of these unbalanced performance requirements, the algorithm tracking sleepers is designed to have idle \proc handle as much of the work as possible. 166 The idle \procs maintain the of sleepers among themselves and notifying a sleeping \proc takes as little work as possible. 167 This approach means that maintaining the list is fairly straightforward. 168 The list can simply use a single lock per cluster and only \procs that are getting in and out of idle state will contend for that lock. 169 170 This approach also simplifies notification. 
171 Indeed, \procs need to be notify when a new \at is readied, but they also must be notified during resizing, so the \gls{kthrd} can be joined. 172 This means that whichever entity removes idle \procs from the sleeper list must be able to do so in any order. 173 Using a simple lock over this data structure makes the removal much simpler than using a lock-free data structure. 174 The notification process then simply needs to wake-up the desired idle \proc, using \texttt{pthread\_cond\_signal}, \texttt{write} on an fd, etc., and the \proc will handle the rest. 175 176 \subsection{Reducing Latency} 177 As mentioned in this section, \procs going idle for extremely short periods of time is likely in certain common scenarios. 178 Therefore, the latency of doing a system call to read from and writing to the event fd can actually negatively affect overall performance in a notable way. 179 Is it important to reduce latency and contention of the notification as much as possible. 180 Figure~\ref{fig:idle1} shoes the basic idle sleep data structure. 181 For the notifiers, this data structure can cause contention on the lock and the event fd syscall can cause notable latency. 182 183 \begin{figure} 195 Contention occurs because the idle-list lock must be held to access the idle list, \eg by \procs attempting to go to sleep, \procs waking, or notification attempts. 196 The contention from the \procs attempting to go to sleep can be mitigated slightly by using @try_acquire@, so the \procs simply busy wait again searching for \ats if the lock is held. 197 This trick cannot be used when waking \procs since the waker needs to return immediately to what it was doing. 198 199 Interestingly, general notification, \ie waking any idle processor versus a specific one, does not strictly require modifying the list. 200 Here, contention can be reduced notably by having notifiers avoid the lock entirely by adding a pointer to the event @fd@ of the first idle \proc, as in Figure~\ref{fig:idle2}. 201 To avoid contention among notifiers, notifiers atomically exchange the pointer with @NULL@. 202 The first notifier succeeds on the exchange and obtains the @fd@ of an idle \proc; 203 hence, only one notifier contends on the system call. 204 This notifier writes to the @fd@ to wake a \proc. 205 The woken \proc then updates the atomic pointer, while it is updating the head of the list, as it removes itself from the list. 206 Notifiers that obtained a @NULL@ in the exchange simply move on knowing that another notifier is already waking a \proc. 207 This behaviour is equivalent to having multiple notifier write to the @fd@ since reads consume all previous writes. 208 Note that with and without this atomic pointer, bursts of notification can lead to an unspecified number of \procs being woken up, depending on how the arrival notification compares witht the latency of \procs waking up. 209 As mentioned in section~\ref{idlesleep}, there is no optimal approach to handle these bursts. 210 It is therefore difficult to justify the cost of any extra synchronization here. 
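A minimal sketch of this lock-avoiding notification fast path follows, assuming the idle list caches a pointer to the event @fd@ of the \proc at its head; the structure and helper names are illustrative and error handling is omitted.
\begin{cfa}
#include <stdatomic.h>
#include <stdint.h>
#include <unistd.h>

// hypothetical cluster-level idle list; only the cached fd pointer is shown
struct idle_list {
	_Atomic(int *) head_fd;   // event fd of the first idle proc, or NULL
};

// notifier side: wake one idle proc without taking the idle-list lock
static void notify_one( struct idle_list * idle ) {
	int * fd = atomic_exchange( &idle->head_fd, NULL );  // claim the fd, if any
	if ( fd == NULL ) return;          // another notifier is already waking a proc
	uint64_t one = 1;
	write( *fd, &one, sizeof(one) );   // wake the proc blocked reading its event fd
	// the woken proc republishes the pointer as it removes itself from the list
}
\end{cfa}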
211 212 \begin{figure}[t] 184 213 \centering 185 214 \input{idle2.pstex_t} 186 \caption[Improved Idle Sleep Data Structure]{Improved Idle Sleep Data Structure \smallskip\newline An atomic pointer is added to the list,pointing to the Event FD of the first \proc on the list.}215 \caption[Improved Idle-Sleep Data Structure]{Improved Idle-Sleep Data Structure \smallskip\newline An atomic pointer is added to the list pointing to the Event FD of the first \proc on the list.} 187 216 \label{fig:idle2} 188 217 \end{figure} 189 218 190 The contention is mostly due to the lock on the list needing to be held to get to the head \proc. 191 That lock can be contended by \procs attempting to go to sleep, \procs waking or notification attempts. 192 The contentention from the \procs attempting to go to sleep can be mitigated slightly by using \texttt{try\_acquire} instead, so the \procs simply continue searching for \ats if the lock is held. 193 This trick cannot be used for waking \procs since they are not in a state where they can run \ats. 194 However, it is worth nothing that notification does not strictly require accessing the list or the head \proc. 195 Therefore, contention can be reduced notably by having notifiers avoid the lock entirely and adding a pointer to the event fd of the first idle \proc, as in Figure~\ref{fig:idle2}. 196 To avoid contention between the notifiers, instead of simply reading the atomic pointer, notifiers atomically exchange it to \texttt{null} so only only notifier will contend on the system call. 219 The next optimization is to avoid the latency of the event @fd@, which can be done by adding what is effectively a binary benaphore\cit{benaphore} in front of the event @fd@. 220 The benaphore over the event @fd@ logically provides a three state flag to avoid unnecessary system calls, where the states are expressed explicit in Figure~\ref{fig:idle:state}. 221 A \proc begins its idle sleep by adding itself to the idle list before searching for an \at. 222 In the process of adding itself to the idle list, it sets the state flag to @SEARCH@. 223 If no \ats can be found during the search, the \proc then confirms it is going to sleep by atomically swapping the state to @SLEEP@. 224 If the previous state is still @SEARCH@, then the \proc does read the event @fd@. 225 Meanwhile, notifiers atomically exchange the state to @AWAKE@ state. 226 If the previous state is @SLEEP@, then the notifier must write to the event @fd@. 227 However, if the notify arrives almost immediately after the \proc marks itself idle, then both reads and writes on the event @fd@ can be omitted, which reduces latency notably. 228 These extensions leads to the final data structure shown in Figure~\ref{fig:idle}. 
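The following is a minimal sketch of this three-state flag in front of the event @fd@; the state names mirror the description above, while the structure and helpers are illustrative rather than the actual runtime code.
\begin{cfa}
#include <stdatomic.h>
#include <stdint.h>
#include <unistd.h>

enum { SEARCH, SLEEP, AWAKE };   // benaphore states guarding the event fd

struct idle_proc {
	atomic_int state;
	int event_fd;
};

// proc side: state was set to SEARCH when the proc added itself to the idle list;
// called after an exhaustive search finds no ready ats
static void confirm_sleep( struct idle_proc * p ) {
	if ( atomic_exchange( &p->state, SLEEP ) == SEARCH ) {  // no notification arrived so far
		uint64_t buf;
		read( p->event_fd, &buf, sizeof(buf) );  // block until a notifier writes
	}
	// otherwise a notifier already raced in: skip the read entirely
}

// notifier side: wake this proc, avoiding the syscall if it has not yet slept
static void notify( struct idle_proc * p ) {
	if ( atomic_exchange( &p->state, AWAKE ) == SLEEP ) {   // proc is blocked on the fd
		uint64_t one = 1;
		write( p->event_fd, &one, sizeof(one) );
	}
	// otherwise the proc sees AWAKE before confirming its sleep and skips its read
}
\end{cfa}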
197 229 198 230 \begin{figure} 199 231 \centering 200 232 \input{idle_state.pstex_t} 201 \caption[Improved Idle Sleep Data Structure]{Improved Idle Sleep Data Structure \smallskip\newline An atomic pointer is added to the list, pointing to the Event FD of the first \proc on the list.}233 \caption[Improved Idle-Sleep Latency]{Improved Idle-Sleep Latency \smallskip\newline A three state flag is added to the event \lstinline{fd}.} 202 234 \label{fig:idle:state} 203 235 \end{figure} 204 205 The next optimization that can be done is to avoid the latency of the event fd when possible.206 This can be done by adding what is effectively a benaphore\cit{benaphore} in front of the event fd.207 A simple three state flag is added beside the event fd to avoid unnecessary system calls, as shown in Figure~\ref{fig:idle:state}.208 The flag starts in state \texttt{SEARCH}, while the \proc is searching for \ats to run.209 The \proc then confirms the sleep by atomically swaping the state to \texttt{SLEEP}.210 If the previous state was still \texttt{SEARCH}, then the \proc does read the event fd.211 Meanwhile, notifiers atomically exchange the state to \texttt{AWAKE} state.212 if the previous state was \texttt{SLEEP}, then the notifier must write to the event fd.213 However, if the notify arrives almost immediately after the \proc marks itself idle, then both reads and writes on the event fd can be omitted, which reduces latency notably.214 This leads to the final data structure shown in Figure~\ref{fig:idle}.215 236 216 237 \begin{figure} … … 218 239 \input{idle.pstex_t} 219 240 \caption[Low-latency Idle Sleep Data Structure]{Low-latency Idle Sleep Data Structure \smallskip\newline Each idle \proc is put unto a doubly-linked stack protected by a lock. 220 Each \proc has a private event FDwith a benaphore in front of it.221 The list also has an atomic pointer to the event fdand benaphore of the first \proc on the list.}241 Each \proc has a private event \lstinline{fd} with a benaphore in front of it. 242 The list also has an atomic pointer to the event \lstinline{fd} and benaphore of the first \proc on the list.} 222 243 \label{fig:idle} 223 244 \end{figure} -
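For reference, a minimal sketch of the event @fd@ mechanics used throughout this chapter: creating the @fd@, registering it with an @io_uring@ instance so \io completions also produce wake-ups, and the blocking read and waking write. Error handling is omitted and the helper names are illustrative.
\begin{cfa}
#include <stdint.h>
#include <unistd.h>
#include <sys/eventfd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

// assumes ring_fd is an existing io_uring instance created with io_uring_setup
static int register_wakeup_fd( int ring_fd ) {
	int efd = eventfd( 0, 0 );   // semaphore-like file descriptor, initial count 0
	// after this call, every completion on ring_fd also writes to efd
	syscall( __NR_io_uring_register, ring_fd, IORING_REGISTER_EVENTFD, &efd, 1 );
	return efd;
}

// idle proc: block until a notifier or an io completion writes to efd
static void wait_for_wakeup( int efd ) {
	uint64_t buf;
	read( efd, &buf, sizeof(buf) );    // zeroes the counter, consuming all pending writes
}

// notifier: wake the proc blocked on efd
static void wake( int efd ) {
	uint64_t one = 1;
	write( efd, &one, sizeof(one) );   // adds 1 to the eventfd counter
}
\end{cfa}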
doc/theses/thierry_delisle_PhD/thesis/thesis.tex
r4e2befe3 rdef751f 108 108 citecolor=OliveGreen, % color of links to bibliography 109 109 filecolor=magenta, % color of file links 110 urlcolor=cyan % color of external links 110 urlcolor=blue, % color of external links 111 breaklinks=true 111 112 } 112 113 \ifthenelse{\boolean{PrintVersion}}{ % for improved print quality, change some hyperref options -
libcfa/Makefile.am
r4e2befe3 rdef751f 18 18 ACLOCAL_AMFLAGS = -I automake 19 19 SUBDIRS = prelude src # order important 20 21 DISTCLEANFILES = config.data -
libcfa/src/Makefile.am
r4e2befe3 rdef751f 216 216 nobase_cfa_include_HEADERS = ${stdhdr} ${inst_headers_src} ${inst_headers_nosrc} ${inst_thread_headers_src} ${inst_thread_headers_nosrc} 217 217 EXTRA_DIST = stdhdr 218 DISTCLEANFILES = $(libdeps) $(thread_libdeps) 218 219 219 220 #---------------------------------------------------------------------------------------------------------------- … … 221 222 -rm -rf ${CFA_INCDIR} ${CFA_LIBDIR} 222 223 223 distclean-local:224 find ${builddir} -path '*.Plo' -delete224 #distclean-local: 225 # find ${builddir} -path '*.Plo' -delete 225 226 226 227 -
src/AST/Expr.cpp
r4e2befe3 rdef751f 272 272 // Adjust the length of the string for the terminator. 273 273 const Expr * strSize = from_ulong( loc, str.size() + 1 ); 274 const Type * strType = new ArrayType( charType, strSize, FixedLen, StaticDim );274 const Type * strType = new ArrayType( charType, strSize, FixedLen, DynamicDim ); 275 275 const std::string strValue = "\"" + str + "\""; 276 276 return new ConstantExpr( loc, strType, strValue, std::nullopt ); -
src/AST/Pass.hpp
r4e2befe3 rdef751f 264 264 __pass::result1<ast::Stmt> call_accept_as_compound(const ast::Stmt *); 265 265 266 // requests type environment to be updated (why is it implemented like this?) 267 __pass::result1<ast::Expr> call_accept_top(const ast::Expr *); 268 266 269 template< template <class...> class container_t > 267 270 __pass::resultNstmt<container_t> call_accept( const container_t< ptr<Stmt> > & ); … … 277 280 template<typename node_t, typename parent_t, typename field_t> 278 281 void maybe_accept_as_compound(const node_t * &, field_t parent_t::* field); 282 283 template<typename node_t, typename parent_t, typename field_t> 284 void maybe_accept_top(const node_t * &, field_t parent_t::* field); 279 285 280 286 private: -
src/AST/Pass.impl.hpp
r4e2befe3 rdef751f 155 155 __pedantic_pass_assert( expr ); 156 156 157 const ast::TypeSubstitution ** typeSubs_ptr = __pass::typeSubs( core, 0 );158 if ( typeSubs_ptr && expr->env ) {159 *typeSubs_ptr = expr->env;160 }161 162 157 auto nval = expr->accept( *this ); 163 158 return { nval != expr, nval }; … … 171 166 const ast::Stmt * nval = stmt->accept( *this ); 172 167 return { nval != stmt, nval }; 168 } 169 170 template< typename core_t > 171 __pass::template result1<ast::Expr> ast::Pass< core_t >::call_accept_top( const ast::Expr * expr ) { 172 __pedantic_pass_assert( __visit_children() ); 173 __pedantic_pass_assert( expr ); 174 175 const ast::TypeSubstitution ** typeSubs_ptr = __pass::typeSubs( core, 0 ); 176 if ( typeSubs_ptr && expr->env ) { 177 *typeSubs_ptr = expr->env; 178 } 179 180 auto nval = expr->accept( *this ); 181 return { nval != expr, nval }; 173 182 } 174 183 … … 410 419 411 420 auto new_val = call_accept( old_val ); 421 422 static_assert( !std::is_same<const ast::Node *, decltype(new_val)>::value /* || std::is_same<int, decltype(old_val)>::value */, "ERROR"); 423 424 if( new_val.differs ) { 425 auto new_parent = __pass::mutate<core_t>(parent); 426 new_val.apply(new_parent, field); 427 parent = new_parent; 428 } 429 } 430 431 template< typename core_t > 432 template<typename node_t, typename super_t, typename field_t> 433 void ast::Pass< core_t >::maybe_accept_top( 434 const node_t * & parent, 435 field_t super_t::*field 436 ) { 437 static_assert( std::is_base_of<super_t, node_t>::value, "Error deducing member object" ); 438 439 if(__pass::skip(parent->*field)) return; 440 const auto & old_val = __pass::get(parent->*field, 0); 441 442 static_assert( !std::is_same<const ast::Node * &, decltype(old_val)>::value, "ERROR"); 443 444 auto new_val = call_accept_top( old_val ); 412 445 413 446 static_assert( !std::is_same<const ast::Node *, decltype(new_val)>::value /* || std::is_same<int, decltype(old_val)>::value */, "ERROR"); … … 756 789 757 790 if ( __visit_children() ) { 758 maybe_accept ( node, &StaticAssertDecl::cond );791 maybe_accept_top( node, &StaticAssertDecl::cond ); 759 792 maybe_accept( node, &StaticAssertDecl::msg ); 760 793 } … … 798 831 799 832 if ( __visit_children() ) { 800 maybe_accept ( node, &ExprStmt::expr );833 maybe_accept_top( node, &ExprStmt::expr ); 801 834 } 802 835 … … 839 872 guard_symtab guard { *this }; 840 873 maybe_accept( node, &IfStmt::inits ); 841 maybe_accept ( node, &IfStmt::cond );874 maybe_accept_top( node, &IfStmt::cond ); 842 875 maybe_accept_as_compound( node, &IfStmt::then ); 843 876 maybe_accept_as_compound( node, &IfStmt::else_ ); … … 857 890 guard_symtab guard { *this }; 858 891 maybe_accept( node, &WhileDoStmt::inits ); 859 maybe_accept ( node, &WhileDoStmt::cond );892 maybe_accept_top( node, &WhileDoStmt::cond ); 860 893 maybe_accept_as_compound( node, &WhileDoStmt::body ); 861 894 } … … 875 908 // xxx - old ast does not create WithStmtsToAdd scope for loop inits. should revisit this later. 
876 909 maybe_accept( node, &ForStmt::inits ); 877 maybe_accept ( node, &ForStmt::cond );878 maybe_accept ( node, &ForStmt::inc );910 maybe_accept_top( node, &ForStmt::cond ); 911 maybe_accept_top( node, &ForStmt::inc ); 879 912 maybe_accept_as_compound( node, &ForStmt::body ); 880 913 } … … 890 923 891 924 if ( __visit_children() ) { 892 maybe_accept ( node, &SwitchStmt::cond );925 maybe_accept_top( node, &SwitchStmt::cond ); 893 926 maybe_accept( node, &SwitchStmt::cases ); 894 927 } … … 904 937 905 938 if ( __visit_children() ) { 906 maybe_accept ( node, &CaseClause::cond );939 maybe_accept_top( node, &CaseClause::cond ); 907 940 maybe_accept( node, &CaseClause::stmts ); 908 941 } … … 926 959 927 960 if ( __visit_children() ) { 928 maybe_accept ( node, &ReturnStmt::expr );961 maybe_accept_top( node, &ReturnStmt::expr ); 929 962 } 930 963 … … 971 1004 guard_symtab guard { *this }; 972 1005 maybe_accept( node, &CatchClause::decl ); 973 maybe_accept ( node, &CatchClause::cond );1006 maybe_accept_top( node, &CatchClause::cond ); 974 1007 maybe_accept_as_compound( node, &CatchClause::body ); 975 1008 } … … 2058 2091 2059 2092 if ( __visit_children() ) { 2060 maybe_accept ( node, &SingleInit::value );2093 maybe_accept_top( node, &SingleInit::value ); 2061 2094 } 2062 2095 -
src/AST/SymbolTable.cpp
r4e2befe3 rdef751f 65 65 66 66 Expr * SymbolTable::IdData::combine( const CodeLocation & loc, ResolvExpr::Cost & cost ) const { 67 Expr * ret = ( baseExpr ) ? 68 (Expr *)new MemberExpr{ loc, id, referenceToRvalueConversion( baseExpr, cost ) } : 69 (Expr *)new VariableExpr{ loc, id }; 67 Expr * ret; 68 if ( baseExpr ) { 69 if (baseExpr->env) { 70 Expr * base = shallowCopy(baseExpr); 71 const TypeSubstitution * subs = baseExpr->env; 72 base->env = nullptr; 73 ret = new MemberExpr{loc, id, referenceToRvalueConversion( base, cost )}; 74 ret->env = subs; 75 } 76 else { 77 ret = new MemberExpr{ loc, id, referenceToRvalueConversion( baseExpr, cost ) }; 78 } 79 } 80 else { 81 ret = new VariableExpr{ loc, id }; 82 } 70 83 if ( deleter ) { ret = new DeletedExpr{ loc, ret, deleter }; } 71 84 return ret; … … 772 785 && ! dynamic_cast<const UnionInstType *>(rty) ) continue; 773 786 ResolvExpr::Cost cost = ResolvExpr::Cost::zero; 787 ast::ptr<ast::TypeSubstitution> tmp = expr->env; 788 expr = mutate_field(expr, &Expr::env, nullptr); 774 789 const Expr * base = ResolvExpr::referenceToRvalueConversion( expr, cost ); 790 base = mutate_field(base, &Expr::env, tmp); 791 775 792 addMembers( 776 793 rty->aggr(), new MemberExpr{ base->location, dwt, base }, handleConflicts ); -
src/AST/TypeSubstitution.cpp
r4e2befe3 rdef751f 97 97 TypeSubstitution * newEnv; 98 98 EnvTrimmer( const TypeSubstitution * env, TypeSubstitution * newEnv ) : env( env ), newEnv( newEnv ){} 99 void previsit( FunctionType * ftype ) {99 void previsit( const FunctionType * ftype ) { 100 100 // transfer known bindings for seen type variables 101 101 for (auto & formal : ftype->forall) { -
src/CodeGen/FixNames.cc
r4e2befe3 rdef751f 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // FixNames.cc -- 7 // FixNames.cc -- Adjustments to typed declarations. 8 8 // 9 9 // Author : Richard C. Bilson 10 10 // Created On : Mon May 18 07:44:20 2015 11 11 // Last Modified By : Andrew Beach 12 // Last Modified On : Fri Oct 29 15:49:00 202113 // Update Count : 2 312 // Last Modified On : Wed Jul 20 11:49:00 2022 13 // Update Count : 24 14 14 // 15 15 … … 87 87 88 88 /// Does work with the main function and scopeLevels. 89 class FixNames_new : public ast::WithGuards{89 class FixNames_new final { 90 90 int scopeLevel = 1; 91 91 … … 103 103 104 104 const ast::FunctionDecl *postvisit( const ast::FunctionDecl *functionDecl ) { 105 // This store is used to ensure a maximum of one call to mutate.106 ast::FunctionDecl * mutDecl = nullptr;105 if ( FixMain::isMain( functionDecl ) ) { 106 auto mutDecl = ast::mutate( functionDecl ); 107 107 108 if ( shouldSetScopeLevel( functionDecl ) ) { 109 mutDecl = ast::mutate( functionDecl ); 110 mutDecl->scopeLevel = scopeLevel; 111 } 112 113 if ( FixMain::isMain( functionDecl ) ) { 114 if ( !mutDecl ) { mutDecl = ast::mutate( functionDecl ); } 108 if ( shouldSetScopeLevel( mutDecl ) ) { 109 mutDecl->scopeLevel = scopeLevel; 110 } 115 111 116 112 int nargs = mutDecl->params.size(); … … 124 120 ) 125 121 ); 122 123 return mutDecl; 124 } else if ( shouldSetScopeLevel( functionDecl ) ) { 125 return ast::mutate_field( functionDecl, &ast::FunctionDecl::scopeLevel, scopeLevel ); 126 } else { 127 return functionDecl; 126 128 } 127 return mutDecl ? mutDecl : functionDecl;128 129 } 129 130 130 131 void previsit( const ast::CompoundStmt * ) { 131 GuardValue( scopeLevel ) += 1; 132 scopeLevel += 1; 133 } 134 135 void postvisit( const ast::CompoundStmt * ) { 136 scopeLevel -= 1; 132 137 } 133 138 }; -
src/CodeGen/FixNames.h
r4e2befe3 rdef751f 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // FixNames.h -- 7 // FixNames.h -- Adjustments to typed declarations. 8 8 // 9 9 // Author : Richard C. Bilson … … 26 26 /// mangles object and function names 27 27 void fixNames( std::list< Declaration* > & translationUnit ); 28 void fixNames( ast::TranslationUnit & translationUnit ); 28 /// Sets scope levels and fills in main's default return. 29 void fixNames( ast::TranslationUnit & translationUnit ); 29 30 } // namespace CodeGen 30 31 -
src/Concurrency/Keywords.h
r4e2befe3 rdef751f 28 28 void implementThreadStarter( std::list< Declaration * > & translationUnit ); 29 29 30 /// Implement the sue-like keywords and the suspend keyword. 30 /// Implement the sue-like keywords and the suspend keyword. Pre-Autogen 31 31 void implementKeywords( ast::TranslationUnit & translationUnit ); 32 /// Implement the mutex parameters and mutex statement. 32 /// Implement the mutex parameters and mutex statement. Post-Autogen 33 33 void implementMutex( ast::TranslationUnit & translationUnit ); 34 /// Add the thread starter code to constructors. 34 /// Add the thread starter code to constructors. Post-Autogen 35 35 void implementThreadStarter( ast::TranslationUnit & translationUnit ); 36 36 }; -
src/ControlStruct/ExceptDecl.cc
r4e2befe3 rdef751f 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // ExceptDecl.cc -- 7 // ExceptDecl.cc -- Handles declarations of exception types. 8 8 // 9 9 // Author : Henry Xue -
src/ControlStruct/ExceptDecl.h
r4e2befe3 rdef751f 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // ExceptDecl.h -- 7 // ExceptDecl.h -- Handles declarations of exception types. 8 8 // 9 9 // Author : Henry Xue 10 10 // Created On : Tue Jul 20 04:10:50 2021 11 // Last Modified By : Henry Xue12 // Last Modified On : Tue Jul 20 04:10:50 202113 // Update Count : 111 // Last Modified By : Andrew Beach 12 // Last Modified On : Tue Jul 12 15:49:00 2022 13 // Update Count : 2 14 14 // 15 15 … … 20 20 class Declaration; 21 21 22 namespace ast { 23 class TranslationUnit; 24 } 25 22 26 namespace ControlStruct { 23 void translateExcept( std::list< Declaration *> & translationUnit ); 27 /// Unfold exception declarations into raw structure declarations. 28 /// Also builds vtable declarations and converts vtable types. 29 void translateExcept( std::list< Declaration *> & translationUnit ); 30 void translateExcept( ast::TranslationUnit & translationUnit ); 24 31 } -
src/ControlStruct/HoistControlDecls.hpp
r4e2befe3 rdef751f 21 21 22 22 namespace ControlStruct { 23 // Hoist declarations out of control flow statements into compound statement. 23 /// Hoist declarations out of control flow statements into compound statement. 24 /// Must happen before auto-gen routines are added. 24 25 void hoistControlDecls( ast::TranslationUnit & translationUnit ); 25 26 } // namespace ControlStruct -
src/ControlStruct/MultiLevelExit.cpp
r4e2befe3 rdef751f 149 149 }; 150 150 151 NullStmt * labelledNullStmt( 152 const CodeLocation & cl, const Label & label ) { 151 NullStmt * labelledNullStmt( const CodeLocation & cl, const Label & label ) { 153 152 return new NullStmt( cl, vector<Label>{ label } ); 154 153 } … … 164 163 165 164 const CompoundStmt * MultiLevelExitCore::previsit( 166 const CompoundStmt * stmt ) {165 const CompoundStmt * stmt ) { 167 166 visit_children = false; 168 167 … … 189 188 } 190 189 191 size_t getUnusedIndex( 192 const Stmt * stmt, const Label & originalTarget ) { 190 size_t getUnusedIndex( const Stmt * stmt, const Label & originalTarget ) { 193 191 const size_t size = stmt->labels.size(); 194 192 … … 210 208 } 211 209 212 const Stmt * addUnused( 213 const Stmt * stmt, const Label & originalTarget ) { 210 const Stmt * addUnused( const Stmt * stmt, const Label & originalTarget ) { 214 211 size_t i = getUnusedIndex( stmt, originalTarget ); 215 212 if ( i == stmt->labels.size() ) { … … 356 353 357 354 // Mimic what the built-in push_front would do anyways. It is O(n). 358 void push_front( 359 vector<ptr<Stmt>> & vec, const Stmt * element ) { 355 void push_front( vector<ptr<Stmt>> & vec, const Stmt * element ) { 360 356 vec.emplace_back( nullptr ); 361 357 for ( size_t i = vec.size() - 1 ; 0 < i ; --i ) { … … 590 586 591 587 ptr<Stmt> else_stmt = nullptr; 592 Stmt * loop_kid = nullptr;588 const Stmt * loop_kid = nullptr; 593 589 // check if loop node and if so add else clause if it exists 594 const WhileDoStmt * whilePtr = dynamic_cast<const WhileDoStmt *>(kid.get());595 if ( whilePtr && whilePtr->else_ ) {590 const WhileDoStmt * whilePtr = kid.as<WhileDoStmt>(); 591 if ( whilePtr && whilePtr->else_ ) { 596 592 else_stmt = whilePtr->else_; 597 WhileDoStmt * mutate_ptr = mutate(whilePtr); 598 mutate_ptr->else_ = nullptr; 599 loop_kid = mutate_ptr; 600 } 601 const ForStmt * forPtr = dynamic_cast<const ForStmt *>(kid.get()); 602 if ( forPtr && forPtr->else_) { 593 loop_kid = mutate_field( whilePtr, &WhileDoStmt::else_, nullptr ); 594 } 595 const ForStmt * forPtr = kid.as<ForStmt>(); 596 if ( forPtr && forPtr->else_ ) { 603 597 else_stmt = forPtr->else_; 604 ForStmt * mutate_ptr = mutate(forPtr); 605 mutate_ptr->else_ = nullptr; 606 loop_kid = mutate_ptr; 598 loop_kid = mutate_field( forPtr, &ForStmt::else_, nullptr ); 607 599 } 608 600 -
src/ControlStruct/module.mk
r4e2befe3 rdef751f 17 17 SRC += \ 18 18 ControlStruct/ExceptDecl.cc \ 19 ControlStruct/ExceptDeclNew.cpp \ 19 20 ControlStruct/ExceptDecl.h \ 20 21 ControlStruct/ExceptTranslateNew.cpp \ -
src/GenPoly/Box.cc
r4e2befe3 rdef751f 189 189 /// Enters a new scope for type-variables, adding the type variables from ty 190 190 void beginTypeScope( Type *ty ); 191 /// Exits the type-variable scope192 void endTypeScope();193 191 /// Enters a new scope for knowLayouts and knownOffsets and queues exit calls 194 192 void beginGenericScope(); … … 198 196 UniqueName bufNamer; ///< Namer for VLA buffers 199 197 Expression * addrMember = nullptr; ///< AddressExpr argument is MemberExpr? 198 bool expect_func_type = false; ///< used to avoid recursing too deep in type decls 200 199 }; 201 200 … … 1419 1418 void PolyGenericCalculator::beginGenericScope() { 1420 1419 GuardScope( *this ); 1420 // We expect the first function type see to be the type relating to this scope 1421 // but any further type is probably some unrelated function pointer 1422 // keep track of which is the first 1423 GuardValue( expect_func_type ); 1424 expect_func_type = true; 1421 1425 } 1422 1426 … … 1468 1472 void PolyGenericCalculator::premutate( FunctionType *funcType ) { 1469 1473 beginTypeScope( funcType ); 1474 1475 GuardValue( expect_func_type ); 1476 1477 if(!expect_func_type) { 1478 GuardAction( [this]() { 1479 knownLayouts.endScope(); 1480 knownOffsets.endScope(); 1481 }); 1482 // If this is the first function type we see 1483 // Then it's the type of the declaration and we care about it 1484 knownLayouts.beginScope(); 1485 knownOffsets.beginScope(); 1486 } 1487 1488 // The other functions type we will see in this scope are probably functions parameters 1489 // they don't help us with the layout and offsets so don't mark them as known in this scope 1490 expect_func_type = false; 1470 1491 1471 1492 // make sure that any type information passed into the function is accounted for … … 1746 1767 } 1747 1768 1769 // std::cout << "TRUE 2" << std::endl; 1770 1748 1771 return true; 1749 1772 } else if ( UnionInstType *unionTy = dynamic_cast< UnionInstType* >( ty ) ) { -
src/GenPoly/Specialize.h
r4e2befe3 rdef751f 17 17 18 18 #include <list> // for list 19 #include "AST/TranslationUnit.hpp" 19 20 20 21 class Declaration; … … 23 24 /// generates thunks where needed 24 25 void convertSpecializations( std::list< Declaration* >& translationUnit ); 26 27 void convertSpecializations( ast::TranslationUnit & translationUnit ); 25 28 } // namespace GenPoly 26 29 -
src/GenPoly/module.mk
r4e2befe3 rdef751f 34 34 GenPoly/ScrubTyVars.h \ 35 35 GenPoly/Specialize.cc \ 36 GenPoly/SpecializeNew.cpp \ 36 37 GenPoly/Specialize.h 37 38 -
src/InitTweak/FixInitNew.cpp
r4e2befe3 rdef751f 73 73 /// wrap function application expressions as ImplicitCopyCtorExpr nodes so that it is easy to identify which 74 74 /// function calls need their parameters to be copy constructed 75 struct InsertImplicitCalls : public ast::With ConstTypeSubstitution, public ast::WithShortCircuiting {75 struct InsertImplicitCalls : public ast::WithShortCircuiting { 76 76 const ast::Expr * postvisit( const ast::ApplicationExpr * appExpr ); 77 77 … … 457 457 // is needed to obtain the type of temporary variables so that copy 458 458 // constructor calls can be resolved. 459 assert( typeSubs );460 459 expr->env = tmp; 461 460 return expr; -
src/InitTweak/GenInit.cc
r4e2befe3 rdef751f 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // GenInit.cc -- 7 // GenInit.cc -- Generate initializers, and other stuff. 8 8 // 9 9 // Author : Rob Schluntz -
src/InitTweak/GenInit.h
r4e2befe3 rdef751f 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // GenInit.h -- 7 // GenInit.h -- Generate initializers, and other stuff. 8 8 // 9 9 // Author : Rodolfo G. Esteves … … 29 29 void genInit( ast::TranslationUnit & translationUnit ); 30 30 31 /// Converts return statements into copy constructor calls on the hidden return variable 31 /// Converts return statements into copy constructor calls on the hidden return variable. 32 /// This pass must happen before auto-gen. 32 33 void fixReturnStatements( std::list< Declaration * > & translationUnit ); 33 34 void fixReturnStatements( ast::TranslationUnit & translationUnit ); -
src/ResolvExpr/CandidateFinder.cpp
r4e2befe3 rdef751f 1263 1263 newExpr, copy( tenv ), ast::OpenVarSet{}, ast::AssertionSet{}, Cost::zero, 1264 1264 cost ); 1265 1266 if (newCand->expr->env) { 1267 newCand->env.add(*newCand->expr->env); 1268 auto mutExpr = newCand->expr.get_and_mutate(); 1269 mutExpr->env = nullptr; 1270 newCand->expr = mutExpr; 1271 } 1272 1265 1273 PRINT( 1266 1274 std::cerr << "decl is "; -
src/ResolvExpr/Resolver.cc
r4e2befe3 rdef751f 1555 1555 if ( type->dimension ) { 1556 1556 ast::ptr< ast::Type > sizeType = context.global.sizeType; 1557 ast::ptr< ast::Expr > dimension = findSingleExpression( type->dimension, sizeType, context ); 1558 assertf(dimension->env->empty(), "array dimension expr has nonempty env"); 1559 dimension.get_and_mutate()->env = nullptr; 1557 1560 ast::mutate_field( 1558 1561 type, &PtrType::dimension, 1559 findSingleExpression( type->dimension, sizeType, context ));1562 dimension); 1560 1563 } 1561 1564 return type; … … 2008 2011 tmp->accept( *visitor ); 2009 2012 } 2013 else if (expr->env && expr->env->empty()) { 2014 expr = ast::mutate_field(expr.get(), &ast::Expr::env, nullptr); 2015 } 2010 2016 } 2011 2017 } -
src/Tuples/Tuples.cc
r4e2befe3 rdef751f 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // Tuples. h --7 // Tuples.cc -- A collection of tuple operations. 8 8 // 9 9 // Author : Andrew Beach -
src/Tuples/Tuples.h
r4e2befe3 rdef751f 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // Tuples.h -- 7 // Tuples.h -- A collection of tuple operations. 8 8 // 9 9 // Author : Rodolfo G. Esteves -
src/Validate/Autogen.hpp
r4e2befe3 rdef751f 22 22 namespace Validate { 23 23 24 /// Generate routines for all data types in the translation unit. 25 /// A lot of passes have to happen either before or after this pass. 24 26 void autogenerateRoutines( ast::TranslationUnit & translationUnit ); 25 27 -
src/Validate/CompoundLiteral.hpp
r4e2befe3 rdef751f 23 23 24 24 /// Use variables to implement compound literals. 25 /// Must happen after auto-gen routines are added. 25 26 void handleCompoundLiterals( ast::TranslationUnit & translationUnit ); 26 27 -
src/Validate/EnumAndPointerDecay.cpp
r4e2befe3 rdef751f 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // EnumAndPointerDecay.cpp -- 7 // EnumAndPointerDecay.cpp -- Normalizes enumerations and types in functions. 8 8 // 9 9 // Author : Andrew Beach -
src/Validate/EnumAndPointerDecay.hpp
r4e2befe3 rdef751f 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // EnumAndPointerDecay.hpp -- 7 // EnumAndPointerDecay.hpp -- Normalizes enumerations and types in functions. 8 8 // 9 9 // Author : Andrew Beach … … 22 22 namespace Validate { 23 23 24 /// Fix the parameter and return types of functions. Also assigns types to 25 /// enumeration values. This must happen before Link Reference to Types, 26 /// it needs correct types for mangling, and before auto-gen. 24 27 void decayEnumsAndPointers( ast::TranslationUnit & translationUnit ); 25 28 -
src/Validate/FindSpecialDecls.h
r4e2befe3 rdef751f 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // FindSpecialDeclarations.h -- 7 // FindSpecialDeclarations.h -- Find special declarations used in the compiler. 8 8 // 9 9 // Author : Rob Schluntz … … 43 43 void findSpecialDecls( std::list< Declaration * > & translationUnit ); 44 44 45 /// find and remember some of the special declarations that are useful for45 /// Find and remember some of the special declarations that are useful for 46 46 /// generating code, so that they do not have to be discovered multiple times. 47 47 void findGlobalDecls( ast::TranslationUnit & translationUnit ); -
src/Validate/FixQualifiedTypes.cpp
r4e2befe3 rdef751f 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // FixQualifiedTypes.cpp -- 7 // FixQualifiedTypes.cpp -- Replace the qualified type with a direct type. 8 8 // 9 9 // Author : Andrew Beach … … 76 76 ret->qualifiers = type->qualifiers; 77 77 ast::TypeSubstitution sub( aggr->params, instp->params ); 78 // = parent->genericSubstitution();79 78 auto result = sub.apply(ret); 80 79 return result.node.release(); -
src/Validate/FixQualifiedTypes.hpp
r4e2befe3 rdef751f 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // FixQualifiedTypes.hpp -- 7 // FixQualifiedTypes.hpp -- Replace the qualified type with a direct type. 8 8 // 9 9 // Author : Andrew Beach … … 22 22 namespace Validate { 23 23 24 /// Replaces qualified types with an unqualified NamedTypeDecl. 25 /// Must happen after Link References To Types, 26 /// because aggregate members are accessed. 24 27 void fixQualifiedTypes( ast::TranslationUnit & translationUnit ); 25 28 -
src/Validate/FixReturnTypes.cpp
r4e2befe3 rdef751f 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // FixReturnTypes.cpp -- 7 // FixReturnTypes.cpp -- Unifies the representation of return types. 8 8 // 9 9 // Author : Andrew Beach -
src/Validate/FixReturnTypes.hpp
r4e2befe3 rdef751f 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // FixReturnTypes.hpp -- 7 // FixReturnTypes.hpp -- Unifies the representation of return types. 8 8 // 9 9 // Author : Andrew Beach … … 22 22 namespace Validate { 23 23 24 // This pass needs to happen early so that other passes can find tuple types 25 // in the right places, especially for function return types. 24 /// This pass needs to happen early so that other passes can find tuple types 25 /// in the right places, especially for function return types. 26 /// Must happen before auto-gen. 26 27 void fixReturnTypes( ast::TranslationUnit & translationUnit ); 27 28 -
src/Validate/ForallPointerDecay.hpp
r4e2befe3 rdef751f 29 29 /// Also checks that operator names are used properly on functions and 30 30 /// assigns unique IDs. This is a "legacy" pass. 31 /// Must be after implement concurrent keywords; because uniqueIds must be 32 /// set on declaration before resolution. 33 /// Must happen before auto-gen routines are added. 31 34 void decayForallPointers( ast::TranslationUnit & transUnit ); 32 35 -
src/Validate/GenericParameter.cpp
r4e2befe3 rdef751f 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // GenericParameter.hpp -- 7 // GenericParameter.hpp -- Generic parameter related passes. 8 8 // 9 9 // Author : Andrew Beach -
src/Validate/GenericParameter.hpp
r4e2befe3 rdef751f 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // GenericParameter.hpp -- 7 // GenericParameter.hpp -- Generic parameter related passes. 8 8 // 9 9 // Author : Andrew Beach … … 23 23 24 24 /// Perform substutions for generic parameters and fill in defaults. 25 /// Check as early as possible, but it can't happen before Link References to 26 /// Types and observed failing when attempted before eliminate typedef. 25 27 void fillGenericParameters( ast::TranslationUnit & translationUnit ); 26 28 -
src/Validate/HoistStruct.hpp
r4e2befe3 rdef751f 22 22 namespace Validate { 23 23 24 /// Flattens nested type declarations. 24 /// Flattens nested type declarations. (Run right after Fix Qualified Types.) 25 25 void hoistStruct( ast::TranslationUnit & translationUnit ); 26 26 -
src/Validate/HoistTypeDecls.cpp
r4e2befe3 rdef751f 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // HoistTypeDecls.cpp -- 7 // HoistTypeDecls.cpp -- Hoists declarations of implicitly declared types. 8 8 // 9 9 // Author : Andrew Beach -
src/Validate/HoistTypeDecls.hpp
r4e2befe3 rdef751f 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // HoistTypeDecls.hpp -- 7 // HoistTypeDecls.hpp -- Hoists declarations of implicitly declared types. 8 8 // 9 9 // Author : Andrew Beach … … 22 22 namespace Validate { 23 23 24 /// There are some places where a type can be declared but are usually only 25 /// referenced (with an *InstType). This inserts the declarations before 26 /// they are referenced. 24 27 void hoistTypeDecls( ast::TranslationUnit & translationUnit ); 25 28 -
src/Validate/LabelAddressFixer.cpp
r4e2befe3 rdef751f 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // LabelAddressFixer.cpp -- 7 // LabelAddressFixer.cpp -- Create label address expressions. 8 8 // 9 9 // Author : Andrew Beach -
src/Validate/LabelAddressFixer.hpp
r4e2befe3 rdef751f 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // LabelAddressFixer.hpp -- 7 // LabelAddressFixer.hpp -- Create label address expressions. 8 8 // 9 9 // Author : Andrew Beach … … 20 20 namespace Validate { 21 21 22 /// Label addresses are not actually created in the parser, this pass finds 23 /// the patterns that represent the label address expression. 22 24 void fixLabelAddresses( ast::TranslationUnit & translationUnit ); 23 25 -
src/Validate/LinkReferenceToTypes.hpp
r4e2befe3 rdef751f 22 22 namespace Validate { 23 23 24 /// Fills in the base value of various instance types, and some related 25 /// adjustments, such as setting the sized flag. 26 /// Because of the sized flag, it must happen before auto-gen. 24 27 void linkReferenceToTypes( ast::TranslationUnit & translationUnit ); 25 28 -
src/Validate/ReplaceTypedef.cpp
r4e2befe3 rdef751f 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // ReplaceTypedef.cpp -- 7 // ReplaceTypedef.cpp -- Fill in all typedefs with the underlying type. 8 8 // 9 9 // Author : Andrew Beach 10 10 // Created On : Tue Jun 29 14:59:00 2022 11 11 // Last Modified By : Andrew Beach 12 // Last Modified On : Mon Jul 12 14:17:00 202213 // Update Count : 012 // Last Modified On : Wed Jul 13 14:45:00 2022 13 // Update Count : 1 14 14 // 15 15 … … 63 63 void previsit( ast::TraitDecl const * ); 64 64 65 void previsit( ast::FunctionType const * );66 67 65 template<typename AggrDecl> 68 66 void addImplicitTypedef( AggrDecl * aggDecl ); … … 78 76 CodeLocation const * nearestLocation = nullptr; 79 77 int scopeLevel; 80 bool i nFunctionType= false;78 bool isAtFunctionTop = false; 81 79 }; 82 80 … … 105 103 ast::Type * ret = ast::deepCopy( def->second.first->base ); 106 104 ret->qualifiers |= type->qualifiers; 107 // GCC ignores certain attributes if they arrive by typedef, 108 // this mimics that. 109 // TODO: This might cover too much, it should just cover arguments 110 // and return values of a function. 111 if ( visitor->isInFunction() ) { 105 // We ignore certain attributes on function parameters if they arrive 106 // by typedef. GCC appears to do the same thing. 107 if ( isAtFunctionTop ) { 112 108 erase_if( ret->attributes, isNonParameterAttribute ); 113 109 } … … 207 203 GuardScope( typedefNames ); 208 204 GuardScope( typedeclNames ); 205 GuardValue( isAtFunctionTop ) = true; 209 206 } 210 207 … … 262 259 GuardScope( typedefNames ); 263 260 GuardScope( typedeclNames ); 261 GuardValue( isAtFunctionTop ) = false; 264 262 scopeLevel += 1; 265 263 } … … 292 290 GuardScope( typedefNames ); 293 291 GuardScope( typedeclNames ); 294 }295 296 void ReplaceTypedefCore::previsit( ast::FunctionType const * ) {297 GuardValue( inFunctionType ) = true;298 292 } 299 293 -
src/Validate/ReplaceTypedef.hpp
r4e2befe3 rdef751f 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // ReplaceTypedef.hpp -- 7 // ReplaceTypedef.hpp -- Fill in all typedefs with the underlying type. 8 8 // 9 9 // Author : Andrew Beach … … 22 22 namespace Validate { 23 23 24 /// Uses of typedef are replaced with the type in the typedef. 24 25 void replaceTypedef( ast::TranslationUnit & translationUnit ); 25 26 -
src/Validate/VerifyCtorDtorAssign.cpp
r4e2befe3 rdef751f 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // VerifyCtorDtorAssign.cpp -- 7 // VerifyCtorDtorAssign.cpp -- Check the form of operators. 8 8 // 9 9 // Author : Andrew Beach -
src/Validate/VerifyCtorDtorAssign.hpp
r4e2befe3 rdef751f 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // VerifyCtorDtorAssign.hpp -- 7 // VerifyCtorDtorAssign.hpp -- Check the form of operators. 8 8 // 9 9 // Author : Andrew Beach … … 22 22 namespace Validate { 23 23 24 /// Check that constructors, destructors and assignments all have the correct 25 /// form. Must happen before auto-gen or anything that examines operators. 24 26 void verifyCtorDtorAssign( ast::TranslationUnit & translationUnit ); 25 27 -
src/Virtual/Tables.h
r4e2befe3 rdef751f 19 19 #include "AST/Fwd.hpp" 20 20 class Declaration; 21 class Expression; 22 class FunctionDecl; 23 class Initializer; 24 class ObjectDecl; 21 25 class StructDecl; 22 class Expression; 26 class StructInstType; 27 class Type; 23 28 24 29 namespace Virtual { -
src/main.cc
r4e2befe3 rdef751f 10 10 // Created On : Fri May 15 23:12:02 2015 11 11 // Last Modified By : Andrew Beach 12 // Last Modified On : Tue Jul 12 12:02:00 202213 // Update Count : 67 512 // Last Modified On : Mon Jul 18 11:08:00 2022 13 // Update Count : 676 14 14 // 15 15 … … 330 330 Stats::Time::StopBlock(); 331 331 332 PASS( "Translate Exception Declarations", ControlStruct::translateExcept( translationUnit ) );333 if ( exdeclp ) {334 dump( translationUnit );335 return EXIT_SUCCESS;336 } // if337 338 CodeTools::fillLocations( translationUnit );339 340 332 if( useNewAST ) { 341 CodeTools::fillLocations( translationUnit );342 343 333 if (Stats::Counters::enabled) { 344 334 ast::pass_visitor_stats.avg = Stats::Counters::build<Stats::Counters::AverageCounter<double>>("Average Depth - New"); … … 349 339 forceFillCodeLocations( transUnit ); 350 340 351 // Must happen before auto-gen, or anything that examines ops. 341 PASS( "Translate Exception Declarations", ControlStruct::translateExcept( transUnit ) ); 342 if ( exdeclp ) { 343 dump( move( transUnit ) ); 344 return EXIT_SUCCESS; 345 } 346 352 347 PASS( "Verify Ctor, Dtor & Assign", Validate::verifyCtorDtorAssign( transUnit ) ); 353 354 348 PASS( "Hoist Type Decls", Validate::hoistTypeDecls( transUnit ) ); 355 349 // Hoist Type Decls pulls some declarations out of contexts where … … 359 353 360 354 PASS( "Replace Typedefs", Validate::replaceTypedef( transUnit ) ); 361 362 // Must happen before auto-gen.363 355 PASS( "Fix Return Types", Validate::fixReturnTypes( transUnit ) ); 364 365 // Must happen before Link Reference to Types, it needs correct366 // types for mangling.367 356 PASS( "Enum and Pointer Decay", Validate::decayEnumsAndPointers( transUnit ) ); 368 357 369 // Must happen before auto-gen, because it uses the sized flag.370 358 PASS( "Link Reference To Types", Validate::linkReferenceToTypes( transUnit ) ); 371 359 372 // Must happen after Link References To Types,373 // because aggregate members are accessed.374 360 PASS( "Fix Qualified Types", Validate::fixQualifiedTypes( transUnit ) ); 375 376 361 PASS( "Hoist Struct", Validate::hoistStruct( transUnit ) ); 377 362 PASS( "Eliminate Typedef", Validate::eliminateTypedef( transUnit ) ); 378 379 // Check as early as possible. 
Can't happen before380 // LinkReferenceToType, observed failing when attempted381 // before eliminateTypedef382 363 PASS( "Validate Generic Parameters", Validate::fillGenericParameters( transUnit ) ); 383 384 364 PASS( "Translate Dimensions", Validate::translateDimensionParameters( transUnit ) ); 385 365 PASS( "Check Function Returns", Validate::checkReturnStatements( transUnit ) ); 386 387 // Must happen before Autogen.388 366 PASS( "Fix Return Statements", InitTweak::fixReturnStatements( transUnit ) ); 389 390 367 PASS( "Implement Concurrent Keywords", Concurrency::implementKeywords( transUnit ) ); 391 392 // Must be after implement concurrent keywords; because uniqueIds393 // must be set on declaration before resolution.394 // Must happen before autogen routines are added.395 368 PASS( "Forall Pointer Decay", Validate::decayForallPointers( transUnit ) ); 396 397 // Must happen before autogen routines are added.398 369 PASS( "Hoist Control Declarations", ControlStruct::hoistControlDecls( transUnit ) ); 399 370 400 // Must be after enum and pointer decay.401 // Must be before compound literals.402 371 PASS( "Generate Autogen Routines", Validate::autogenerateRoutines( transUnit ) ); 403 372 … … 470 439 PASS( "Translate Tries", ControlStruct::translateTries( transUnit ) ); 471 440 PASS( "Gen Waitfor", Concurrency::generateWaitFor( transUnit ) ); 441 PASS( "Convert Specializations", GenPoly::convertSpecializations( transUnit ) ); // needs to happen before tuple types are expanded 442 472 443 473 444 translationUnit = convert( move( transUnit ) ); 474 445 } else { 446 PASS( "Translate Exception Declarations", ControlStruct::translateExcept( translationUnit ) ); 447 if ( exdeclp ) { 448 dump( translationUnit ); 449 return EXIT_SUCCESS; 450 } // if 451 475 452 // add the assignment statement after the initialization of a type parameter 476 453 PASS( "Validate", SymTab::validate( translationUnit ) ); … … 538 515 PASS( "Translate Tries", ControlStruct::translateTries( translationUnit ) ); 539 516 PASS( "Gen Waitfor", Concurrency::generateWaitFor( translationUnit ) ); 517 PASS( "Convert Specializations", GenPoly::convertSpecializations( translationUnit ) ); // needs to happen before tuple types are expanded 518 540 519 } 541 520 542 PASS( "Convert Specializations", GenPoly::convertSpecializations( translationUnit ) ); // needs to happen before tuple types are expanded 521 522 // PASS( "Convert Specializations", GenPoly::convertSpecializations( translationUnit ) ); // needs to happen before tuple types are expanded 543 523 544 524 PASS( "Expand Tuples", Tuples::expandTuples( translationUnit ) ); // xxx - is this the right place for this? -
tests/alloc2.cfa
r4e2befe3 rdef751f 11 11 typedef struct S1 T1; 12 12 13 void test_base( void * ip, size_t size, size_t align ) {13 void test_base( void * ip, size_t size, size_t align ) { 14 14 tests_total += 1; 15 // printf( "DEBUG: starting test %d\n", tests_total);16 bool passed = (malloc_size( ip) == size) && (malloc_usable_size(ip) >= size) && (malloc_alignment(ip) == align) && ((uintptr_t)ip % align == 0);17 if ( !passed) {18 printf( "failed test %3d: %4zu %4zu but got %4zu ( %3zu ) %4zu\n", tests_total, size, align, malloc_size(ip), malloc_usable_size(ip), malloc_alignment(ip));15 // printf( "DEBUG: starting test %d\n", tests_total); 16 bool passed = (malloc_size( ip ) == size) && (malloc_usable_size( ip ) >= size) && (malloc_alignment( ip ) == align) && ((uintptr_t)ip % align == 0); 17 if ( ! passed ) { 18 printf( "failed test %3d: %4zu %4zu but got %4zu ( %3zu ) %4zu\n", tests_total, size, align, malloc_size( ip ), malloc_usable_size( ip ), malloc_alignment( ip ) ); 19 19 tests_failed += 1; 20 } 21 // printf( "DEBUG: done test %d\n", tests_total);20 } // if 21 // printf( "DEBUG: done test %d\n", tests_total); 22 22 } 23 23 24 void test_fill( void * ip_, size_t start, size_t end, char fill ) {24 void test_fill( void * ip_, size_t start, size_t end, char fill ) { 25 25 tests_total += 1; 26 // printf( "DEBUG: starting test %d\n", tests_total);26 // printf( "DEBUG: starting test %d\n", tests_total ); 27 27 bool passed = true; 28 28 char * ip = (char *) ip_; 29 for ( i; start ~ end) passed = passed && (ip[i] == fill);30 if ( !passed) {31 printf( "failed test %3d: fill C\n", tests_total);29 for ( i; start ~ end ) passed = passed && (ip[i] == fill); 30 if ( ! passed ) { 31 printf( "failed test %3d: fill C\n", tests_total ); 32 32 tests_failed += 1; 33 } 34 // printf( "DEBUG: done test %d\n", tests_total);33 } // if 34 // printf( "DEBUG: done test %d\n", tests_total ); 35 35 } 36 36 37 void test_fill( void * ip_, size_t start, size_t end, int fill ) {37 void test_fill( void * ip_, size_t start, size_t end, int fill ) { 38 38 tests_total += 1; 39 // printf( "DEBUG: starting test %d\n", tests_total);39 // printf( "DEBUG: starting test %d\n", tests_total ); 40 40 bool passed = true; 41 int * ip = (int *) 42 for (i; start ~ end ) passed = passed && (ip[i] == fill);43 if ( !passed) {44 printf( "failed test %3d: fill int\n", tests_total);41 int * ip = (int *)ip_; 42 for (i; start ~ end ) passed = passed && (ip[i] == fill); 43 if ( ! passed ) { 44 printf( "failed test %3d: fill int\n", tests_total ); 45 45 tests_failed += 1; 46 } 47 // printf( "DEBUG: done test %d\n", tests_total);46 } // if 47 // printf( "DEBUG: done test %d\n", tests_total ); 48 48 } 49 49 50 void test_fill( void * ip_, size_t start, size_t end, int * fill ) {50 void test_fill( void * ip_, size_t start, size_t end, int * fill ) { 51 51 tests_total += 1; 52 // printf( "DEBUG: starting test %d\n", tests_total);53 bool passed = (memcmp((void*)((uintptr_t)ip_ + start), (void*)fill, end) == 0);54 if ( !passed) {55 printf( "failed test %3d: fill int A\n", tests_total);52 // printf( "DEBUG: starting test %d\n", tests_total ); 53 bool passed = memcmp((void*)((uintptr_t )ip_ + start ), (void*)fill, end ) == 0; 54 if ( ! 
passed ) { 55 printf( "failed test %3d: fill int A\n", tests_total ); 56 56 tests_failed += 1; 57 } 58 // printf( "DEBUG: done test %d\n", tests_total);57 } // if 58 // printf( "DEBUG: done test %d\n", tests_total ); 59 59 } 60 60 61 void test_fill( void * ip_, size_t start, size_t end, T1 fill ) {61 void test_fill( void * ip_, size_t start, size_t end, T1 fill ) { 62 62 tests_total += 1; 63 // printf( "DEBUG: starting test %d\n", tests_total);63 // printf( "DEBUG: starting test %d\n", tests_total ); 64 64 bool passed = true; 65 65 T1 * ip = (T1 *) ip_; 66 for ( i; start ~ end) passed = passed && (ip[i].data == fill.data);67 if ( !passed) {68 printf( "failed test %3d: fill T1\n", tests_total);66 for ( i; start ~ end ) passed = passed && (ip[i].data == fill.data ); 67 if ( ! passed ) { 68 printf( "failed test %3d: fill T1\n", tests_total ); 69 69 tests_failed += 1; 70 } 71 // printf( "DEBUG: done test %d\n", tests_total);70 } // if 71 // printf( "DEBUG: done test %d\n", tests_total ); 72 72 } 73 73 74 void test_fill( void * ip_, size_t start, size_t end, T1 * fill ) {74 void test_fill( void * ip_, size_t start, size_t end, T1 * fill ) { 75 75 tests_total += 1; 76 // printf( "DEBUG: starting test %d\n", tests_total);77 bool passed = (memcmp((void*)((uintptr_t)ip_ + start), (void*)fill, end) == 0);78 if ( !passed) {79 printf( "failed test %3d: fill T1 A\n", tests_total);76 // printf( "DEBUG: starting test %d\n", tests_total ); 77 bool passed = memcmp( (void*)((uintptr_t )ip_ + start ), (void*)fill, end ) == 0; 78 if ( ! passed ) { 79 printf( "failed test %3d: fill T1 A\n", tests_total ); 80 80 tests_failed += 1; 81 } 82 // printf( "DEBUG: done test %d\n", tests_total);81 } // if 82 // printf( "DEBUG: done test %d\n", tests_total ); 83 83 } 84 84 85 void test_use( int * ip, size_t dim ) {85 void test_use( int * ip, size_t dim ) { 86 86 tests_total += 1; 87 // printf( "DEBUG: starting test %d\n", tests_total);87 // printf( "DEBUG: starting test %d\n", tests_total ); 88 88 bool passed = true; 89 for ( i; 0 ~ dim) ip[i] = 0xdeadbeef;90 for ( i; 0 ~ dim) passed = passed && (ip[i] == 0xdeadbeef);91 if ( !passed) {92 printf( "failed test %3d: use int\n", tests_total);89 for ( i; 0 ~ dim ) ip[i] = 0xdeadbeef; 90 for ( i; 0 ~ dim ) passed = passed && (ip[i] == 0xdeadbeef); 91 if ( ! passed ) { 92 printf( "failed test %3d: use int\n", tests_total ); 93 93 tests_failed += 1; 94 } 95 // printf( "DEBUG: done test %d\n", tests_total);94 } // if 95 // printf( "DEBUG: done test %d\n", tests_total ); 96 96 } 97 97 98 void test_use( T1 * ip, size_t dim ) {98 void test_use( T1 * ip, size_t dim ) { 99 99 tests_total += 1; 100 // printf( "DEBUG: starting test %d\n", tests_total);100 // printf( "DEBUG: starting test %d\n", tests_total ); 101 101 bool passed = true; 102 for ( i; 0 ~ dim) ip[i].data = 0xdeadbeef;103 for ( i; 0 ~ dim) passed = passed && (ip[i].data == 0xdeadbeef);104 if ( !passed) {105 printf( "failed test %3d: use T1\n", tests_total);102 for ( i; 0 ~ dim ) ip[i].data = 0xdeadbeef; 103 for ( i; 0 ~ dim ) passed = passed && (ip[i].data == 0xdeadbeef); 104 if ( ! 
passed ) { 105 printf( "failed test %3d: use T1\n", tests_total ); 106 106 tests_failed += 1; 107 } 108 // printf( "DEBUG: done test %d\n", tests_total);107 } // if 108 // printf( "DEBUG: done test %d\n", tests_total ); 109 109 } 110 110 111 111 int main( void ) { 112 enum { dim = 8, align = 64, libAlign = libAlign() }; 112 113 size_t elemSize = sizeof(int); 113 size_t dim = 8;114 114 size_t size = dim * elemSize; 115 size_t align = 64; 116 const size_t libAlign = libAlign(); 117 118 int FillT = 9; 119 char FillC = 'a'; 120 int * FillA = calloc(dim / 4); 121 T1 FillT1 = { FillT }; 122 T1 * FillT1A = (T1 *)(void *) malloc( (dim / 4) * sizeof(T1) ); 123 for (i; 0 ~ (dim / 4) ) FillT1A[i] = FillT1; 124 125 int * ip; 126 int * op; 127 double * dp; 128 T1 * t1p; 129 T1 * t1op; 115 116 int FillT = 9; 117 char FillC = 'a'; 118 int * FillA = calloc( dim / 4 ); 119 T1 FillT1 = { FillT }; 120 T1 * FillT1A = (T1 *)(void *) malloc( (dim / 4) * sizeof(T1) ); 121 for ( i; 0 ~ (dim / 4) ) FillT1A[i] = FillT1; 122 123 int * ip; 124 int * op; 125 double * dp; 126 T1 * t1p; 127 T1 * t1op; 130 128 131 129 // testing alloc … … 136 134 137 135 ip = alloc(); 138 test_base( ip, elemSize, libAlign);139 test_use( ip, elemSize / elemSize);140 free( ip);136 test_base( ip, elemSize, libAlign ); 137 test_use( ip, elemSize / elemSize ); 138 free( ip ); 141 139 142 140 ip = alloc( dim ); 143 test_base( ip, size, libAlign);144 test_use( ip, size / elemSize);145 free( ip);141 test_base( ip, size, libAlign ); 142 test_use( ip, size / elemSize ); 143 free( ip ); 146 144 147 145 ip = alloc( 0 ); 148 test_base( ip, 0, libAlign);149 free( ip);146 test_base( ip, 0, libAlign ); 147 free( ip ); 150 148 151 149 dp = alloc( dim ); 152 150 ip = alloc( dp`resize ); 153 test_base( ip, elemSize, libAlign);154 test_use( ip, elemSize / elemSize);155 free( ip);156 157 ip = alloc( ((double *)0p)`resize );158 test_base( ip, elemSize, libAlign);159 test_use( ip, elemSize / elemSize);160 free( ip);151 test_base( ip, elemSize, libAlign ); 152 test_use( ip, elemSize / elemSize ); 153 free( ip ); 154 155 ip = alloc( ((double *)0p)`resize ); 156 test_base( ip, elemSize, libAlign ); 157 test_use( ip, elemSize / elemSize ); 158 free( ip ); 161 159 162 160 dp = alloc( dim ); 163 161 ip = alloc( dim, dp`resize ); 164 test_base( ip, size, libAlign);165 test_use( ip, size / elemSize);166 free( ip);162 test_base( ip, size, libAlign ); 163 test_use( ip, size / elemSize ); 164 free( ip ); 167 165 168 166 dp = alloc( dim ); 169 167 ip = alloc( 0, dp`resize ); 170 test_base( ip, 0, libAlign);171 free( ip);172 173 ip = alloc( dim, ((double*)0p)`resize );174 test_base( ip, size, libAlign);175 test_use( ip, size / elemSize);176 free( ip);177 178 ip = alloc( 0, ((double*)0p)`resize );179 test_base( ip, 0, libAlign);180 free( ip);181 182 op = alloc( dim, ((int)0xdeadbeef)`fill );168 test_base( ip, 0, libAlign ); 169 free( ip ); 170 171 ip = alloc( dim, 0p`resize ); 172 test_base( ip, size, libAlign ); 173 test_use( ip, size / elemSize ); 174 free( ip ); 175 176 ip = alloc( 0, 0p`resize ); 177 test_base( ip, 0, libAlign ); 178 free( ip ); 179 180 op = alloc( dim, 0xdeadbeefN`fill ); 183 181 ip = alloc( dim, op`realloc ); 184 test_base( ip, size, libAlign);185 test_fill( ip, 0, dim, (int)0xdeadbeef);186 test_use( ip, size / elemSize);187 free( ip);188 189 op = alloc( dim, ((int)0xdeadbeef)`fill );182 test_base( ip, size, libAlign ); 183 test_fill( ip, 0, dim, 0xdeadbeefN ); 184 test_use( ip, size / elemSize ); 185 free( ip ); 186 187 op = alloc( dim, 
0xdeadbeefN`fill ); 190 188 ip = alloc( 0, op`realloc ); 191 test_base( ip, 0, libAlign);192 free( ip);193 194 ip = alloc( dim, ((int*)0p)`realloc );195 test_base( ip, size, libAlign);196 test_use( ip, size / elemSize);197 free( ip);198 199 ip = alloc( 0, ((int*)0p)`realloc );200 test_base( ip, 0, libAlign);201 free( ip);202 203 op = alloc( dim, ((int)0xdeadbeef)`fill );189 test_base( ip, 0, libAlign ); 190 free( ip ); 191 192 ip = alloc( dim, 0p`realloc ); 193 test_base( ip, size, libAlign ); 194 test_use( ip, size / elemSize ); 195 free( ip ); 196 197 ip = alloc( 0, 0p`realloc ); 198 test_base( ip, 0, libAlign ); 199 free( ip ); 200 201 op = alloc( dim, 0xdeadbeefN`fill ); 204 202 ip = alloc( dim, op`resize ); 205 test_base( ip, size, libAlign);206 test_use( ip, size / elemSize);207 free( ip);203 test_base( ip, size, libAlign ); 204 test_use( ip, size / elemSize ); 205 free( ip ); 208 206 209 207 ip = alloc( FillC`fill ); 210 test_base( ip, elemSize, libAlign);211 test_fill( ip, 0, elemSize, FillC);212 test_use( ip, elemSize / elemSize);213 free( ip);208 test_base( ip, elemSize, libAlign ); 209 test_fill( ip, 0, elemSize, FillC ); 210 test_use( ip, elemSize / elemSize ); 211 free( ip ); 214 212 215 213 ip = alloc( FillT`fill ); 216 test_base( ip, elemSize, libAlign);217 test_fill( ip, 0, 1, FillT);218 test_use( ip, elemSize / elemSize);219 free( ip);214 test_base( ip, elemSize, libAlign ); 215 test_fill( ip, 0, 1, FillT ); 216 test_use( ip, elemSize / elemSize ); 217 free( ip ); 220 218 221 219 ip = alloc( dim, FillC`fill ); 222 test_base( ip, size, libAlign);223 test_fill( ip, 0, size, FillC);224 test_use( ip, size / elemSize);225 free( ip);220 test_base( ip, size, libAlign ); 221 test_fill( ip, 0, size, FillC ); 222 test_use( ip, size / elemSize ); 223 free( ip ); 226 224 227 225 ip = alloc( 0, FillC`fill ); 228 test_base( ip, 0, libAlign);229 free( ip);226 test_base( ip, 0, libAlign ); 227 free( ip ); 230 228 231 229 ip = alloc( dim, FillT`fill ); 232 test_base( ip, size, libAlign);233 test_fill( ip, 0, dim, FillT);234 test_use( ip, size / elemSize);235 free( ip);230 test_base( ip, size, libAlign ); 231 test_fill( ip, 0, dim, FillT ); 232 test_use( ip, size / elemSize ); 233 free( ip ); 236 234 237 235 ip = alloc( 0, FillT`fill ); 238 test_base( ip, 0, libAlign);239 free( ip);236 test_base( ip, 0, libAlign ); 237 free( ip ); 240 238 241 239 ip = alloc( dim, [FillA, dim/4]`fill ); 242 test_base( ip, size, libAlign);243 test_fill( ip, 0, size/4, FillA);244 test_use( ip, size / elemSize);245 free( ip);240 test_base( ip, size, libAlign ); 241 test_fill( ip, 0, size/4, FillA ); 242 test_use( ip, size / elemSize ); 243 free( ip ); 246 244 247 245 ip = alloc( 0, [FillA, dim/4]`fill ); 248 test_base( ip, 0, libAlign);249 free( ip);250 251 op = alloc( dim, ((int)0xdeadbeef)`fill );246 test_base( ip, 0, libAlign ); 247 free( ip ); 248 249 op = alloc( dim, 0xdeadbeefN`fill ); 252 250 ip = alloc( dim, op`realloc, FillC`fill ); 253 test_base( ip, size, libAlign);254 test_fill( ip, 0, dim, (int)0xdeadbeef);255 test_use( ip, size / elemSize);256 free( ip);257 258 op = alloc( dim, ((int)0xdeadbeef)`fill );251 test_base( ip, size, libAlign ); 252 test_fill( ip, 0, dim, 0xdeadbeefN ); 253 test_use( ip, size / elemSize ); 254 free( ip ); 255 256 op = alloc( dim, 0xdeadbeefN`fill ); 259 257 ip = alloc( dim / 4, op`realloc, FillC`fill ); 260 test_base( ip, size / 4, libAlign);261 test_fill( ip, 0, dim / 4, (int)0xdeadbeef);262 test_use( ip, size / 4 / elemSize);263 free( ip);264 265 op = alloc( dim, 
The remaining hunks in this file cover the rest of the alloc unit tests: the int-pointer cases (fill, realloc, resize, align, and their combinations) followed by the aligned-struct (T1) cases. Every hunk applies the same mechanical edits; no test logic changes:

- the int fill pattern ((int)0xdeadbeef) becomes the literal 0xdeadbeefN, both as the `fill argument and as the expected value checked by test_fill:

	- op = alloc( dim, ((int)0xdeadbeef)`fill );
	+ op = alloc( dim, 0xdeadbeefN`fill );
	- test_fill( ip, 0, dim, (int)0xdeadbeef);
	+ test_fill( ip, 0, dim, 0xdeadbeefN );

- the explicitly cast null pointers passed to `realloc and `resize become the untyped null constant 0p:

	- ip = alloc( dim, ((int*)0p)`realloc, FillC`fill );
	+ ip = alloc( dim, 0p`realloc, FillC`fill );
	- t1p = alloc( ((double*)0p)`resize, align`align );
	+ t1p = alloc( 0p`resize, align`align );

- argument spacing on the changed lines is normalized to include a space before the closing parenthesis:

	- test_base( ip, size * 4, libAlign);
	+ test_base( ip, size * 4, libAlign );
	- free( ip);
	+ free( ip );

The struct fill value ((T1){0xdeadbeef})`fill and its test_fill checks are left unchanged. The only structural change is at the end of main, where the redundant return 0; is dropped:

	- if (tests_failed == 0) printf("PASSED alloc tests (aligned struct)\n\n");
	- else printf("failed alloc tests (aligned struct) : %d/%d\n\n", tests_failed, tests_total);
	+ if ( tests_failed == 0) printf( "PASSED alloc tests (aligned struct)\n\n");
	+ else printf( "failed alloc tests ( aligned struct ) : %d/%d\n\n", tests_failed, tests_total );

	- printf("(if applicable) alignment error below indicates memory trashing caused by test_use.\n\n");
	- free(FillA);
	- free(FillT1A);
	- return 0;
	+ printf( "(if applicable) alignment error below indicates memory trashing caused by test_use.\n\n");
	+ free( FillA );
	+ free( FillT1A );
	  } // main