Changes in / [def751f:4e2befe3]


Ignore:
Files:
2 deleted
76 edited

Legend:

Unmodified
Added
Removed
  • Jenkins/FullBuild

    rdef751f r4e2befe3  
    161161        <p>${result}</p>
    162162
     163        <p>- Performance ---------------------------------------------------------</p>
     164
     165        <img src="https://cforall.uwaterloo.ca/jenkins/job/Cforall/job/master/plot/Compilation/getPlot?index=0" >
     166        <img src="https://cforall.uwaterloo.ca/jenkins/job/Cforall/job/master/plot/Compilation/getPlot?index=1" >
     167
    163168        <p>- Logs ----------------------------------------------------------------</p>
    164169        """
  • Jenkinsfile

    rdef751f r4e2befe3  
    209209
    210210                if( Settings.Publish && !Settings.RunBenchmark ) { echo 'No results to publish!!!' }
     211
     212                def groupCompile = new PlotGroup('Compilation', 'duration (s) - lower is better', true)
     213                def groupConcurrency = new PlotGroup('Concurrency', 'duration (n) - lower is better', false)
     214
     215                //Then publish the results
     216                do_plot(Settings.RunBenchmark && Settings.Publish, 'compile'        , groupCompile    , false, 'Compilation')
     217                do_plot(Settings.RunBenchmark && Settings.Publish, 'compile.diff'   , groupCompile    , true , 'Compilation (relative)')
     218                do_plot(Settings.RunBenchmark && Settings.Publish, 'ctxswitch'      , groupConcurrency, false, 'Context Switching')
     219                do_plot(Settings.RunBenchmark && Settings.Publish, 'ctxswitch.diff' , groupConcurrency, true , 'Context Switching (relative)')
     220                do_plot(Settings.RunBenchmark && Settings.Publish, 'mutex'          , groupConcurrency, false, 'Mutual Exclusion')
     221                do_plot(Settings.RunBenchmark && Settings.Publish, 'mutex.diff'     , groupConcurrency, true , 'Mutual Exclusion (relative)')
     222                do_plot(Settings.RunBenchmark && Settings.Publish, 'scheduling'     , groupConcurrency, false, 'Internal and External Scheduling')
     223                do_plot(Settings.RunBenchmark && Settings.Publish, 'scheduling.diff', groupConcurrency, true , 'Internal and External Scheduling (relative)')
    211224        }
    212225}
     
    363376                this.GitNewRef = ''
    364377                this.GitOldRef = ''
     378        }
     379}
     380
     381class PlotGroup implements Serializable {
     382        public String name
     383        public String unit
     384        public boolean log
     385
     386        PlotGroup(String name, String unit, boolean log) {
     387                this.name = name
     388                this.unit = unit
     389                this.log = log
    365390        }
    366391}
     
    451476        }
    452477}
     478
     479def do_plot(boolean new_data, String file, PlotGroup group, boolean relative, String title) {
     480
     481        if(new_data) {
     482                echo "Publishing new data"
     483        }
     484
     485        def series = new_data ? [[
     486                                file: "${file}.csv",
     487                                exclusionValues: '',
     488                                displayTableFlag: false,
     489                                inclusionFlag: 'OFF',
     490                                url: ''
     491                        ]] : [];
     492
     493        echo "file is ${BuildDir}/benchmark/${file}.csv, group ${group}, title ${title}"
     494        dir("${BuildDir}/benchmark/") {
     495                plot csvFileName: "cforall-${env.BRANCH_NAME}-${file}.csv",
     496                        csvSeries: series,
     497                        group: "${group.name}",
     498                        title: "${title}",
     499                        style: 'lineSimple',
     500                        exclZero: false,
     501                        keepRecords: false,
     502                        logarithmic: !relative && group.log,
     503                        numBuilds: '120',
     504                        useDescr: true,
     505                        yaxis: group.unit,
     506                        yaxisMaximum: '',
     507                        yaxisMinimum: ''
     508        }
     509}
  • Makefile.am

    rdef751f r4e2befe3  
    5252        @find libcfa -name config.status -printf "\n%h\n\t" -exec {} --config \; | sed "s/ /\n\t/g; s/\t'/\t/g; s/'\n/\n/g; s/^'//g; s/'$$//g"
    5353
    54 @LIBCFA_TARGET_DIRS@::
    55         $(MAKE) -C $@ $(MAKECMDGOALS)
     54mostlyclean-local: @LIBCFA_TARGET_MAKEFILES@
     55        for dir in @LIBCFA_TARGET_DIRS@; do \
     56                $(MAKE) -C $${dir} mostlyclean; \
     57        done
    5658
    57 mostlyclean clean distclean maintainer-clean: @LIBCFA_TARGET_DIRS@
     59clean-local: @LIBCFA_TARGET_MAKEFILES@
     60        for dir in @LIBCFA_TARGET_DIRS@; do \
     61                $(MAKE) -C $${dir} clean; \
     62        done
     63
     64distclean-local: @LIBCFA_TARGET_MAKEFILES@
     65        for dir in @LIBCFA_TARGET_DIRS@; do \
     66                $(MAKE) -C $${dir} distclean; \
     67                rm $${dir}/config.data; \
     68        done
  • benchmark/readyQ/churn.cfa

    rdef751f r4e2befe3  
    5858
    5959                        threads_left = nthreads;
    60                         BThrd ** threads = alloc(nthreads);
     60                        BThrd * threads[nthreads];
    6161                        for(i; nthreads ) {
    6262                                BThrd & t = *(threads[i] = malloc());
     
    9090
    9191                        free(spots);
    92                         free(threads);
    9392                }
    9493
  • benchmark/readyQ/cycle.cfa

    rdef751f r4e2befe3  
    5252                {
    5353                        threads_left = tthreads;
    54                         BThrd **  threads = alloc(tthreads);
    55                         Partner * thddata = alloc(tthreads);
     54                        BThrd * threads[tthreads];
     55                        Partner thddata[tthreads];
    5656                        for(i; tthreads) {
    57                                 (thddata[i]){};
    5857                                unsigned pi = (i + nthreads) % tthreads;
    5958                                thddata[i].next = &thddata[pi].self;
     
    8483                                delete(threads[i]);
    8584                        }
    86                         free(threads);
    87                         free(thddata);
    8885                }
    8986
  • benchmark/readyQ/cycle.cpp

    rdef751f r4e2befe3  
    3939                {
    4040                        threads_left = tthreads;
    41                         Fibre ** threads = new Fibre *[tthreads]();
    42                         Partner* thddata = new Partner[tthreads]();
     41                        Fibre * threads[tthreads];
     42                        Partner thddata[tthreads];
    4343                        for(unsigned i = 0; i < tthreads; i++) {
    4444                                unsigned pi = (i + nthreads) % tthreads;
     
    6969                                global_blocks  += thddata[i].blocks;
    7070                        }
    71 
    72                         delete[](threads);
    73                         delete[](thddata);
    7471                }
    7572
  • benchmark/readyQ/locality.cfa

    rdef751f r4e2befe3  
    222222                threads_left = nprocs;
    223223                {
    224                         MyThread ** threads = alloc(nthreads);
     224                        MyThread * threads[nthreads];
    225225                        for(i; nthreads) {
    226226                                threads[i] = malloc();
     
    259259                                free( threads[i] );
    260260                        }
    261                         free( threads );
    262261                }
    263262
  • benchmark/readyQ/locality.cpp

    rdef751f r4e2befe3  
    217217        {
    218218                FibreInit(1, nprocs);
    219                 MyData ** data_arrays = new MyData *[nthreads]();
     219                MyData * data_arrays[nthreads];
    220220                for(size_t i = 0; i < nthreads; i++) {
    221221                        data_arrays[i] = new MyData( i, wsize );
     
    228228
    229229                threads_left = nthreads - nspots;
    230                 Fibre ** threads = new Fibre *[nthreads]();
    231                 MyCtx ** thddata = new MyCtx *[nthreads]();
     230                Fibre * threads[nthreads];
     231                MyCtx * thddata[nthreads];
    232232                {
    233233                        for(size_t i = 0; i < nthreads; i++) {
     
    240240                                        i
    241241                                );
    242                                 threads[i] = new Fibre();
    243                                 threads[i]->run( reinterpret_cast<void (*)(MyCtx*)>(thread_main), thddata[i] );
     242                                threads[i] = new Fibre( reinterpret_cast<void (*)(void *)>(thread_main), thddata[i] );
    244243                        }
    245244
     
    268267                        delete( data_arrays[i] );
    269268                }
    270                 delete[](data_arrays);
    271269
    272270                for(size_t i = 0; i < nspots; i++) {
    273271                        delete( spots[i] );
    274272                }
    275 
    276                 delete[](threads);
    277                 delete[](thddata);
    278273        }
    279274
  • benchmark/readyQ/yield.cfa

    rdef751f r4e2befe3  
    3434                {
    3535                        threads_left = nthreads;
    36                         Yielder * threads = alloc(nthreads);
    37                         for(i; nthreads) {
    38                                 (threads[i]){};
    39                         }
    40 
     36                        Yielder threads[nthreads];
    4137                        printf("Starting\n");
    4238
     
    5652                                Yielder & y = join( threads[i] );
    5753                                global_counter += y.count;
    58                                 ^(threads[i]){};
    5954                        }
    60                         free(threads);
    6155                }
    6256
  • benchmark/readyQ/yield.cpp

    rdef751f r4e2befe3  
    3333                {
    3434                        threads_left = nthreads;
    35                         Fibre ** threads = new Fibre *[nthreads]();
     35                        Fibre * threads[nthreads];
    3636                        for(unsigned i = 0; i < nthreads; i++) {
    3737                                threads[i] = new Fibre();
     
    5252                                fibre_join( threads[i], nullptr );
    5353                        }
    54                         delete[] threads;
    5554                }
    5655
  • doc/bibliography/pl.bib

    rdef751f r4e2befe3  
    20242024@manual{C++20Coroutine19,
    20252025    keywords    = {coroutine},
    2026     key         = {Coroutines},
    20272026    contributer = {pabuhr@plg},
    20282027    title       = {Coroutines (C++20)},
    20292028    organization= {cppreference.com},
    2030     month       = jun,
    2031     year        = 2022,
     2029    month       = apr,
     2030    year        = 2019,
    20322031    note        = {\href{https://en.cppreference.com/w/cpp/language/coroutines}{https://\-en.cppreference.com/\-w/\-cpp/\-language/\-coroutines}},
    20332032}
     
    69926991% S
    69936992
    6994 @inproceedings{Imam14,
    6995     keywords    = {actor model, performance comparison, java actor libraries, benchmark suite},
    6996     contributer = {pabuhr@plg},
    6997     author      = {Shams M. Imam and Vivek Sarkar},
    6998     title       = {Savina - An Actor Benchmark Suite: Enabling Empirical Evaluation of Actor Libraries},
    6999     year        = {2014},
    7000     publisher   = {ACM},
    7001     address     = {New York, NY, USA},
    7002     booktitle   = {Proceedings of the 4th International Workshop on Programming Based on Actors Agents \& Decentralized Control},
    7003     pages       = {67-80},
    7004     numpages    = {14},
    7005     location    = {Portland, Oregon, USA},
    7006     series      = {AGERE! '14}
    7007 }
    7008 
    70096993@manual{Scala,
    70106994    keywords    = {Scala programming language},
  • doc/theses/mike_brooks_MMath/array.tex

    rdef751f r4e2befe3  
    182182\CFA's array is also the first extension of C to use its tracked bounds to generate the pointer arithmetic implied by advanced allocation patterns.  Other bound-tracked extensions of C either forbid certain C patterns entirely, or address the problem of \emph{verifying} that the user's provided pointer arithmetic is self-consistent.  The \CFA array, applied to accordion structures [TODO: cross-reference] \emph{implies} the necessary pointer arithmetic, generated automatically, and not appearing at all in a user's program.
    183183
    184 \subsection{Safety in a padded room}
     184\subsection{Safety in a padded room}
    185185
    186186Java's array [todo:cite] is a straightforward example of assuring safety against undefined behaviour, at a cost of expressiveness for more applied properties.  Consider the array parameter declarations in:
  • doc/theses/thierry_delisle_PhD/thesis/fig/cycle.fig

    rdef751f r4e2befe3  
    88-2
    991200 2
    10 5 1 0 1 0 7 50 -1 -1 0.000 0 1 1 0 3150.000 4012.500 2850 4575 3150 4650 3450 4575
    11         1 1 1.00 60.00 120.00
    12 5 1 0 1 0 7 50 -1 -1 0.000 0 0 0 1 2268.750 3450.000 1950 3825 1800 3600 1800 3300
    13         1 1 1.00 60.00 120.00
    14 5 1 0 1 0 7 50 -1 -1 0.000 0 1 1 0 4031.250 3450.000 4350 3825 4500 3600 4500 3300
    15         1 1 1.00 60.00 120.00
    16 5 1 0 1 0 7 50 -1 -1 0.000 0 0 0 1 3675.000 2250.000 3750 1725 4050 1875 4200 2175
    17         1 1 1.00 60.00 120.00
    18 5 1 0 1 0 7 50 -1 -1 0.000 0 1 1 0 2625.000 2250.000 2550 1725 2250 1875 2100 2175
    19         1 1 1.00 60.00 120.00
    20 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 3150 1800 600 600 3150 1800 3750 1800
    21 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 1875 2700 600 600 1875 2700 2475 2700
    22 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 2400 4200 600 600 2400 4200 3000 4200
    23 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 3900 4200 600 600 3900 4200 4500 4200
    24 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 4425 2700 600 600 4425 2700 5025 2700
    25 4 1 0 50 -1 0 11 0.0000 2 165 855 2400 4275 Thread$_3$\001
    26 4 1 0 50 -1 0 11 0.0000 2 165 855 3900 4275 Thread$_4$\001
    27 4 1 0 50 -1 0 11 0.0000 2 165 855 1875 2775 Thread$_2$\001
    28 4 1 0 50 -1 0 11 0.0000 2 165 855 3150 1875 Thread$_1$\001
    29 4 1 0 50 -1 0 11 0.0000 2 165 855 4425 2775 Thread$_5$\001
    30 4 1 0 50 -1 0 11 0.0000 2 180 540 3150 4875 Unpark\001
    31 4 0 0 50 -1 0 11 0.0000 2 180 540 4650 3675 Unpark\001
    32 4 2 0 50 -1 0 11 0.0000 2 180 540 1650 3600 Unpark\001
    33 4 2 0 50 -1 0 11 0.0000 2 180 540 2100 1875 Unpark\001
    34 4 0 0 50 -1 0 11 0.0000 2 180 540 4200 1875 Unpark\001
     105 1 0 1 0 7 50 -1 -1 0.000 0 1 1 0 3144.643 2341.072 3525 2250 3375 2025 3150 1950
     11        2 0 1.00 60.00 120.00
     125 1 0 1 0 7 50 -1 -1 0.000 0 1 1 0 1955.357 2341.072 1950 1950 1725 2025 1575 2250
     13        2 0 1.00 60.00 120.00
     145 1 0 1 0 7 50 -1 -1 0.000 0 1 1 0 3637.500 3487.500 3750 3750 3900 3600 3900 3375
     15        2 0 1.00 60.00 120.00
     165 1 0 1 0 7 50 -1 -1 0.000 0 1 1 0 2587.500 4087.500 2325 4500 2550 4575 2850 4500
     17        2 0 1.00 60.00 120.00
     185 1 0 1 0 7 50 -1 -1 0.000 0 1 1 0 1612.500 3487.500 1200 3375 1200 3600 1350 3825
     19        2 0 1.00 60.00 120.00
     201 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 3675 2850 586 586 3675 2850 4125 3225
     211 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 3300 4125 586 586 3300 4125 3750 4500
     221 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 1875 4125 586 586 1875 4125 2325 4500
     231 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 1425 2850 586 586 1425 2850 1875 3225
     241 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 2550 1950 586 586 2550 1950 3000 2325
     254 0 0 50 -1 0 11 0.0000 2 135 720 1125 2925 Thread 2\001
     264 2 0 50 -1 0 11 0.0000 2 165 540 1650 1950 Unpark\001
     274 0 0 50 -1 0 11 0.0000 2 165 540 4050 3600 Unpark\001
     284 2 0 50 -1 0 11 0.0000 2 165 540 1125 3750 Unpark\001
     294 2 0 50 -1 0 11 0.0000 2 165 540 2850 4800 Unpark\001
     304 0 0 50 -1 0 11 0.0000 2 135 720 2250 2025 Thread 1\001
     314 0 0 50 -1 0 11 0.0000 2 135 720 3000 4200 Thread 4\001
     324 0 0 50 -1 0 11 0.0000 2 135 720 1575 4200 Thread 3\001
     334 0 0 50 -1 0 11 0.0000 2 165 540 3525 2025 Unpark\001
     344 0 0 50 -1 0 11 0.0000 2 135 720 3375 2925 Thread 5\001
  • doc/theses/thierry_delisle_PhD/thesis/fig/idle.fig

    rdef751f r4e2befe3  
    88-2
    991200 2
    10 5 1 0 1 0 7 50 -1 -1 0.000 0 1 1 1 3376.136 2169.318 2250 2625 2775 3225 3525 3375
    11         1 1 1.00 60.00 120.00
    12         7 1 1.00 60.00 60.00
    13 6 3466 2774 3899 3149
     106 5919 5250 6375 5775
     115 1 0 1 0 7 50 -1 -1 0.000 0 0 0 0 6147.000 5409.011 6102 5410 6147 5364 6192 5410
     125 1 0 1 0 7 50 -1 -1 0.000 0 0 0 0 6147.000 5410.000 6010 5410 6147 5273 6284 5410
     132 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 8
     14         6010 5410 6010 5501 5919 5501 5919 5775 6375 5775 6375 5501
     15         6284 5501 6284 5410
     162 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 4
     17         6102 5410 6102 5501 6192 5501 6192 5410
     18-6
     196 7442 6525 7875 6900
    14202 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
    15          3525 2833 3466 3149
     21         7501 6584 7442 6900
    16222 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
    17          3880 2833 3860 2952
     23         7856 6584 7836 6703
    18243 2 0 1 0 7 50 -1 -1 0.000 0 0 0 4
    19          3505 2952 3623 2912 3761 2971 3860 2952
     25         7481 6703 7599 6663 7737 6722 7836 6703
    2026         0.000 -0.500 -0.500 0.000
    21273 2 0 1 0 7 50 -1 -1 0.000 0 0 0 4
    22          3527 2828 3645 2789 3783 2848 3881 2828
     28         7503 6579 7621 6540 7759 6599 7857 6579
    2329         0.000 -0.500 -0.500 0.000
    2430-6
    25 6 3599 3074 3974 3574
     316 7575 6825 7950 7325
    26322 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 8
    27          3599 3199 3724 3074 3974 3074 3974 3574 3599 3574 3599 3199
    28          3724 3199 3724 3074
     33         7575 6950 7700 6825 7950 6825 7950 7325 7575 7325 7575 6950
     34         7700 6950 7700 6825
    2935-6
    30 6 5116 2774 5549 3149
     366 9092 6525 9525 6900
    31372 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
    32          5175 2833 5116 3149
     38         9151 6584 9092 6900
    33392 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
    34          5530 2833 5510 2952
     40         9506 6584 9486 6703
    35413 2 0 1 0 7 50 -1 -1 0.000 0 0 0 4
    36          5155 2952 5273 2912 5411 2971 5510 2952
     42         9131 6703 9249 6663 9387 6722 9486 6703
    3743         0.000 -0.500 -0.500 0.000
    38443 2 0 1 0 7 50 -1 -1 0.000 0 0 0 4
    39          5177 2828 5295 2789 5433 2848 5531 2828
     45         9153 6579 9271 6540 9409 6599 9507 6579
    4046         0.000 -0.500 -0.500 0.000
    4147-6
    42 6 5249 3074 5625 3574
     486 9225 6825 9600 7325
    43492 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 8
    44          5249 3199 5374 3074 5625 3074 5625 3574 5249 3574 5249 3199
    45          5374 3199 5374 3074
     50         9225 6950 9350 6825 9600 6825 9600 7325 9225 7325 9225 6950
     51         9350 6950 9350 6825
    4652-6
    47 6 6766 2774 7199 3149
     536 10742 6525 11175 6900
    48542 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
    49          6825 2833 6766 3149
     55         10801 6584 10742 6900
    50562 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
    51          7180 2833 7160 2952
     57         11156 6584 11136 6703
    52583 2 0 1 0 7 50 -1 -1 0.000 0 0 0 4
    53          6805 2952 6923 2912 7061 2971 7160 2952
     59         10781 6703 10899 6663 11037 6722 11136 6703
    5460         0.000 -0.500 -0.500 0.000
    55613 2 0 1 0 7 50 -1 -1 0.000 0 0 0 4
    56          6827 2828 6945 2789 7083 2848 7181 2828
     62         10803 6579 10921 6540 11059 6599 11157 6579
    5763         0.000 -0.500 -0.500 0.000
    5864-6
    59 6 6899 3074 7274 3574
     656 10875 6825 11250 7325
    60662 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 8
    61          6899 3199 7024 3074 7274 3074 7274 3574 6899 3574 6899 3199
    62          7024 3199 7024 3074
    63 -6
    64 6 1875 1500 2331 2025
    65 5 1 0 1 0 7 50 -1 -1 0.000 0 0 0 0 2104.000 1660.011 2058 1660 2103 1614 2148 1660
    66 5 1 0 1 0 7 50 -1 -1 0.000 0 0 0 0 2104.000 1661.000 1966 1660 2103 1523 2240 1660
    67 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 8
    68          1966 1660 1966 1751 1875 1751 1875 2025 2331 2025 2331 1751
    69          2240 1751 2240 1660
    70 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 4
    71          2058 1660 2058 1751 2148 1751 2148 1660
     67         10875 6950 11000 6825 11250 6825 11250 7325 10875 7325 10875 6950
     68         11000 6950 11000 6825
    7269-6
    73702 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
    74          1800 2400 2699 2399
     71         5850 6150 6675 6150
     722 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
     73         5850 5250 6675 5250 6675 6600 5850 6600 5850 5250
     742 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
     75        1 1 1.00 60.00 120.00
     76        7 0 1.00 60.00 60.00
     77         7725 6150 7725 6525
     782 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
     79        1 1 1.00 60.00 120.00
     80        7 0 1.00 60.00 60.00
     81         9375 6150 9375 6525
     822 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
     83        1 1 1.00 60.00 120.00
     84        7 0 1.00 60.00 60.00
     85         11025 6150 11025 6525
     862 3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7
     87         10500 5854 10763 6308 11288 6308 11550 5854 11288 5400 10763 5400
     88         10500 5854
     892 3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7
     90         8850 5854 9113 6308 9638 6308 9900 5854 9638 5400 9113 5400
     91         8850 5854
     922 3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7
     93         7200 5854 7463 6308 7988 6308 8250 5854 7988 5400 7463 5400
     94         7200 5854
    75952 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
    7696        1 1 1.00 60.00 120.00
    7797        7 1 1.00 60.00 60.00
    78          3749 2399 3749 2774
     98         6450 5925 7275 5925
    79992 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
    80100        1 1 1.00 60.00 120.00
    81101        7 1 1.00 60.00 60.00
    82          5399 2399 5399 2774
     102         8025 5925 8925 5925
    831032 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
    84104        1 1 1.00 60.00 120.00
    85105        7 1 1.00 60.00 60.00
    86          2550 2175 3299 2174
     106         9675 5925 10575 5925
    871072 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
    88108        1 1 1.00 60.00 120.00
    89109        7 1 1.00 60.00 60.00
    90          4049 2174 4949 2174
     110         10725 5775 9825 5775
    911112 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
    92112        1 1 1.00 60.00 120.00
    93113        7 1 1.00 60.00 60.00
    94          5699 2174 6599 2174
    95 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
     114         9075 5775 8175 5775
     1153 2 0 1 0 7 50 -1 -1 0.000 0 1 1 4
    96116        1 1 1.00 60.00 120.00
    97117        7 1 1.00 60.00 60.00
    98          6749 2024 5849 2024
    99 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
    100         1 1 1.00 60.00 120.00
    101         7 1 1.00 60.00 60.00
    102          5099 2024 4199 2024
    103 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
    104          1800 1499 2699 1499 2699 2850 1800 2850 1800 1499
    105 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
    106          4950 1650 5850 1650 5850 2550 4950 2550 4950 1650
    107 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
    108          3300 1650 4200 1650 4200 2550 3300 2550 3300 1650
    109 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
    110          6600 1650 7500 1650 7500 2550 6600 2550 6600 1650
    111 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
    112         1 1 1.00 60.00 120.00
    113         7 1 1.00 60.00 60.00
    114          7049 2399 7049 2774
    115 4 0 0 50 -1 0 11 0.0000 2 120 525 1799 3149 Atomic\001
    116 4 0 0 50 -1 0 11 0.0000 2 120 510 1799 3374 Pointer\001
    117 4 0 0 50 -1 0 11 0.0000 2 180 765 3974 2924 Benaphore\001
    118 4 0 0 50 -1 0 11 0.0000 2 120 690 4049 3374 Event FD\001
    119 4 0 0 50 -1 0 11 0.0000 2 180 765 5625 2924 Benaphore\001
    120 4 0 0 50 -1 0 11 0.0000 2 120 690 5699 3374 Event FD\001
    121 4 0 0 50 -1 0 11 0.0000 2 180 765 7274 2924 Benaphore\001
    122 4 0 0 50 -1 0 11 0.0000 2 120 690 7349 3374 Event FD\001
    123 4 2 0 50 -1 0 11 0.0000 2 135 585 1725 1800 Idle List\001
    124 4 2 0 50 -1 0 11 0.0000 2 135 360 1725 1950 Lock\001
    125 4 1 0 50 -1 0 11 0.0000 2 135 585 2250 1425 Idle List\001
    126 4 1 0 50 -1 0 11 0.0000 2 135 1020 3750 1575 Idle Processor\001
    127 4 1 0 50 -1 0 11 0.0000 2 135 1020 5400 1575 Idle Processor\001
    128 4 1 0 50 -1 0 11 0.0000 2 135 1020 7050 1575 Idle Processor\001
     118         6300 6375 6375 6825 6750 7050 7350 6975
     119         0.000 -0.500 -0.500 0.000
     1204 0 0 50 -1 0 11 0.0000 2 135 810 5925 5175 Idle List\001
     1214 0 0 50 -1 0 11 0.0000 2 135 810 5175 5550 Idle List\001
     1224 0 0 50 -1 0 11 0.0000 2 135 360 5325 5700 Lock\001
     1234 0 0 50 -1 0 11 0.0000 2 135 540 5775 6900 Atomic\001
     1244 0 0 50 -1 0 11 0.0000 2 135 630 5775 7125 Pointer\001
     1254 0 0 50 -1 0 11 0.0000 2 165 810 7950 6675 Benaphore\001
     1264 0 0 50 -1 0 11 0.0000 2 135 720 8025 7125 Event FD\001
     1274 0 0 50 -1 0 11 0.0000 2 135 1260 7275 5325 Idle Processor\001
     1284 0 0 50 -1 0 11 0.0000 2 165 810 9600 6675 Benaphore\001
     1294 0 0 50 -1 0 11 0.0000 2 135 720 9675 7125 Event FD\001
     1304 0 0 50 -1 0 11 0.0000 2 135 1260 8925 5325 Idle Processor\001
     1314 0 0 50 -1 0 11 0.0000 2 165 810 11250 6675 Benaphore\001
     1324 0 0 50 -1 0 11 0.0000 2 135 720 11325 7125 Event FD\001
     1334 0 0 50 -1 0 11 0.0000 2 135 1260 10575 5325 Idle Processor\001
  • doc/theses/thierry_delisle_PhD/thesis/fig/idle1.fig

    rdef751f r4e2befe3  
    88-2
    991200 2
    10 6 1875 1500 2331 2025
    11 5 1 0 1 0 7 50 -1 -1 0.000 0 0 0 0 2104.000 1660.011 2058 1660 2103 1614 2148 1660
    12 5 1 0 1 0 7 50 -1 -1 0.000 0 0 0 0 2104.000 1661.000 1966 1660 2103 1523 2240 1660
     106 5919 5250 6375 5775
     115 1 0 1 0 7 50 -1 -1 0.000 0 0 0 0 6147.000 5409.011 6102 5410 6147 5364 6192 5410
     125 1 0 1 0 7 50 -1 -1 0.000 0 0 0 0 6147.000 5410.000 6010 5410 6147 5273 6284 5410
    13132 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 8
    14          1966 1660 1966 1751 1875 1751 1875 2025 2331 2025 2331 1751
    15          2240 1751 2240 1660
     14         6010 5410 6010 5501 5919 5501 5919 5775 6375 5775 6375 5501
     15         6284 5501 6284 5410
    16162 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 4
    17          2058 1660 2058 1751 2148 1751 2148 1660
     17         6102 5410 6102 5501 6192 5501 6192 5410
    1818-6
    19 6 3599 2774 3974 3274
     196 7575 6525 7950 7025
    20202 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 8
    21          3599 2899 3724 2774 3974 2774 3974 3274 3599 3274 3599 2899
    22          3724 2899 3724 2774
     21         7575 6650 7700 6525 7950 6525 7950 7025 7575 7025 7575 6650
     22         7700 6650 7700 6525
    2323-6
    24 6 5249 2774 5625 3274
     246 9225 6525 9600 7025
    25252 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 8
    26          5249 2899 5374 2774 5625 2774 5625 3274 5249 3274 5249 2899
    27          5374 2899 5374 2774
     26         9225 6650 9350 6525 9600 6525 9600 7025 9225 7025 9225 6650
     27         9350 6650 9350 6525
    2828-6
    29 6 6899 2774 7274 3274
     296 10875 6525 11250 7025
    30302 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 8
    31          6899 2899 7024 2774 7274 2774 7274 3274 6899 3274 6899 2899
    32          7024 2899 7024 2774
     31         10875 6650 11000 6525 11250 6525 11250 7025 10875 7025 10875 6650
     32         11000 6650 11000 6525
    3333-6
    34342 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
    3535        1 1 1.00 60.00 120.00
    36         7 1 1.00 60.00 60.00
    37          3749 2399 3749 2774
     36        7 0 1.00 60.00 60.00
     37         7725 6150 7725 6525
     382 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
     39        1 1 1.00 60.00 120.00
     40        7 0 1.00 60.00 60.00
     41         9375 6150 9375 6525
     422 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
     43        1 1 1.00 60.00 120.00
     44        7 0 1.00 60.00 60.00
     45         11025 6150 11025 6525
     462 3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7
     47         10500 5854 10763 6308 11288 6308 11550 5854 11288 5400 10763 5400
     48         10500 5854
     492 3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7
     50         8850 5854 9113 6308 9638 6308 9900 5854 9638 5400 9113 5400
     51         8850 5854
     522 3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7
     53         7200 5854 7463 6308 7988 6308 8250 5854 7988 5400 7463 5400
     54         7200 5854
    38552 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
    3956        1 1 1.00 60.00 120.00
    4057        7 1 1.00 60.00 60.00
    41          5399 2399 5399 2774
     58         6450 5925 7275 5925
    42592 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
    4360        1 1 1.00 60.00 120.00
    4461        7 1 1.00 60.00 60.00
    45          7049 2399 7049 2774
     62         8025 5925 8925 5925
    46632 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
    4764        1 1 1.00 60.00 120.00
    4865        7 1 1.00 60.00 60.00
    49          2550 2175 3299 2174
     66         9675 5925 10575 5925
    50672 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
    5168        1 1 1.00 60.00 120.00
    5269        7 1 1.00 60.00 60.00
    53          4049 2174 4949 2174
     70         10725 5775 9825 5775
    54712 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
    5572        1 1 1.00 60.00 120.00
    5673        7 1 1.00 60.00 60.00
    57          5699 2174 6599 2174
    58 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
    59         1 1 1.00 60.00 120.00
    60         7 1 1.00 60.00 60.00
    61          6749 2024 5849 2024
    62 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
    63         1 1 1.00 60.00 120.00
    64         7 1 1.00 60.00 60.00
    65          5099 2024 4199 2024
     74         9075 5775 8175 5775
    66752 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
    67          4950 1650 5850 1650 5850 2550 4950 2550 4950 1650
    68 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
    69          3300 1650 4200 1650 4200 2550 3300 2550 3300 1650
    70 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
    71          6600 1650 7500 1650 7500 2550 6600 2550 6600 1650
    72 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
    73          1800 1499 2699 1499 2699 2400 1800 2400 1800 1499
    74 4 2 0 50 -1 0 11 0.0000 2 135 585 1725 1800 Idle List\001
    75 4 2 0 50 -1 0 11 0.0000 2 135 360 1725 1950 Lock\001
    76 4 1 0 50 -1 0 11 0.0000 2 135 585 2250 1425 Idle List\001
    77 4 1 0 50 -1 0 11 0.0000 2 135 1020 3750 1575 Idle Processor\001
    78 4 1 0 50 -1 0 11 0.0000 2 135 1020 5400 1575 Idle Processor\001
    79 4 1 0 50 -1 0 11 0.0000 2 135 1020 7050 1575 Idle Processor\001
    80 4 0 0 50 -1 0 11 0.0000 2 120 690 4049 3074 Event FD\001
    81 4 0 0 50 -1 0 11 0.0000 2 120 690 5699 3074 Event FD\001
    82 4 0 0 50 -1 0 11 0.0000 2 120 690 7349 3074 Event FD\001
     76         5850 5250 6675 5250 6675 6075 5850 6075 5850 5250
     774 0 0 50 -1 0 11 0.0000 2 135 810 5925 5175 Idle List\001
     784 0 0 50 -1 0 11 0.0000 2 135 810 5175 5550 Idle List\001
     794 0 0 50 -1 0 11 0.0000 2 135 360 5325 5700 Lock\001
     804 0 0 50 -1 0 11 0.0000 2 135 1260 7275 5325 Idle Processor\001
     814 0 0 50 -1 0 11 0.0000 2 135 1260 8925 5325 Idle Processor\001
     824 0 0 50 -1 0 11 0.0000 2 135 1260 10575 5325 Idle Processor\001
     834 0 0 50 -1 0 11 0.0000 2 135 720 8025 6825 Event FD\001
     844 0 0 50 -1 0 11 0.0000 2 135 720 9675 6825 Event FD\001
     854 0 0 50 -1 0 11 0.0000 2 135 720 11325 6825 Event FD\001
  • doc/theses/thierry_delisle_PhD/thesis/fig/idle2.fig

    rdef751f r4e2befe3  
    88-2
    991200 2
    10 5 1 0 1 0 7 50 -1 -1 0.000 0 1 1 1 3150.000 2106.250 2250 2625 2775 3075 3525 3075
    11         1 1 1.00 60.00 120.00
    12         7 1 1.00 60.00 60.00
    13 6 1875 1500 2331 2025
    14 5 1 0 1 0 7 50 -1 -1 0.000 0 0 0 0 2104.000 1660.011 2058 1660 2103 1614 2148 1660
    15 5 1 0 1 0 7 50 -1 -1 0.000 0 0 0 0 2104.000 1661.000 1966 1660 2103 1523 2240 1660
     106 5919 5250 6375 5775
     115 1 0 1 0 7 50 -1 -1 0.000 0 0 0 0 6147.000 5409.011 6102 5410 6147 5364 6192 5410
     125 1 0 1 0 7 50 -1 -1 0.000 0 0 0 0 6147.000 5410.000 6010 5410 6147 5273 6284 5410
    16132 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 8
    17          1966 1660 1966 1751 1875 1751 1875 2025 2331 2025 2331 1751
    18          2240 1751 2240 1660
     14         6010 5410 6010 5501 5919 5501 5919 5775 6375 5775 6375 5501
     15         6284 5501 6284 5410
    19162 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 4
    20          2058 1660 2058 1751 2148 1751 2148 1660
     17         6102 5410 6102 5501 6192 5501 6192 5410
    2118-6
    22 6 3599 2774 3974 3274
     196 7575 6525 7950 7025
    23202 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 8
    24          3599 2899 3724 2774 3974 2774 3974 3274 3599 3274 3599 2899
    25          3724 2899 3724 2774
     21         7575 6650 7700 6525 7950 6525 7950 7025 7575 7025 7575 6650
     22         7700 6650 7700 6525
    2623-6
    27 6 5249 2774 5625 3274
     246 9225 6525 9600 7025
    28252 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 8
    29          5249 2899 5374 2774 5625 2774 5625 3274 5249 3274 5249 2899
    30          5374 2899 5374 2774
     26         9225 6650 9350 6525 9600 6525 9600 7025 9225 7025 9225 6650
     27         9350 6650 9350 6525
    3128-6
    32 6 6899 2774 7274 3274
     296 10875 6525 11250 7025
    33302 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 8
    34          6899 2899 7024 2774 7274 2774 7274 3274 6899 3274 6899 2899
    35          7024 2899 7024 2774
     31         10875 6650 11000 6525 11250 6525 11250 7025 10875 7025 10875 6650
     32         11000 6650 11000 6525
    3633-6
    37342 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
    38          1800 2400 2699 2399
     35         5850 6150 6675 6150
     362 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
     37         5850 5250 6675 5250 6675 6600 5850 6600 5850 5250
     382 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
     39        1 1 1.00 60.00 120.00
     40        7 0 1.00 60.00 60.00
     41         7725 6150 7725 6525
     422 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
     43        1 1 1.00 60.00 120.00
     44        7 0 1.00 60.00 60.00
     45         9375 6150 9375 6525
     462 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
     47        1 1 1.00 60.00 120.00
     48        7 0 1.00 60.00 60.00
     49         11025 6150 11025 6525
     502 3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7
     51         10500 5854 10763 6308 11288 6308 11550 5854 11288 5400 10763 5400
     52         10500 5854
     532 3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7
     54         8850 5854 9113 6308 9638 6308 9900 5854 9638 5400 9113 5400
     55         8850 5854
     562 3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7
     57         7200 5854 7463 6308 7988 6308 8250 5854 7988 5400 7463 5400
     58         7200 5854
    39592 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
    4060        1 1 1.00 60.00 120.00
    4161        7 1 1.00 60.00 60.00
    42          3749 2399 3749 2774
     62         6450 5925 7275 5925
    43632 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
    4464        1 1 1.00 60.00 120.00
    4565        7 1 1.00 60.00 60.00
    46          5399 2399 5399 2774
     66         8025 5925 8925 5925
    47672 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
    4868        1 1 1.00 60.00 120.00
    4969        7 1 1.00 60.00 60.00
    50          7049 2399 7049 2774
     70         9675 5925 10575 5925
    51712 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
    5272        1 1 1.00 60.00 120.00
    5373        7 1 1.00 60.00 60.00
    54          2550 2175 3299 2174
     74         10725 5775 9825 5775
    55752 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
    5676        1 1 1.00 60.00 120.00
    5777        7 1 1.00 60.00 60.00
    58          4049 2174 4949 2174
    59 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
     78         9075 5775 8175 5775
     793 2 0 1 0 7 50 -1 -1 0.000 0 1 1 4
    6080        1 1 1.00 60.00 120.00
    6181        7 1 1.00 60.00 60.00
    62          5699 2174 6599 2174
    63 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
    64         1 1 1.00 60.00 120.00
    65         7 1 1.00 60.00 60.00
    66          6749 2024 5849 2024
    67 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
    68         1 1 1.00 60.00 120.00
    69         7 1 1.00 60.00 60.00
    70          5099 2024 4199 2024
    71 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
    72          1800 1499 2699 1499 2699 2850 1800 2850 1800 1499
    73 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
    74          4950 1650 5850 1650 5850 2550 4950 2550 4950 1650
    75 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
    76          3300 1650 4200 1650 4200 2550 3300 2550 3300 1650
    77 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
    78          6600 1650 7500 1650 7500 2550 6600 2550 6600 1650
    79 4 0 0 50 -1 0 11 0.0000 2 120 525 1799 3149 Atomic\001
    80 4 0 0 50 -1 0 11 0.0000 2 120 510 1799 3374 Pointer\001
    81 4 2 0 50 -1 0 11 0.0000 2 135 585 1725 1800 Idle List\001
    82 4 2 0 50 -1 0 11 0.0000 2 135 360 1725 1950 Lock\001
    83 4 1 0 50 -1 0 11 0.0000 2 135 585 2250 1425 Idle List\001
    84 4 1 0 50 -1 0 11 0.0000 2 135 1020 3750 1575 Idle Processor\001
    85 4 1 0 50 -1 0 11 0.0000 2 135 1020 5400 1575 Idle Processor\001
    86 4 1 0 50 -1 0 11 0.0000 2 135 1020 7050 1575 Idle Processor\001
    87 4 0 0 50 -1 0 11 0.0000 2 120 690 4049 3074 Event FD\001
    88 4 0 0 50 -1 0 11 0.0000 2 120 690 5699 3074 Event FD\001
    89 4 0 0 50 -1 0 11 0.0000 2 120 690 7349 3074 Event FD\001
     82         6300 6375 6375 6825 6900 6975 7500 6750
     83         0.000 -0.500 -0.500 0.000
     844 0 0 50 -1 0 11 0.0000 2 135 810 5925 5175 Idle List\001
     854 0 0 50 -1 0 11 0.0000 2 135 810 5175 5550 Idle List\001
     864 0 0 50 -1 0 11 0.0000 2 135 360 5325 5700 Lock\001
     874 0 0 50 -1 0 11 0.0000 2 135 540 5775 6900 Atomic\001
     884 0 0 50 -1 0 11 0.0000 2 135 630 5775 7125 Pointer\001
     894 0 0 50 -1 0 11 0.0000 2 135 1260 7275 5325 Idle Processor\001
     904 0 0 50 -1 0 11 0.0000 2 135 1260 8925 5325 Idle Processor\001
     914 0 0 50 -1 0 11 0.0000 2 135 1260 10575 5325 Idle Processor\001
     924 0 0 50 -1 0 11 0.0000 2 135 720 8025 6825 Event FD\001
     934 0 0 50 -1 0 11 0.0000 2 135 720 9675 6825 Event FD\001
     944 0 0 50 -1 0 11 0.0000 2 135 720 11325 6825 Event FD\001
  • doc/theses/thierry_delisle_PhD/thesis/fig/idle_state.fig

    rdef751f r4e2befe3  
    88-2
    991200 2
    10 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 3000 3600 600 600 3000 3600 2400 3600
    11 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 1800 1800 600 600 1800 1800 1200 1800
    12 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 4205 1800 600 600 4205 1800 3605 1800
     101 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 3900 3600 571 571 3900 3600 3375 3375
     111 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 6300 3600 605 605 6300 3600 5775 3300
     121 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 5100 5400 600 600 5100 5400 4500 5400
    13132 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
    14         1 1 1.00 60.00 120.00
    15          2100 2325 2625 3150
     14        0 0 1.00 60.00 120.00
     15         4200 4125 4725 4950
    16162 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
    17         1 1 1.00 60.00 120.00
    18          2400 1800 3600 1800
     17        0 0 1.00 60.00 120.00
     18         4500 3600 5700 3600
    19192 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
    20         1 1 1.00 60.00 120.00
    21          3900 2325 3375 3150
    22 4 1 0 50 -1 0 11 0.0000 2 120 675 3000 3675 AWAKE\001
    23 4 1 0 50 -1 0 11 0.0000 2 120 525 4200 1875 SLEEP\001
    24 4 1 0 50 -1 0 11 0.0000 2 120 720 1800 1875 SEARCH\001
    25 4 2 0 50 -1 0 11 0.0000 2 120 720 2250 2850 CANCEL\001
    26 4 1 0 50 -1 0 11 0.0000 2 120 840 2925 1650 CONFIRM\001
    27 4 0 0 50 -1 0 11 0.0000 2 120 540 3750 2850 WAKE\001
     20        0 0 1.00 60.00 120.00
     21         5923 4125 5475 4875
     224 1 0 50 -1 0 11 0.0000 2 135 450 5100 5475 AWAKE\001
     234 1 0 50 -1 0 11 0.0000 2 135 450 6300 3675 SLEEP\001
     244 1 0 50 -1 0 11 0.0000 2 135 540 3900 3675 SEARCH\001
     254 0 0 50 -1 0 11 0.0000 2 135 360 5775 4650 WAKE\001
     264 2 0 50 -1 0 11 0.0000 2 135 540 4350 4650 CANCEL\001
     274 1 0 50 -1 0 11 0.0000 2 135 630 5025 3450 CONFIRM\001
  • doc/theses/thierry_delisle_PhD/thesis/fig/io_uring.fig

    rdef751f r4e2befe3  
    88-2
    991200 2
    10 6 675 3105 2520 3375
     106 180 3240 2025 3510
    11112 1 0 1 0 7 40 -1 -1 0.000 0 0 -1 0 0 2
    12          1215 3105 1215 3375
     12         720 3240 720 3510
    13132 1 0 1 0 7 40 -1 -1 0.000 0 0 -1 0 0 2
    14          945 3105 945 3375
     14         450 3240 450 3510
    15152 2 0 1 0 7 45 -1 20 0.000 0 0 -1 0 0 5
    16          675 3105 1755 3105 1755 3375 675 3375 675 3105
     16         180 3240 1260 3240 1260 3510 180 3510 180 3240
    17172 1 0 1 0 7 40 -1 -1 0.000 0 0 -1 0 0 2
    18          1485 3105 1485 3375
    19 4 0 0 40 -1 0 12 0.0000 2 165 930 1530 3285 {\\small S3}\001
    20 4 0 0 40 -1 0 12 0.0000 2 165 930 1260 3285 {\\small S2}\001
    21 4 0 0 40 -1 0 12 0.0000 2 165 930 720 3285 {\\small S0}\001
    22 4 0 0 40 -1 0 12 0.0000 2 165 930 990 3285 {\\small S1}\001
     18         990 3240 990 3510
     194 0 0 40 -1 0 12 0.0000 2 165 990 1035 3420 {\\small S3}\001
     204 0 0 40 -1 0 12 0.0000 2 165 990 765 3420 {\\small S2}\001
     214 0 0 40 -1 0 12 0.0000 2 165 990 225 3420 {\\small S0}\001
     224 0 0 40 -1 0 12 0.0000 2 165 990 495 3420 {\\small S1}\001
    2323-6
    24 6 2025 2475 3735 4005
    25 5 1 0 1 0 7 35 -1 -1 0.000 0 1 1 0 2950.714 3240.000 2385 2565 2070 3240 2385 3915
     246 1530 2610 3240 4140
     255 1 0 1 0 7 35 -1 -1 0.000 0 1 1 0 2455.714 3375.000 1890 2700 1575 3375 1890 4050
    2626        1 1 1.00 60.00 120.00
    27 1 3 0 1 0 7 40 -1 20 0.000 1 0.0000 2970 3240 315 315 2970 3240 3285 3240
    28 1 3 0 1 0 7 50 -1 20 0.000 1 0.0000 2970 3240 765 765 2970 3240 3735 3240
     271 3 0 1 0 7 40 -1 20 0.000 1 0.0000 2475 3375 315 315 2475 3375 2790 3375
     281 3 0 1 0 7 50 -1 20 0.000 1 0.0000 2475 3375 765 765 2475 3375 3240 3375
    29292 1 0 1 0 7 45 -1 -1 0.000 0 0 -1 0 0 2
    30          2970 3240 2628 2555
     30         2475 3375 2133 2690
    31312 1 0 1 0 7 45 -1 -1 4.000 0 0 -1 0 0 2
    32          2970 3240 2264 2958
     32         2475 3375 1769 3093
    33332 1 0 1 0 7 45 -1 -1 4.000 0 0 -1 0 0 2
    34          2970 3240 2264 3526
     34         2475 3375 1769 3661
    35352 1 0 1 0 7 45 -1 -1 4.000 0 0 -1 0 0 2
    36          2970 3240 2628 3922
     36         2475 3375 2133 4057
    37372 1 1 1 0 7 35 -1 0 4.000 0 0 -1 0 0 2
    38          2700 3240 3240 3240
     38         2205 3375 2745 3375
    3939-6
    40 6 1080 2115 1980 2475
    41 4 2 0 50 -1 0 12 0.0000 2 135 945 1980 2250 Submission\001
    42 4 2 0 50 -1 0 12 0.0000 2 180 405 1980 2445 Ring\001
     406 585 2250 1485 2610
     414 2 0 50 -1 0 12 0.0000 2 135 900 1485 2385 Submission\001
     424 2 0 50 -1 0 12 0.0000 2 165 360 1485 2580 Ring\001
    4343-6
    44 6 4095 2475 5760 4005
    45 5 1 0 1 0 7 35 -1 -1 0.000 0 1 1 0 4879.000 3240.000 5445 3915 5760 3240 5445 2565
     446 3600 2610 5265 4140
     455 1 0 1 0 7 35 -1 -1 0.000 0 1 1 0 4384.000 3375.000 4950 4050 5265 3375 4950 2700
    4646        1 1 1.00 60.00 120.00
    47 1 3 0 1 0 7 40 -1 20 0.000 1 3.1416 4860 3240 315 315 4860 3240 4545 3240
    48 1 3 0 1 0 7 50 -1 20 0.000 1 3.1416 4860 3240 765 765 4860 3240 4095 3240
     471 3 0 1 0 7 40 -1 20 0.000 1 3.1416 4365 3375 315 315 4365 3375 4050 3375
     481 3 0 1 0 7 50 -1 20 0.000 1 3.1416 4365 3375 765 765 4365 3375 3600 3375
    49492 1 0 1 0 7 45 -1 -1 0.000 0 0 -1 0 0 2
    50          4860 3240 5202 3925
     50         4365 3375 4707 4060
    51512 1 0 1 0 7 45 -1 -1 4.000 0 0 -1 0 0 2
    52          4860 3240 5566 3522
     52         4365 3375 5071 3657
    53532 1 0 1 0 7 45 -1 -1 4.000 0 0 -1 0 0 2
    54          4860 3240 5566 2954
     54         4365 3375 5071 3089
    55552 1 0 1 0 7 45 -1 -1 4.000 0 0 -1 0 0 2
    56          4860 3240 5202 2558
     56         4365 3375 4707 2693
    57572 1 1 1 0 7 35 -1 0 4.000 0 0 -1 0 0 2
    58          5130 3240 4590 3240
     58         4635 3375 4095 3375
    5959-6
    60 6 5850 2115 6750 2475
    61 4 0 0 50 -1 0 12 0.0000 2 180 405 5850 2445 Ring\001
    62 4 0 0 50 -1 0 12 0.0000 2 180 975 5850 2250 Completion\001
     606 5355 2250 6255 2610
     614 0 0 50 -1 0 12 0.0000 2 165 360 5355 2580 Ring\001
     624 0 0 50 -1 0 12 0.0000 2 165 900 5355 2385 Completion\001
    6363-6
    64642 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
    6565        1 1 1.00 60.00 120.00
    66          3420 1890 3045 2351
     66         2925 2025 2550 2486
    67672 1 0 1 0 7 50 -1 -1 4.000 0 0 -1 1 0 2
    6868        1 1 1.00 60.00 120.00
    69          4770 2340 4320 1890
     69         4275 2475 3825 2025
    70702 1 0 1 0 7 50 -1 -1 4.000 0 0 -1 1 0 2
    7171        1 1 1.00 60.00 120.00
    72          3060 4095 3600 4410
     72         2751 4268 3066 4538
    73732 1 0 1 0 7 50 -1 -1 4.000 0 0 -1 1 0 2
    7474        1 1 1.00 60.00 120.00
    75          4275 4410 4770 4095
     75         3780 4545 4275 4230
    76762 1 1 1 0 7 55 -1 -1 4.000 0 0 -1 0 0 2
    77          495 3240 6750 3240
    78 4 0 0 35 -1 0 12 0.0000 2 165 1140 2340 2925 {\\small \\&S2}\001
    79 4 0 0 50 -1 0 12 0.0000 6 135 390 3285 2430 Push\001
    80 4 0 0 50 -1 0 12 0.0000 6 135 330 2520 2430 Tail\001
    81 4 0 0 35 -1 0 12 0.0000 2 165 960 5130 2925 {\\small C0}\001
    82 4 0 0 35 -1 0 12 0.0000 2 165 960 5310 3285 {\\small C1}\001
    83 4 0 0 35 -1 0 12 0.0000 2 165 960 5130 3645 {\\small C2}\001
    84 4 0 0 50 -1 0 12 0.0000 4 135 330 5220 4140 Tail\001
    85 4 0 0 50 -1 0 12 0.0000 6 135 420 5085 2430 Head\001
    86 4 0 0 50 -1 0 12 0.0000 2 135 960 6030 3150 Kernel Line\001
    87 4 0 0 50 -1 0 12 0.0000 2 135 105 495 3150 S\001
    88 4 0 0 35 -1 0 12 0.0000 2 165 1140 2385 3645 {\\small \\&S0}\001
    89 4 0 0 50 -1 0 12 0.0000 6 135 420 2340 4140 Head\001
    90 4 0 0 35 -1 0 12 0.0000 2 165 1140 2250 3285 {\\small \\&S3}\001
    91 4 2 0 50 -1 0 12 0.0000 4 135 390 4500 4140 Push\001
    92 4 1 0 50 -1 0 12 0.0000 2 180 1290 3915 4680 {\\Large Kernel}\001
    93 4 0 0 50 -1 0 12 0.0000 6 180 315 3285 4140 Pop\001
    94 4 1 0 50 -1 0 12 0.0000 2 180 1725 3915 1755 {\\Large Application}\001
    95 4 2 0 50 -1 0 12 0.0000 6 180 315 4545 2430 Pop\001
     77         0 3375 6255 3375
     784 0 0 35 -1 0 12 0.0000 2 165 1170 1845 3060 {\\small \\&S2}\001
     794 0 0 35 -1 0 12 0.0000 2 165 1170 1755 3420 {\\small \\&S3}\001
     804 0 0 35 -1 0 12 0.0000 2 165 1170 1890 3735 {\\small \\&S0}\001
     814 0 0 50 -1 0 12 0.0000 6 135 360 2790 2565 Push\001
     824 0 0 50 -1 0 12 0.0000 6 165 270 2880 4230 Pop\001
     834 0 0 50 -1 0 12 0.0000 6 135 360 2025 4275 Head\001
     844 0 0 50 -1 0 12 0.0000 6 135 360 2025 2565 Tail\001
     854 0 0 35 -1 0 12 0.0000 2 165 990 4635 3060 {\\small C0}\001
     864 0 0 35 -1 0 12 0.0000 2 165 990 4815 3420 {\\small C1}\001
     874 0 0 35 -1 0 12 0.0000 2 165 990 4635 3780 {\\small C2}\001
     884 0 0 50 -1 0 12 0.0000 4 135 360 4725 4275 Tail\001
     894 0 0 50 -1 0 12 0.0000 6 135 360 4590 2565 Head\001
     904 0 0 50 -1 0 12 0.0000 2 135 990 5535 3285 Kernel Line\001
     914 1 0 50 -1 0 12 0.0000 2 180 1350 3375 4815 {\\Large Kernel}\001
     924 1 0 50 -1 0 12 0.0000 2 180 1800 3375 1845 {\\Large Application}\001
     934 0 0 50 -1 0 12 0.0000 6 165 270 3690 2565 Pop\001
     944 0 0 50 -1 0 12 0.0000 4 135 360 3465 4230 Push\001
     954 0 0 50 -1 0 12 0.0000 2 135 90 0 3285 S\001
  • doc/theses/thierry_delisle_PhD/thesis/local.bib

    rdef751f r4e2befe3  
    22% Cforall
    33@misc{cfa:frontpage,
    4   howpublished = {\href{https://cforall.uwaterloo.ca}{https://\-cforall.uwaterloo.ca}}
     4  url = {https://cforall.uwaterloo.ca/}
    55}
    66@article{cfa:typesystem,
     
    481481@misc{MAN:linux/cfs,
    482482  title = {{CFS} Scheduler - The Linux Kernel documentation},
    483   howpublished = {\href{https://www.kernel.org/doc/html/latest/scheduler/sched-design-CFS.html}{https://\-www.kernel.org/\-doc/\-html/\-latest/\-scheduler/\-sched-design-CFS.html}}
     483  url = {https://www.kernel.org/doc/html/latest/scheduler/sched-design-CFS.html}
    484484}
    485485
     
    489489  year = {2019},
    490490  month = {February},
    491   howpublished = {\href{https://opensource.com/article/19/2/fair-scheduling-linux}{https://\-opensource.com/\-article/\-19/2\-/\-fair-scheduling-linux}}
     491  url = {https://opensource.com/article/19/2/fair-scheduling-linux}
    492492}
    493493
     
    499499}
    500500
    501 @misc{MAN:linux/cfs/balancing,
     501@article{MAN:linux/cfs/balancing,
    502502  title={Reworking {CFS} load balancing},
    503   journal={LWN article},
    504   year={2019},
    505   howpublished = {\href{https://lwn.net/Articles/793427}{https://\-lwn.net/\-Articles/\-793427}},
     503  journal={LWN article, available at: https://lwn.net/Articles/793427/},
     504  year={2013}
    506505}
    507506
     
    524523  title = {Mach Scheduling and Thread Interfaces - Kernel Programming Guide},
    525524  organization = {Apple Inc.},
    526   howPublish = {\href{https://developer.apple.com/library/archive/documentation/Darwin/Conceptual/KernelProgramming/scheduler/scheduler.html}{https://developer.apple.com/library/archive/documentation/Darwin/Conceptual/KernelProgramming/scheduler/scheduler.html}}
     525  url = {https://developer.apple.com/library/archive/documentation/Darwin/Conceptual/KernelProgramming/scheduler/scheduler.html}
    527526}
    528527
     
    537536  month = {June},
    538537  series = {Developer Reference},
    539   howpublished = {\href{https://www.microsoftpressstore.com/articles/article.aspx?p=2233328&seqNum=7#:~:text=Overview\%20of\%20Windows\%20Scheduling,a\%20phenomenon\%20called\%20processor\%20affinity}{https://\-www.microsoftpressstore.com/\-articles/\-article.aspx?p=2233328&seqNum=7#:~:text=Overview\%20of\%20Windows\%20Scheduling,a\%20phenomenon\%20called\%20processor\%20affinity}}
    540 }
    541 
    542 @misc{GITHUB:go,
     538  url = {https://www.microsoftpressstore.com/articles/article.aspx?p=2233328&seqNum=7#:~:text=Overview\%20of\%20Windows\%20Scheduling,a\%20phenomenon\%20called\%20processor\%20affinity}
     539}
     540
     541@online{GITHUB:go,
    543542  title = {GitHub - The Go Programming Language},
    544543  author = {The Go Programming Language},
    545   howpublished = {\href{https://github.com/golang/go}{https://\-github.com/\-golang/\-go}},
     544  url = {https://github.com/golang/go},
    546545  version = {Change-Id: If07f40b1d73b8f276ee28ffb8b7214175e56c24d}
    547546}
     
    552551  year = {2019},
    553552  booktitle = {Hydra},
    554   howpublished = {\href{https://www.youtube.com/watch?v=-K11rY57K7k&ab_channel=Hydra}{https://\-www.youtube.com/\-watch?v=-K11rY57K7k&ab_channel=Hydra}}
     553  url = {https://www.youtube.com/watch?v=-K11rY57K7k&ab_channel=Hydra}
    555554}
    556555
     
    560559  year = {2008},
    561560  booktitle = {Erlang User Conference},
    562   howpublished = {\href{http://www.erlang.se/euc/08/euc_smp.pdf}{http://\-www.erlang.se/\-euc/\-08/\-euc_smp.pdf}}
    563 }
     561  url = {http://www.erlang.se/euc/08/euc_smp.pdf}
     562}
     563
     564
    564565
    565566@manual{MAN:tbb/scheduler,
    566567  title = {Scheduling Algorithm - Intel{\textregistered} Threading Building Blocks Developer Reference},
    567568  organization = {Intel{\textregistered}},
    568   howpublished = {\href{https://www.threadingbuildingblocks.org/docs/help/reference/task_scheduler/scheduling_algorithm.html}{https://\-www.threadingbuildingblocks.org/\-docs/\-help/\-reference/\-task\_scheduler/\-scheduling\_algorithm.html}}
     569  url = {https://www.threadingbuildingblocks.org/docs/help/reference/task_scheduler/scheduling_algorithm.html}
    569570}
    570571
     
    572573  title = {Quasar Core - Quasar User Manual},
    573574  organization = {Parallel Universe},
    574   howpublished = {\href{https://docs.paralleluniverse.co/quasar}{https://\-docs.paralleluniverse.co/\-quasar}}
     575  url = {https://docs.paralleluniverse.co/quasar/}
    575576}
    576577@misc{MAN:project-loom,
    577   howpublished = {\href{https://www.baeldung.com/openjdk-project-loom}{https://\-www.baeldung.com/\-openjdk-project-loom}}
     578  url = {https://www.baeldung.com/openjdk-project-loom}
    578579}
    579580
    580581@misc{MAN:java/fork-join,
    581   howpublished = {\href{https://www.baeldung.com/java-fork-join}{https://\-www.baeldung.com/\-java-fork-join}}
     582  url = {https://www.baeldung.com/java-fork-join}
    582583}
    583584
     
    632633  month   = "March",
    633634  version = {0,4},
    634   howpublished = {\href{https://kernel.dk/io_uring.pdf}{https://\-kernel.dk/\-io\_uring.pdf}}
     635  howpublished = {\url{https://kernel.dk/io_uring.pdf}}
    635636}
    636637
     
    641642  title = "Control theory --- {W}ikipedia{,} The Free Encyclopedia",
    642643  year = "2020",
    643   howpublished = {\href{https://en.wikipedia.org/wiki/Task_parallelism}{https://\-en.wikipedia.org/\-wiki/\-Task\_parallelism}},
     644  url = "https://en.wikipedia.org/wiki/Task_parallelism",
    644645  note = "[Online; accessed 22-October-2020]"
    645646}
     
    649650  title = "Task parallelism --- {W}ikipedia{,} The Free Encyclopedia",
    650651  year = "2020",
    651   howpublished = "\href{https://en.wikipedia.org/wiki/Control_theory}{https://\-en.wikipedia.org/\-wiki/\-Control\_theory}",
     652  url = "https://en.wikipedia.org/wiki/Control_theory",
    652653  note = "[Online; accessed 22-October-2020]"
    653654}
     
    657658  title = "Implicit parallelism --- {W}ikipedia{,} The Free Encyclopedia",
    658659  year = "2020",
    659   howpublished = "\href{https://en.wikipedia.org/wiki/Implicit_parallelism}{https://\-en.wikipedia.org/\-wiki/\-Implicit\_parallelism}",
     660  url = "https://en.wikipedia.org/wiki/Implicit_parallelism",
    660661  note = "[Online; accessed 23-October-2020]"
    661662}
     
    665666  title = "Explicit parallelism --- {W}ikipedia{,} The Free Encyclopedia",
    666667  year = "2017",
    667   howpublished = "\href{https://en.wikipedia.org/wiki/Explicit_parallelism}{https://\-en.wikipedia.org/\-wiki/\-Explicit\_parallelism}",
     668  url = "https://en.wikipedia.org/wiki/Explicit_parallelism",
    668669  note = "[Online; accessed 23-October-2020]"
    669670}
     
    673674  title = "Linear congruential generator --- {W}ikipedia{,} The Free Encyclopedia",
    674675  year = "2020",
    675   howpublished = "\href{https://en.wikipedia.org/wiki/Linear_congruential_generator}{https://en.wikipedia.org/wiki/Linear\_congruential\_generator}",
     676  url = "https://en.wikipedia.org/wiki/Linear_congruential_generator",
    676677  note = "[Online; accessed 2-January-2021]"
    677678}
     
    681682  title = "Futures and promises --- {W}ikipedia{,} The Free Encyclopedia",
    682683  year = "2020",
    683   howpublished = "\href{https://en.wikipedia.org/wiki/Futures_and_promises}{https://\-en.wikipedia.org/\-wiki/Futures\_and\_promises}",
     684  url = "https://en.wikipedia.org/wiki/Futures_and_promises",
    684685  note = "[Online; accessed 9-February-2021]"
    685686}
     
    689690  title = "Read-copy-update --- {W}ikipedia{,} The Free Encyclopedia",
    690691  year = "2022",
    691   howpublished = "\href{https://en.wikipedia.org/wiki/Linear_congruential_generator}{https://\-en.wikipedia.org/\-wiki/\-Linear\_congruential\_generator}",
     692  url = "https://en.wikipedia.org/wiki/Read-copy-update",
    692693  note = "[Online; accessed 12-April-2022]"
    693694}
     
    697698  title = "Readers-writer lock --- {W}ikipedia{,} The Free Encyclopedia",
    698699  year = "2021",
    699   howpublished = "\href{https://en.wikipedia.org/wiki/Readers-writer_lock}{https://\-en.wikipedia.org/\-wiki/\-Readers-writer\_lock}",
     700  url = "https://en.wikipedia.org/wiki/Readers%E2%80%93writer_lock",
    700701  note = "[Online; accessed 12-April-2022]"
    701702}
    702 
    703703@misc{wiki:binpak,
    704704  author = "{Wikipedia contributors}",
    705705  title = "Bin packing problem --- {W}ikipedia{,} The Free Encyclopedia",
    706706  year = "2022",
    707   howpublished = "\href{https://en.wikipedia.org/wiki/Bin_packing_problem}{https://\-en.wikipedia.org/\-wiki/\-Bin\_packing\_problem}",
     707  url = "https://en.wikipedia.org/wiki/Bin_packing_problem",
    708708  note = "[Online; accessed 29-June-2022]"
    709709}
     
    712712% [05/04, 12:36] Trevor Brown
    713713%     i don't know where rmr complexity was first introduced, but there are many many many papers that use the term and define it
    714 % [05/04, 12:37] Trevor Brown
     714% [05/04, 12:37] Trevor Brown
    715715%     here's one paper that uses the term a lot and links to many others that use it... might trace it to something useful there https://drops.dagstuhl.de/opus/volltexte/2021/14832/pdf/LIPIcs-DISC-2021-30.pdf
    716 % [05/04, 12:37] Trevor Brown
     716% [05/04, 12:37] Trevor Brown
    717717%     another option might be to cite a textbook
    718 % [05/04, 12:42] Trevor Brown
     718% [05/04, 12:42] Trevor Brown
    719719%     but i checked two textbooks in the area i'm aware of and i don't see a definition of rmr complexity in either
    720 % [05/04, 12:42] Trevor Brown
     720% [05/04, 12:42] Trevor Brown
    721721%     this one has a nice statement about the prevalence of rmr complexity, as well as some rough definition
    722 % [05/04, 12:42] Trevor Brown
     722% [05/04, 12:42] Trevor Brown
    723723%     https://dl.acm.org/doi/pdf/10.1145/3465084.3467938
    724724
     
    728728%
    729729% https://doi.org/10.1137/1.9781611973099.100
    730 
    731 
    732 @misc{AIORant,
    733   author = "Linus Torvalds",
    734   title = "Re: [PATCH 09/13] aio: add support for async openat()",
    735   year = "2016",
    736   month = jan,
    737   howpublished = "\href{https://lwn.net/Articles/671657}{https://\-lwn.net/\-Articles/671657}",
    738   note = "[Online; accessed 6-June-2022]"
    739 }
    740 
    741 @misc{apache,
    742   key = {Apache Software Foundation},
    743   title = {{T}he {A}pache Web Server},
    744   howpublished = {\href{http://httpd.apache.org}{http://\-httpd.apache.org}},
    745   note = "[Online; accessed 6-June-2022]"
    746 }
    747 
    748 @misc{SeriallyReusable,
    749     author      = {IBM},
    750     title       = {Serially reusable programs},
    751     month       = mar,
    752     howpublished= {\href{https://www.ibm.com/docs/en/ztpf/1.1.0.15?topic=structures-serially-reusable-programs}{https://www.ibm.com/\-docs/\-en/\-ztpf/\-1.1.0.15?\-topic=structures\--serially\--reusable-programs}},
    753     year        = 2021,
    754 }
    755 
    756 @inproceedings{Albers12,
    757     author      = {Susanne Albers and Antonios Antoniadis},
    758     title       = {Race to Idle: New Algorithms for Speed Scaling with a Sleep State},
    759     booktitle   = {Proceedings of the 2012  Annual ACM-SIAM Symposium on Discrete Algorithms (SODA)},
    760     doi         = {10.1137/1.9781611973099.100},
    761     URL         = {https://epubs.siam.org/doi/abs/10.1137/1.9781611973099.100},
    762     eprint      = {https://epubs.siam.org/doi/pdf/10.1137/1.9781611973099.100},
    763     year        = 2012,
    764     month       = jan,
    765     pages       = {1266-1285},
    766 }
  • doc/theses/thierry_delisle_PhD/thesis/text/core.tex

    rdef751f r4e2befe3  
    322322Building a scheduler that is cache aware poses two main challenges: discovering the cache topology and matching \procs to this cache structure.
    323323Unfortunately, there is no portable way to discover cache topology, and it is outside the scope of this thesis to solve this problem.
    324 This work uses the cache topology information from Linux's @/sys/devices/system/cpu@ directory.
     324This work uses the cache topology information from Linux's \texttt{/sys/devices/system/cpu} directory.
    325325This leaves the challenge of matching \procs to cache structure, or more precisely identifying which subqueues of the ready queue are local to which subcomponents of the cache structure.
    326326Once a matching is generated, the helping algorithm is changed to add bias so that \procs more often help subqueues local to the same cache substructure.\footnote{
     
    330330Instead of having each subqueue local to a specific \proc, the system is initialized with subqueues for each hardware hyperthread/core up front.
    331331Then \procs dequeue and enqueue by first asking which CPU id they are executing on, in order to identify which subqueues are the local ones.
    332 \Glspl{proc} can get the CPU id from @sched_getcpu@ or @librseq@.
     332\Glspl{proc} can get the CPU id from \texttt{sched\_getcpu} or \texttt{librseq}.
    333333
    334334This approach solves the performance problems on systems with topologies with narrow L3 caches, similar to Figure \ref{fig:cache-noshare}.
     
    341341
    342342\subsection{Topological Work Stealing}
    343 \label{s:TopologicalWorkStealing}
    344343Therefore, the approach used in the \CFA scheduler is to have per-\proc subqueues, but have an explicit data-structure track which cache substructure each subqueue is tied to.
    345344This tracking requires some finesse because reading this data structure must lead to fewer cache misses than not having the data structure in the first place.
  • doc/theses/thierry_delisle_PhD/thesis/text/eval_micro.tex

    rdef751f r4e2befe3  
    11\chapter{Micro-Benchmarks}\label{microbench}
    22
    3 The first step in evaluating this work is to test-out small controlled cases to ensure the basics work properly.
    4 This chapter presents five different experimental setups, evaluating some of the basic features of \CFA's scheduler.
     3The first step of evaluation is always to test out small controlled cases, to ensure that the basics are working properly.
     4This chapter presents five different experimental setups, evaluating some of the basic features of \CFA's scheduler.
    55
    66\section{Benchmark Environment}
    7 All benchmarks are run on two distinct hardware platforms.
    8 \begin{description}
    9 \item[AMD] is a server with two AMD EPYC 7662 CPUs and 256GB of DDR4 RAM.
    10 The EPYC CPU has 64 cores with 2 \glspl{hthrd} per core, for 128 \glspl{hthrd} per socket with 2 sockets for a total of 256 \glspl{hthrd}.
    11 Each CPU has 4 MB, 64 MB and 512 MB of L1, L2 and L3 caches, respectively.
     7All of these benchmarks are run on two distinct hardware environments: an AMD machine and an Intel machine.
     8
     9For all benchmarks, \texttt{taskset} is used to limit the experiment to 1 NUMA Node with no hyper threading.
     10If more \glspl{hthrd} are needed, then 1 NUMA Node with hyperthreading is used.
     11If still more \glspl{hthrd} are needed then the experiment is limited to as few NUMA Nodes as needed.
     12
     13
     14\paragraph{AMD} The AMD machine is a server with two AMD EPYC 7662 CPUs and 256GB of DDR4 RAM.
     15The server runs Ubuntu 20.04.2 LTS on top of Linux Kernel 5.8.0-55.
     16These EPYCs have 64 cores per CPU and 2 \glspl{hthrd} per core, for a total of 256 \glspl{hthrd}.
     17The cpus each have 4 MB, 64 MB and 512 MB of L1, L2 and L3 caches respectively.
    1218Each L1 and L2 instance are only shared by \glspl{hthrd} on a given core, but each L3 instance is shared by 4 cores, therefore 8 \glspl{hthrd}.
     19
     20\paragraph{Intel} The Intel machine is a server with four Intel Xeon Platinum 8160 CPUs and 384GB of DDR4 RAM.
    1321The server runs Ubuntu 20.04.2 LTS on top of Linux Kernel 5.8.0-55.
    14 
    15 \item[Intel] is a server with four Intel Xeon Platinum 8160 CPUs and 384GB of DDR4 RAM.
    16 The Xeon CPU has 24 cores with 2 \glspl{hthrd} per core, for 48 \glspl{hthrd} per socket with 4 sockets for a total of 192 \glspl{hthrd}.
    17 Each CPU has 3 MB, 96 MB and 132 MB of L1, L2 and L3 caches respectively.
     22These Xeon Platinums have 24 cores per CPU and 2 \glspl{hthrd} per core, for a total of 192 \glspl{hthrd}.
     23The cpus each have 3 MB, 96 MB and 132 MB of L1, L2 and L3 caches respectively.
    1824Each L1 and L2 instance are only shared by \glspl{hthrd} on a given core, but each L3 instance is shared across the entire CPU, therefore 48 \glspl{hthrd}.
    19 The server runs Ubuntu 20.04.2 LTS on top of Linux Kernel 5.8.0-55.
    20 \end{description}
    21 
    22 For all benchmarks, @taskset@ is used to limit the experiment to 1 NUMA Node with no hyper threading.
    23 If more \glspl{hthrd} are needed, then 1 NUMA Node with hyperthreading is used.
    24 If still more \glspl{hthrd} are needed, then the experiment is limited to as few NUMA Nodes as needed.
    25 
    26 The limited sharing of the last-level cache on the AMD machine is markedly different than the Intel machine.
    27 Indeed, while on both architectures L2 cache misses that are served by L3 caches on a different CPU incur a significant latency, on the AMD it is also the case that cache misses served by a different L3 instance on the same CPU still incur high latency.
     25
     26This limited sharing of the last level cache on the AMD machine is markedly different than the Intel machine. Indeed, while on both architectures L2 cache misses that are served by L3 caches on a different cpu incur a significant latency, on AMD it is also the case that cache misses served by a different L3 instance on the same cpu still incur high latency.
    2827
    2928
     
    3534        \label{fig:cycle}
    3635\end{figure}
    37 The most basic evaluation of any ready queue is to evaluate the latency needed to push and pop one element from the ready queue.
    38 Since these two operation also describe a @yield@ operation, many systems use this operation as the most basic benchmark.
    39 However, yielding can be treated as a special case by optimizing it away (dead code) since the number of ready \glspl{at} does not change.
    40 Not all systems perform this optimization, but those that do have an artificial performance benefit because the yield becomes a \emph{nop}.
    41 For this reason, I chose a different first benchmark, called \newterm{Cycle Benchmark}.
    42 This benchmark arranges a number of \glspl{at} into a ring, as seen in Figure~\ref{fig:cycle}, where the ring is a circular singly-linked list.
     36The most basic evaluation of any ready queue is to evaluate the latency needed to push and pop one element from the ready-queue.
     37Since these two operations also describe a \texttt{yield} operation, many systems use this as the most basic benchmark.
     38However, yielding can be treated as a special case, since it also carries the information that the number of the ready \glspl{at} will not change.
     39Not all systems use this information, but those which do may appear to have better performance than they would for disconnected push/pop pairs.
     40For this reason, I chose a different first benchmark, which I call the Cycle Benchmark.
     41This benchmark arranges many \glspl{at} into multiple rings of \glspl{at}.
     42Each ring is effectively a circular singly-linked list.
    4343At runtime, each \gls{at} unparks the next \gls{at} before parking itself.
    44 Unparking the next \gls{at} pushes that \gls{at} onto the ready queue as does the ensuing park.
    45 
    46 Hence, the underlying runtime cannot rely on the number of ready \glspl{at} staying constant over the duration of the experiment.
     44This corresponds to the desired pair of ready queue operations.
     45Unparking the next \gls{at} requires pushing that \gls{at} onto the ready queue and the ensuing park will cause the runtime to pop a \gls{at} from the ready-queue.
     46Figure~\ref{fig:cycle} shows a visual representation of this arrangement.
     47
     48The goal of this ring is that the underlying runtime cannot rely on the guarantee that the number of ready \glspl{at} will stay constant over the duration of the experiment.
    4749In fact, the total number of \glspl{at} waiting on the ready queue is expected to vary because of the race between the next \gls{at} unparking and the current \gls{at} parking.
    48 That is, the runtime cannot anticipate that the current task will immediately park.
    49 As well, the size of the cycle is also decided based on this race, \eg a small cycle may see the chain of unparks go full circle before the first \gls{at} parks because of time-slicing or multiple \procs.
    50 Every runtime system must handle this race and cannot optimized away the ready-queue pushes and pops.
    51 To prevent any attempt of silently omitting ready-queue operations, the ring of \glspl{at} is made big enough so the \glspl{at} have time to fully park before being unparked again.
    52 (Note, an unpark is like a V on a semaphore, so the subsequent park (P) may not block.)
    53 Finally, to further mitigate any underlying push/pop optimizations, especially on SMP machines, multiple rings are created in the experiment.
    54 
    55 To avoid this benchmark being affected by idle-sleep handling, the number of rings is multiple times greater than the number of \glspl{proc}.
    56 This design avoids the case where one of the \glspl{proc} runs out of work because of the variation on the number of ready \glspl{at} mentioned above.
    57 
    58 Figure~\ref{fig:cycle:code} shows the pseudo code for this benchmark.
    59 There is additional complexity to handle termination (not shown), which requires a binary semaphore or a channel instead of raw @park@/@unpark@ and carefully picking the order of the @P@ and @V@ with respect to the loop condition.
    60 
    61 \begin{figure}
    62 \begin{cfa}
    63 Thread.main() {
    64         count := 0
    65         for {
    66                 @wait()@
    67                 @this.next.wake()@
    68                 count ++
    69                 if must_stop() { break }
    70         }
    71         global.count += count
    72 }
    73 \end{cfa}
    74 \caption[Cycle Benchmark : Pseudo Code]{Cycle Benchmark : Pseudo Code}
    75 \label{fig:cycle:code}
    76 \end{figure}
     50The size of the cycle is also decided based on this race: cycles that are too small may see the chain of unparks go full circle before the first \gls{at} can park.
     51While this would not be a correctness problem, since every runtime system must handle that race, it could lead to pushes and pops being optimized away.
     52Since silently omitting ready-queue operations would throw off the measuring of these operations, the ring of \glspl{at} must be big enough so the \glspl{at} have the time to fully park before they are unparked.
     53Note that this problem is only present on SMP machines and is significantly mitigated by the fact that there are multiple rings in the system.
     54
     55To avoid this benchmark from being dominated by the idle sleep handling, the number of rings is kept at least as high as the number of \glspl{proc} available.
     56Beyond this point, adding more rings serves to mitigate even more the idle sleep handling.
     57This is to avoid the case where one of the \glspl{proc} runs out of work because of the variation on the number of ready \glspl{at} mentioned above.
     58
     59The actual benchmark is more complicated to handle termination, but that simply requires using a binary semaphore or a channel instead of raw \texttt{park}/\texttt{unpark} and carefully picking the order of the \texttt{P} and \texttt{V} with respect to the loop condition.
     60Figure~\ref{fig:cycle:code} shows pseudo code for this benchmark.
     61
     62\begin{figure}
     63        \begin{lstlisting}
     64                Thread.main() {
     65                        count := 0
     66                        for {
     67                                wait()
     68                                this.next.wake()
     69                                count ++
     70                                if must_stop() { break }
     71                        }
     72                        global.count += count
     73                }
     74        \end{lstlisting}
     75        \caption[Cycle Benchmark : Pseudo Code]{Cycle Benchmark : Pseudo Code}
     76        \label{fig:cycle:code}
     77\end{figure}
     78
     79
    7780
    7881\subsection{Results}
    79 Figure~\ref{fig:cycle:jax} shows the throughput as a function of \proc count, where each run uses 100 cycles per \proc and 5 \ats per cycle.
    80 
    8182\begin{figure}
    8283        \subfloat[][Throughput, 100 \ats per \proc]{
     
    105106                \label{fig:cycle:jax:low:ns}
    106107        }
    107         \caption[Cycle Benchmark on Intel]{Cycle Benchmark on Intel\smallskip\newline Throughput as a function of \proc count with 100 cycles per \proc and 5 \ats per cycle.}
     108        \caption[Cycle Benchmark on Intel]{Cycle Benchmark on Intel\smallskip\newline Throughput as a function of \proc count, using 100 cycles per \proc, 5 \ats per cycle.}
    108109        \label{fig:cycle:jax}
    109110\end{figure}
     111Figure~\ref{fig:cycle:jax} shows the throughput as a function of \proc count, with the following constants:
     112Each run uses 100 cycles per \proc, 5 \ats per cycle.
    110113
    111114\todo{results discussion}
    112115
    113116\section{Yield}
    114 For completion, the classic yield benchmark is included.
    115 This benchmark is simpler than the cycle test: it creates many \glspl{at} that call @yield@.
    116 As mentioned, this benchmark may not be representative because of optimization shortcuts in @yield@.
    117 The only interesting variable in this benchmark is the number of \glspl{at} per \glspl{proc}, where ratios close to 1 means the ready queue(s) can be empty.
    118 This scenario can put a strain on the idle-sleep handling compared to scenarios where there is plenty of work.
    119 Figure~\ref{fig:yield:code} shows pseudo code for this benchmark, where the @wait/next.wake@ is replaced by @yield@.
    120 
    121 \begin{figure}
    122 \begin{cfa}
    123 Thread.main() {
    124         count := 0
    125         for {
    126                 @yield()@
    127                 count ++
    128                 if must_stop() { break }
    129         }
    130         global.count += count
    131 }
    132 \end{cfa}
    133 \caption[Yield Benchmark : Pseudo Code]{Yield Benchmark : Pseudo Code}
    134 \label{fig:yield:code}
     117For completion, I also include the yield benchmark.
     118This benchmark is much simpler than the cycle tests, it simply creates many \glspl{at} that call \texttt{yield}.
     119As mentioned in the previous section, this benchmark may be less representative of usages that only make limited use of \texttt{yield}, due to potential shortcuts in the routine.
     120Its only interesting variable is the number of \glspl{at} per \glspl{proc}, where ratios close to 1 means the ready queue(s) could be empty.
     121This sometimes puts more strain on the idle sleep handling, compared to scenarios where there is clearly plenty of work to be done.
     122Figure~\ref{fig:yield:code} shows pseudo code for this benchmark, the ``wait/wake-next'' is simply replaced by a yield.
     123
     124\begin{figure}
     125        \begin{lstlisting}
     126                Thread.main() {
     127                        count := 0
     128                        for {
     129                                yield()
     130                                count ++
     131                                if must_stop() { break }
     132                        }
     133                        global.count += count
     134                }
     135        \end{lstlisting}
     136        \caption[Yield Benchmark : Pseudo Code]{Yield Benchmark : Pseudo Code}
     137        \label{fig:yield:code}
    135138\end{figure}
    136139
    137140\subsection{Results}
    138 
    139 Figure~\ref{fig:yield:jax} shows the throughput as a function of \proc count, where each run uses 100 \ats per \proc.
    140 
    141141\begin{figure}
    142142        \subfloat[][Throughput, 100 \ats per \proc]{
     
    168168        \label{fig:yield:jax}
    169169\end{figure}
     170Figure~\ref{fig:yield:jax} shows the throughput as a function of \proc count, with the following constants:
     171Each run uses 100 \ats per \proc.
    170172
    171173\todo{results discussion}
    172174
     175
    173176\section{Churn}
    174 The Cycle and Yield benchmark represent an \emph{easy} scenario for a scheduler, \eg an embarrassingly parallel application.
    175 In these benchmarks, \glspl{at} can be easily partitioned over the different \glspl{proc} upfront and none of the \glspl{at} communicate with each other.
    176 
    177 The Churn benchmark represents more chaotic execution, where there is no relation between the last \gls{proc} on which a \gls{at} ran and blocked and the \gls{proc} that subsequently unblocks it.
    178 With processor-specific ready-queues, when a \gls{at} is unblocked by a different \gls{proc} that means the unblocking \gls{proc} must either ``steal'' the \gls{at} from another processor or find it on a global queue.
    179 This dequeuing results in either contention on the remote queue and/or \glspl{rmr} on \gls{at} data structure.
    180 In either case, this benchmark aims to highlight how each scheduler handles these cases, since both cases can lead to performance degradation if not handled correctly.
    181 
    182 This benchmark uses a fixed-size array of counting semaphores.
    183 Each \gls{at} picks a random semaphore, @V@s it to unblock any \at waiting, and then @P@s on the semaphore.
     177The Cycle and Yield benchmarks represent an ``easy'' scenario for a scheduler, \eg, an embarrassingly parallel application.
     178In these benchmarks, \glspl{at} can be easily partitioned over the different \glspl{proc} up-front and none of the \glspl{at} communicate with each other.
     179
     180The Churn benchmark represents more chaotic usages, where there is no relation between the last \gls{proc} on which a \gls{at} ran and the \gls{proc} that unblocked it.
     181When a \gls{at} is unblocked from a different \gls{proc} than the one on which it last ran, the unblocking \gls{proc} must either ``steal'' the \gls{at} or place it on a remote queue.
     182This can result in either contention on the remote queue or \glspl{rmr} on the \gls{at} data structure.
     183In either case, this benchmark aims to highlight how each scheduler handles these cases, since both cases can lead to performance degradation if they are not handled correctly.
     184
     185To achieve this the benchmark uses a fixed size array of semaphores.
     186Each \gls{at} picks a random semaphore, \texttt{V}s it to unblock a \at waiting and then \texttt{P}s on the semaphore.
    184187This creates a flow where \glspl{at} push each other out of the semaphores before being pushed out themselves.
    185 For this benchmark to work, the number of \glspl{at} must be equal or greater than the number of semaphores plus the number of \glspl{proc}.
    186 Note, the nature of these semaphores mean the counter can go beyond 1, which can lead to nonblocking calls to @P@.
    187 Figure~\ref{fig:churn:code} shows pseudo code for this benchmark, where the @yield@ is replaced by @V@ and @P@.
    188 
    189 \begin{figure}
    190 \begin{cfa}
    191 Thread.main() {
    192         count := 0
    193         for {
    194                 r := random() % len(spots)
    195                 @spots[r].V()@
    196                 @spots[r].P()@
    197                 count ++
    198                 if must_stop() { break }
    199         }
    200         global.count += count
    201 }
    202 \end{cfa}
    203 \caption[Churn Benchmark : Pseudo Code]{Churn Benchmark : Pseudo Code}
    204 \label{fig:churn:code}
    205 \end{figure}
    206 
    207 \subsection{Results}
    208 Figure~\ref{fig:churn:jax} shows the throughput as a function of \proc count, where each run uses 100 cycles per \proc and 5 \ats per cycle.
     188For this benchmark to work however, the number of \glspl{at} must be greater than or equal to the number of semaphores plus the number of \glspl{proc}.
     189Note that the nature of these semaphores mean the counter can go beyond 1, which could lead to calls to \texttt{P} not blocking.
     190
     191\todo{code, setup, results}
     192\begin{lstlisting}
     193        Thread.main() {
     194                count := 0
     195                for {
     196                        r := random() % len(spots)
     197                        spots[r].V()
     198                        spots[r].P()
     199                        count ++
     200                        if must_stop() { break }
     201                }
     202                global.count += count
     203        }
     204\end{lstlisting}
    209205
    210206\begin{figure}
     
    234230                \label{fig:churn:jax:low:ns}
    235231        }
    236         \caption[Churn Benchmark on Intel]{\centering Churn Benchmark on Intel\smallskip\newline Throughput and latency of the Churn on the benchmark on the Intel machine.
    237         Throughput is the total operation per second across all cores. Latency is the duration of each operation.}
     232        \caption[Churn Benchmark on Intel]{\centering Churn Benchmark on Intel\smallskip\newline Throughput and latency of the Churn benchmark on the Intel machine. Throughput is the total operations per second across all cores. Latency is the duration of each operation.}
    238233        \label{fig:churn:jax}
    239234\end{figure}
    240235
    241 \todo{results discussion}
    242 
    243236\section{Locality}
    244237
     
    246239
    247240\section{Transfer}
    248 The last benchmark is more of an experiment than a benchmark.
    249 It tests the behaviour of the schedulers for a misbehaved workload.
     241The last benchmark is more accurately characterized as an experiment than a benchmark.
     242It tests the behavior of the schedulers for a particularly misbehaved workload.
    250243In this workload, one of the \gls{at} is selected at random to be the leader.
    251244The leader then spins in a tight loop until it has observed that all other \glspl{at} have acknowledged its leadership.
    252245The leader \gls{at} then picks a new \gls{at} to be the ``spinner'' and the cycle repeats.
    253 The benchmark comes in two flavours for the non-leader \glspl{at}:
    254 once they acknowledged the leader, they either block on a semaphore or spin yielding.
    255 
    256 The experiment is designed to evaluate the short-term load-balancing of a scheduler.
    257 Indeed, schedulers where the runnable \glspl{at} are partitioned on the \glspl{proc} may need to balance the \glspl{at} for this experiment to terminate.
    258 This problem occurs because the spinning \gls{at} is effectively preventing the \gls{proc} from running any other \glspl{thrd}.
    259 In the semaphore flavour, the number of runnable \glspl{at} eventually dwindles down to only the leader.
    260 This scenario is a simpler case to handle for schedulers since \glspl{proc} eventually run out of work.
     246
     247The benchmark comes in two flavours for the behavior of the non-leader \glspl{at}:
     248once they acknowledged the leader, they either block on a semaphore or yield repeatedly.
     249
     250This experiment is designed to evaluate the short term load balancing of the scheduler.
     251Indeed, schedulers where the runnable \glspl{at} are partitioned on the \glspl{proc} may need to balance the \glspl{at} for this experiment to terminate.
     252This is because the spinning \gls{at} is effectively preventing the \gls{proc} from running any other \glspl{thrd}.
     253In the semaphore flavour, the number of runnable \glspl{at} will eventually dwindle down to only the leader.
     254This is a simpler case to handle for schedulers since \glspl{proc} eventually run out of work.
    261255In the yielding flavour, the number of runnable \glspl{at} stays constant.
    262 This scenario is a harder case to handle because corrective measures must be taken even when work is available.
    263 Note, runtime systems with preemption circumvent this problem by forcing the spinner to yield.
     256This is a harder case to handle because corrective measures must be taken even if work is still available.
     257Note that languages that have mandatory preemption do circumvent this problem by forcing the spinner to yield.
    264258
    265259\todo{code, setup, results}
    266 
    267 \begin{figure}
    268 \begin{cfa}
    269 Thread.lead() {
    270         this.idx_seen = ++lead_idx
    271         if lead_idx > stop_idx {
    272                 done := true
    273                 return
    274         }
    275 
    276         // Wait for everyone to acknowledge my leadership
    277         start: = timeNow()
    278         for t in threads {
    279                 while t.idx_seen != lead_idx {
    280                         asm pause
    281                         if (timeNow() - start) > 5 seconds { error() }
    282                 }
    283         }
    284 
    285         // pick next leader
    286         leader := threads[ prng() % len(threads) ]
    287 
    288         // wake every one
    289         if ! exhaust {
     260\begin{lstlisting}
     261        Thread.lead() {
     262                this.idx_seen = ++lead_idx
     263                if lead_idx > stop_idx {
     264                        done := true
     265                        return
     266                }
     267
     268                // Wait for everyone to acknowledge my leadership
     269                start := timeNow()
    290270                for t in threads {
    291                         if t != me { t.wake() }
    292                 }
    293         }
    294 }
    295 
    296 Thread.wait() {
    297         this.idx_seen := lead_idx
    298         if exhaust { wait() }
    299         else { yield() }
    300 }
    301 
    302 Thread.main() {
    303         while !done  {
    304                 if leader == me { this.lead() }
    305                 else { this.wait() }
    306         }
    307 }
    308 \end{cfa}
    309 \caption[Transfer Benchmark : Pseudo Code]{Transfer Benchmark : Pseudo Code}
    310 \label{fig:transfer:code}
    311 \end{figure}
    312 
    313 \subsection{Results}
    314 Figure~\ref{fig:transfer:jax} shows the throughput as a function of \proc count, where each run uses 100 cycles per \proc and 5 \ats per cycle.
    315 
    316 \todo{results discussion}
     271                        while t.idx_seen != lead_idx {
     272                                asm pause
     273                                if (timeNow() - start) > 5 seconds { error() }
     274                        }
     275                }
     276
     277                // pick next leader
     278                leader := threads[ prng() % len(threads) ]
     279
     280                // wake every one
     281                if !exhaust {
     282                        for t in threads {
     283                                if t != me { t.wake() }
     284                        }
     285                }
     286        }
     287
     288        Thread.wait() {
     289                this.idx_seen := lead_idx
     290                if exhaust { wait() }
     291                else { yield() }
     292        }
     293
     294        Thread.main() {
     295                while !done  {
     296                        if leader == me { this.lead() }
     297                        else { this.wait() }
     298                }
     299        }
     300\end{lstlisting}
  • doc/theses/thierry_delisle_PhD/thesis/text/existing.tex

    rdef751f r4e2befe3  
    1414
    1515\section{Naming Convention}
    16 Scheduling has been studied by various communities concentrating on different incarnation of the same problems.
    17 As a result, there are no standard naming conventions for scheduling that is respected across these communities.
    18 This document uses the term \newterm{\Gls{at}} to refer to the abstract objects being scheduled and the term \newterm{\Gls{proc}} to refer to the concrete objects executing these \ats.
     16Scheduling has been studied by various communities concentrating on different incarnation of the same problems. As a result, there are no standard naming conventions for scheduling that is respected across these communities. This document uses the term \newterm{\Gls{at}} to refer to the abstract objects being scheduled and the term \newterm{\Gls{proc}} to refer to the concrete objects executing these \ats.
    1917
    2018\section{Static Scheduling}
     
    2826\section{Dynamic Scheduling}
    2927\newterm{Dynamic schedulers} determine \ats dependencies and costs during scheduling, if at all.
    30 Hence, unlike static scheduling, \ats dependencies are conditional and detected at runtime.
    31 This detection takes the form of observing new \ats(s) in the system and determining dependencies from their behaviour, including suspending or halting a \ats that dynamically detects unfulfilled dependencies.
     28Hence, unlike static scheduling, \ats dependencies are conditional and detected at runtime. This detection takes the form of observing new \ats(s) in the system and determining dependencies from their behaviour, including suspending or halting a \ats that dynamically detects unfulfilled dependencies.
    3229Furthermore, each \ats has the responsibility of adding dependent \ats back into the system once dependencies are fulfilled.
    3330As a consequence, the scheduler often has an incomplete view of the system, seeing only \ats with no pending dependencies.
     
    181178\begin{displayquote}
    182179        \begin{enumerate}
    183                 \item The task returned by \textit{t}@.execute()@
     180                \item The task returned by \textit{t}\texttt{.execute()}
    184181                \item The successor of t if \textit{t} was its last completed predecessor.
    185182                \item A task popped from the end of the thread's own deque.
     
    196193\paragraph{Quasar/Project Loom}
    197194Java has two projects, Quasar~\cite{MAN:quasar} and Project Loom~\cite{MAN:project-loom}\footnote{It is unclear if these are distinct projects.}, that are attempting to introduce lightweight thread\-ing in the form of Fibers.
    198 Both projects seem to be based on the @ForkJoinPool@ in Java, which appears to be a simple incarnation of randomized work-stealing~\cite{MAN:java/fork-join}.
     195Both projects seem to be based on the \texttt{ForkJoinPool} in Java, which appears to be a simple incarnation of randomized work-stealing~\cite{MAN:java/fork-join}.
    199196
    200197\paragraph{Grand Central Dispatch}
     
    207204% http://web.archive.org/web/20090920043909/http://images.apple.com/macosx/technology/docs/GrandCentral_TB_brief_20090903.pdf
    208205
    209 In terms of semantics, the Dispatch Queues seem to be very similar to Intel\textregistered ~TBB @execute()@ and predecessor semantics.
     206In terms of semantics, the Dispatch Queues seem to be very similar to Intel\textregistered ~TBB \texttt{execute()} and predecessor semantics.
    210207
    211208\paragraph{LibFibre}
  • doc/theses/thierry_delisle_PhD/thesis/text/intro.tex

    rdef751f r4e2befe3  
    103103An algorithm for load-balancing and idle sleep of processors, including NUMA awareness.
    104104\item
    105 Support for user-level \glsxtrshort{io} capabilities based on Linux's @io_uring@.
     105Support for user-level \glsxtrshort{io} capabilities based on Linux's \texttt{io\_uring}.
    106106\end{enumerate}
  • doc/theses/thierry_delisle_PhD/thesis/text/io.tex

    rdef751f r4e2befe3  
    11\chapter{User Level \io}
    2 As mentioned in Section~\ref{prev:io}, user-level \io requires multiplexing the \io operations of many \glspl{thrd} onto fewer \glspl{proc} using asynchronous \io operations.
     2As mentioned in Section~\ref{prev:io}, User-Level \io requires multiplexing the \io operations of many \glspl{thrd} onto fewer \glspl{proc} using asynchronous \io operations.
    33Different operating systems offer various forms of asynchronous operations and, as mentioned in Chapter~\ref{intro}, this work is exclusively focused on the Linux operating-system.
    44
    55\section{Kernel Interface}
    6 Since this work fundamentally depends on operating-system support, the first step of this design is to discuss the available interfaces and pick one (or more) as the foundation for the non-blocking \io subsystem in this work.
     6Since this work fundamentally depends on operating-system support, the first step of any design is to discuss the available interfaces and pick one (or more) as the foundations of the non-blocking \io subsystem.
    77
    88\subsection{\lstinline{O_NONBLOCK}}
     
    1010In this mode, ``Neither the @open()@ nor any subsequent \io operations on the [opened file descriptor] will cause the calling process to wait''~\cite{MAN:open}.
    1111This feature can be used as the foundation for the non-blocking \io subsystem.
    12 However, for the subsystem to know when an \io operation completes, @O_NONBLOCK@ must be used in conjunction with a system call that monitors when a file descriptor becomes ready, \ie, the next \io operation on it does not cause the process to wait.\footnote{
    13 In this context, ready means \emph{some} operation can be performed without blocking.
     12However, for the subsystem to know when an \io operation completes, @O_NONBLOCK@ must be used in conjunction with a system call that monitors when a file descriptor becomes ready, \ie, the next \io operation on it does not cause the process to wait
     13\footnote{In this context, ready means \emph{some} operation can be performed without blocking.
    1414It does not mean an operation returning \lstinline{EAGAIN} succeeds on the next try.
    15 For example, a ready read may only return a subset of requested bytes and the read must be issues again for the remaining bytes, at which point it may return \lstinline{EAGAIN}.}
     15For example, a ready read may only return a subset of bytes and the read must be issued again for the remaining bytes, at which point it may return \lstinline{EAGAIN}.}
    1616This mechanism is also crucial in determining when all \glspl{thrd} are blocked and the application \glspl{kthrd} can now block.
    1717
    18 There are three options to monitor file descriptors in Linux:\footnote{
    19 For simplicity, this section omits \lstinline{pselect} and \lstinline{ppoll}.
     18There are three options to monitor file descriptors in Linux
     19\footnote{For simplicity, this section omits \lstinline{pselect} and \lstinline{ppoll}.
    2020The difference between these system calls and \lstinline{select} and \lstinline{poll}, respectively, is not relevant for this discussion.},
    2121@select@~\cite{MAN:select}, @poll@~\cite{MAN:poll} and @epoll@~\cite{MAN:epoll}.
    2222All three of these options offer a system call that blocks a \gls{kthrd} until at least one of many file descriptors becomes ready.
    23 The group of file descriptors being waited on is called the \newterm{interest set}.
    24 
    25 \paragraph{\lstinline{select}} is the oldest of these options, and takes as input a contiguous array of bits, where each bit represents a file descriptor of interest.
    26 Hence, the array length must be as long as the largest FD currently of interest.
    27 On return, it outputs the set in place to identify which of the file descriptors changed state.
    28 This destructive change means selecting in a loop requires re-initializing the array for each iteration.
    29 Another limit of @select@ is that calls from different \glspl{kthrd} sharing FDs are independent.
    30 Hence, if one \gls{kthrd} is managing the select calls, other threads can only add/remove to/from the manager's interest set through synchronized calls to update the interest set.
    31 However, these changes are only reflected when the manager makes its next call to @select@.
    32 Note, it is possible for the manager thread to never unblock if its current interest set never changes, \eg the sockets/pipes/ttys it is waiting on never get data again.
    33 Often the I/O manager has a timeout, polls, or is sent a signal on changes to mitigate this problem.
    34 
    35 \begin{comment}
    36 From: Tim Brecht <brecht@uwaterloo.ca>
    37 Subject: Re: FD sets
    38 Date: Wed, 6 Jul 2022 00:29:41 +0000
    39 
    40 Large number of open files
    41 --------------------------
    42 
    43 In order to be able to use more than the default number of open file
    44 descriptors you may need to:
    45 
    46 o increase the limit on the total number of open files /proc/sys/fs/file-max
    47   (on Linux systems)
    48 
    49 o increase the size of FD_SETSIZE
    50   - the way I often do this is to figure out which include file __FD_SETSIZE
    51     is defined in, copy that file into an appropriate directory in ./include,
    52     and then modify it so that if you use -DBIGGER_FD_SETSIZE the larger size
    53     gets used
    54 
    55   For example on a RH 9.0 distribution I've copied
    56   /usr/include/bits/typesizes.h into ./include/i386-linux/bits/typesizes.h
    57 
    58   Then I modify typesizes.h to look something like:
    59 
    60   #ifdef BIGGER_FD_SETSIZE
    61   #define __FD_SETSIZE            32767
    62   #else
    63   #define __FD_SETSIZE            1024
    64   #endif
    65 
    66   Note that the since I'm moving and testing the userver on may different
    67   machines the Makefiles are set up to use -I ./include/$(HOSTTYPE)
    68 
    69   This way if you redefine the FD_SETSIZE it will get used instead of the
    70   default original file.
    71 \end{comment}
    72 
    73 \paragraph{\lstinline{poll}} is the next oldest option, and takes as input an array of structures containing the FD numbers rather than their position in an array of bits, allowing a more compact input for interest sets that contain widely spaced FDs.
    74 For small interest sets with densely packed FDs, the @select@ bit mask can take less storage, and hence, copy less information into the kernel.
    75 Furthermore, @poll@ is non-destructive, so the array of structures does not have to be re-initialized on every call.
    76 Like @select@, @poll@ suffers from the limitation that the interest set cannot be changed by other \gls{kthrd}, while a manager thread is blocked in @poll@.
    77 
    78 \paragraph{\lstinline{epoll}} follows after @poll@, and places the interest set in the kernel rather than the application, where it is managed by an internal \gls{kthrd}.
    79 There are two separate functions: one to add to the interest set and another to check for FDs with state changes.
     23The group of file descriptors being waited on is called the \newterm{interest set}.
     24
     25\paragraph{\lstinline{select}} is the oldest of these options; it takes as input a contiguous array of bits, where each bit represents a file descriptor of interest.
     26On return, it modifies the set in place to identify which of the file descriptors changed status.
     27This destructive change means that calling select in a loop requires re-initializing the array each time and the number of file descriptors supported has a hard limit.
     28Another limit of @select@ is that once the call is started, the interest set can no longer be modified.
     29Monitoring a new file descriptor generally requires aborting any in progress call to @select@
     30\footnote{Starting a new call to \lstinline{select} is possible but requires a distinct kernel thread, and as a result is not an acceptable multiplexing solution when the interest set is large and highly dynamic unless the number of parallel calls to \lstinline{select} can be strictly bounded.}.
     31
     32\paragraph{\lstinline{poll}} is an improvement over select, which removes the hard limit on the number of file descriptors and the need to re-initialize the input on every call.
     33It works using an array of structures as an input rather than an array of bits, thus allowing a more compact input for small interest sets.
     34Like @select@, @poll@ suffers from the limitation that the interest set cannot be changed while the call is blocked.
     35
     36\paragraph{\lstinline{epoll}} further improves these two functions by allowing the interest set to be dynamically added to and removed from while a \gls{kthrd} is blocked on an @epoll@ call.
    8037This dynamic capability is accomplished by creating an \emph{epoll instance} with a persistent interest set, which is used across multiple calls.
    81 As the interest set is augmented, the changes become implicitly part of the interest set for a blocked manager \gls{kthrd}.
    82 This capability significantly reduces synchronization between \glspl{kthrd} and the manager calling @epoll@.
    83 
    84 However, all three of these I/O systems have limitations.
     38This capability significantly reduces synchronization overhead on the part of the caller (in this case the \io subsystem), since the interest set can be modified when adding or removing file descriptors without having to synchronize with other \glspl{kthrd} potentially calling @epoll@.
     39
     40However, all three of these system calls have limitations.
    8541The @man@ page for @O_NONBLOCK@ mentions that ``[@O_NONBLOCK@] has no effect for regular files and block devices'', which means none of these three system calls are viable multiplexing strategies for these types of \io operations.
    8642Furthermore, @epoll@ has been shown to have problems with pipes and ttys~\cit{Peter's examples in some fashion}.
     
    9753It also supports batching multiple operations in a single system call.
    9854
    99 AIO offers two different approaches to polling: @aio_error@ can be used as a spinning form of polling, returning @EINPROGRESS@ until the operation is completed, and @aio_suspend@ can be used similarly to @select@, @poll@ or @epoll@, to wait until one or more requests have completed.
     55AIO offers two different approaches to polling: @aio_error@ can be used as a spinning form of polling, returning @EINPROGRESS@ until the operation is completed, and @aio_suspend@ can be used similarly to @select@, @poll@ or @epoll@, to wait until one or more requests have completed.
    10056For the purpose of \io multiplexing, @aio_suspend@ is the best interface.
    10157However, even if AIO requests can be submitted concurrently, @aio_suspend@ suffers from the same limitation as @select@ and @poll@, \ie, the interest set cannot be dynamically changed while a call to @aio_suspend@ is in progress.
     
    11470
    11571        \begin{flushright}
    116                 -- Linus Torvalds~\cite{AIORant}
     72                -- Linus Torvalds\cit{https://lwn.net/Articles/671657/}
    11773        \end{flushright}
    11874\end{displayquote}
     
    12985A very recent addition to Linux, @io_uring@~\cite{MAN:io_uring}, is a framework that aims to solve many of the problems listed in the above interfaces.
    13086Like AIO, it represents \io operations as entries added to a queue.
    131 But like @epoll@, new requests can be submitted, while a blocking call waiting for requests to complete, is already in progress.
     87But like @epoll@, new requests can be submitted while a blocking call waiting for requests to complete is already in progress.
    13288The @io_uring@ interface uses two ring buffers (referred to simply as rings) at its core: a submit ring to which programmers push \io requests and a completion ring from which programmers poll for completion.
    13389
     
    14197In the worst case, where all \glspl{thrd} are consistently blocking on \io, it devolves into 1-to-1 threading.
    14298However, regardless of the frequency of \io operations, it achieves the fundamental goal of not blocking \glspl{proc} when \glspl{thrd} are ready to run.
    143 This approach is used by languages like Go\cit{Go}, frameworks like libuv\cit{libuv}, and web servers like Apache~\cite{apache} and Nginx~\cite{nginx}, since it has the advantage that it can easily be used across multiple operating systems.
     99This approach is used by languages like Go\cit{Go} and frameworks like libuv\cit{libuv}, since it has the advantage that it can easily be used across multiple operating systems.
    144100This advantage is especially relevant for languages like Go, which offer a homogeneous \glsxtrshort{api} across all platforms.
    145101As opposed to C, which has a very limited standard api for \io, \eg, the C standard library has no networking.
     
    155111\section{Event-Engine}
    156112An event engine's responsibility is to use the kernel interface to multiplex many \io operations onto few \glspl{kthrd}.
    157 In concrete terms, this means \glspl{thrd} enter the engine through an interface, the event engine then starts an operation and parks the calling \glspl{thrd}, returning control to the \gls{proc}.
     113In concrete terms, this means \glspl{thrd} enter the engine through an interface, the event engine then starts the operation and parks the calling \glspl{thrd}, returning control to the \gls{proc}.
    158114The parked \glspl{thrd} are then rescheduled by the event engine once the desired operation has completed.
    159115
     
    178134\begin{enumerate}
    179135\item
    180 An SQE is allocated from the pre-allocated array \emph{S}.
     136An SQE is allocated from the pre-allocated array (denoted \emph{S} in Figure~\ref{fig:iouring}).
    181137This array is created at the same time as the @io_uring@ instance, is in kernel-locked memory visible by both the kernel and the application, and has a fixed size determined at creation.
    182 How these entries are allocated is not important for the functioning of @io_uring@;
    183 the only requirement is that no entry is reused before the kernel has consumed it.
     138How these entries are allocated is not important for the functioning of @io_uring@, the only requirement is that no entry is reused before the kernel has consumed it.
    184139\item
    185140The SQE is filled according to the desired operation.
    186 This step is straight forward.
    187 The only detail worth mentioning is that SQEs have a @user_data@ field that must be filled in order to match submission and completion entries.
     141This step is straight forward, the only detail worth mentioning is that SQEs have a @user_data@ field that must be filled in order to match submission and completion entries.
    188142\item
    189143The SQE is submitted to the submission ring by appending the index of the SQE to the ring following regular ring buffer steps: \lstinline{buffer[head] = item; head++}.
    190144Since the head is visible to the kernel, some memory barriers may be required to prevent the compiler from reordering these operations.
    191145Since the submission ring is a regular ring buffer, more than one SQE can be added at once and the head is updated only after all entries are updated.
    192 Note, SQE can be filled and submitted in any order, \eg in Figure~\ref{fig:iouring} the submission order is S0, S3, S2 and S1 has not been submitted.
    193146\item
    194147The kernel is notified of the change to the ring using the system call @io_uring_enter@.
     
    208161The @io_uring_enter@ system call is protected by a lock inside the kernel.
    209162This protection means that concurrent calls to @io_uring_enter@ using the same instance are possible, but there is no performance gained from parallel calls to @io_uring_enter@.
    210 It is possible to do the first three submission steps in parallel;
    211 however, doing so requires careful synchronization.
     163It is possible to do the first three submission steps in parallel, however, doing so requires careful synchronization.
    212164
    213165@io_uring@ also introduces constraints on the number of simultaneous operations that can be ``in flight''.
    214 First, SQEs are allocated from a fixed-size array, meaning that there is a hard limit to how many SQEs can be submitted at once.
    215 Second, the @io_uring_enter@ system call can fail because ``The  kernel [...] ran out of resources to handle [a request]'' or ``The application is attempting to overcommit the number of requests it can have pending.''.
     166Obviously, SQEs are allocated from a fixed-size array, meaning that there is a hard limit to how many SQEs can be submitted at once.
     167In addition, the @io_uring_enter@ system call can fail because ``The  kernel [...] ran out of resources to handle [a request]'' or ``The application is attempting to overcommit the number of requests it can  have pending.''.
    216168This restriction means \io request bursts may have to be subdivided and submitted in chunks at a later time.
    217169
    218170\subsection{Multiplexing \io: Submission}
    219 
    220171The submission side is the most complicated aspect of @io_uring@ and the completion side effectively follows from the design decisions made in the submission side.
    221 While there is freedom in designing the submission side, there are some realities of @io_uring@ that must be taken into account.
    222 It is possible to do the first steps of submission in parallel;
    223 however, the duration of the system call scales with the number of entries submitted.
     172While it is possible to do the first steps of submission in parallel, the duration of the system call scales with the number of entries submitted.
    224173The consequence is that the amount of parallelism used to prepare submissions for the next system call is limited.
    225174Beyond this limit, the length of the system call is the throughput limiting factor.
    226 I concluded from early experiments that preparing submissions seems to take almost as long as the system call itself, which means that with a single @io_uring@ instance, there is no benefit in terms of \io throughput to having more than two \glspl{hthrd}.
    227 Therefore, the design of the submission engine must manage multiple instances of @io_uring@ running in parallel, effectively sharding @io_uring@ instances.
    228 Since completions are sent to the instance where requests were submitted, all instances with pending operations must be polled continuously\footnote{
    229 As described in Chapter~\ref{practice}, this does not translate into constant CPU usage.}.
     175I concluded from early experiments that preparing submissions seems to take at most as long as the system call itself, which means that with a single @io_uring@ instance, there is no benefit in terms of \io throughput to having more than two \glspl{hthrd}.
     176Therefore the design of the submission engine must manage multiple instances of @io_uring@ running in parallel, effectively sharding @io_uring@ instances.
     177Similarly to scheduling, this sharding can be done privately, \ie, one instance per \glspl{proc}, in decoupled pools, \ie, a pool of \glspl{proc} use a pool of @io_uring@ instances without one-to-one coupling between any given instance and any given \gls{proc}, or some mix of the two.
     178Since completions are sent to the instance where requests were submitted, all instances with pending operations must be polled continuously
     179\footnote{As will be described in Chapter~\ref{practice}, this does not translate into constant cpu usage.}.
    230180Note that once an operation completes, there is nothing that ties it to the @io_uring@ instance that handled it.
    231 There is nothing preventing a new operation, \eg with the same file descriptors, from being submitted to a different @io_uring@ instance.
     181There is nothing preventing a new operation, for example with the same file descriptors, from being submitted to a different @io_uring@ instance.
    232182
    233183A complicating aspect of submission is @io_uring@'s support for chains of operations, where the completion of an operation triggers the submission of the next operation on the link.
    234184SQEs forming a chain must be allocated from the same instance and must be contiguous in the Submission Ring (see Figure~\ref{fig:iouring}).
    235 The consequence of this feature is that filling SQEs can be arbitrarily complex, and therefore, users may need to run arbitrary code between allocation and submission.
    236 Supporting chains is not a requirement of the \io subsystem, but it is still valuable.
    237 Support for this feature can be fulfilled simply by supporting arbitrary user code between allocation and submission.
    238 
    239 Similar to scheduling, sharding @io_uring@ instances can be done privately, \ie, one instance per \glspl{proc}, in decoupled pools, \ie, a pool of \glspl{proc} use a pool of @io_uring@ instances without one-to-one coupling between any given instance and any given \gls{proc}, or some mix of the two.
    240 These three sharding approaches are analyzed.
     185The consequence of this feature is that filling SQEs can be arbitrarily complex and therefore users may need to run arbitrary code between allocation and submission.
     186Supporting chains is not a requirement of the \io subsystem, but it is still valuable.
     187Support for this feature can be fulfilled simply by supporting arbitrary user code between allocation and submission.
     188
     189\subsubsection{Public Instances}
     190One approach is to have multiple shared instances.
     191\Glspl{thrd} attempting \io operations pick one of the available instances and submit operations to that instance.
     192Since there is no coupling between \glspl{proc} and @io_uring@ instances in this approach, \glspl{thrd} running on more than one \gls{proc} can attempt to submit to the same instance concurrently.
     193Since @io_uring@ effectively sets the amount of sharding needed to avoid contention on its internal locks, performance in this approach is based on two aspects: the synchronization needed to submit does not induce more contention than @io_uring@ already does and the scheme to route \io requests to specific @io_uring@ instances does not introduce contention.
     194This second aspect has an oversized importance because it comes into play before the sharding of instances, and as such, all \glspl{hthrd} can contend on the routing algorithm.
     195
     196Allocation in this scheme can be handled fairly easily.
     197Free SQEs, \ie, SQEs that aren't currently being used to represent a request, can be written to safely and have a field called @user_data@ which the kernel only reads to copy to @cqe@s.
     198Allocation also requires no ordering guarantee as all free SQEs are interchangeable.
     199This requires a simple concurrent bag.
     200The only added complexity is that the number of SQEs is fixed, which means allocation can fail.
     201
     202Allocation failures need to be pushed up to a routing algorithm: \glspl{thrd} attempting \io operations must not be directed to @io_uring@ instances without sufficient SQEs available.
     203Furthermore, the routing algorithm should block operations up-front if none of the instances have available SQEs.
     204
     205Once an SQE is allocated, \glspl{thrd} can fill them normally, they simply need to keep track of the SQE index and which instance it belongs to.
     206
     207Once an SQE is filled in, it must be added to the submission ring buffer, an operation that is not thread-safe by itself, and the kernel must be notified using the @io_uring_enter@ system call.
     208The submission ring buffer is the same size as the pre-allocated SQE buffer, therefore pushing to the ring buffer cannot fail
     209\footnote{This is because it is invalid to have the same \lstinline{sqe} multiple times in the ring buffer.}.
     210However, as mentioned, the system call itself can fail with the expectation that it will be retried once some of the already submitted operations complete.
     211Since multiple SQEs can be submitted to the kernel at once, it is important to strike a balance between batching and latency.
     212Operations that are ready to be submitted should be batched together in few system calls, but at the same time, operations should not be left pending for long periods of time before being submitted.
     213This can be handled by either designating one of the submitting \glspl{thrd} as being responsible for the system call for the current batch of SQEs or by having some other party regularly submitting all ready SQEs, \eg, the poller \gls{thrd} mentioned later in this section.
     214
     215In the case of designating a \gls{thrd}, ideally, when multiple \glspl{thrd} attempt to submit operations to the same @io_uring@ instance, all requests would be batched together and one of the \glspl{thrd} would do the system call on behalf of the others, referred to as the \newterm{submitter}.
     216In practice however, it is important that the \io requests are not left pending indefinitely and as such, it may be required to have a ``next submitter'' that guarantees everything that is missed by the current submitter is seen by the next one.
     217Indeed, as long as there is a ``next'' submitter, \glspl{thrd} submitting new \io requests can move on, knowing that some future system call will include their request.
     218Once the system call is done, the submitter must also free SQEs so that the allocator can reuse them.
     219
     220Finally, the completion side is much simpler since the @io_uring@ system call enforces a natural synchronization point.
     221Polling simply needs to regularly do the system call, go through the produced CQEs and communicate the result back to the originating \glspl{thrd}.
     222Since CQEs only own a signed 32 bit result, in addition to the copy of the @user_data@ field, all that is needed to communicate the result is a simple future~\cite{wiki:future}.
     223If the submission side does not designate submitters, polling can also submit all SQEs as it is polling events.
     224A simple approach to polling is to allocate a \gls{thrd} per @io_uring@ instance and simply let the poller \glspl{thrd} poll their respective instances when scheduled.
     225
     226With this pool of instances approach, the big advantage is that it is fairly flexible.
     227It does not impose restrictions on what \glspl{thrd} submitting \io operations can and cannot do between allocations and submissions.
     228It also can gracefully handle running out of resources, SQEs or the kernel returning @EBUSY@.
     229The down side to this is that many of the steps used for submitting need complex synchronization to work properly.
     230The routing and allocation algorithm needs to keep track of which ring instances have available SQEs, block incoming requests if no instance is available, prevent barging if \glspl{thrd} are already queued up waiting for SQEs and handle SQEs being freed.
     231The submission side needs to safely append SQEs to the ring buffer, correctly handle chains, make sure no SQE is dropped or left pending forever, notify the allocation side when SQEs can be reused and handle the kernel returning @EBUSY@.
     232All this synchronization may have a significant cost and, compared to the next approach presented, this synchronization is entirely overhead.
    241233
    242234\subsubsection{Private Instances}
    243 The private approach creates one ring instance per \gls{proc}, \ie one-to-one coupling.
    244 This alleviates the need for synchronization on the submissions, requiring only that \glspl{thrd} are not time-sliced during submission steps.
    245 This requirement is the same as accessing @thread_local@ variables, where a \gls{thrd} is accessing kernel-thread data, is time-sliced, and continues execution on another kernel thread but is now accessing the wrong data.
    246 This failure is the serially reusable problem~\cite{SeriallyReusable}.
    247 Hence, allocated SQEs must be submitted to the same ring on the same \gls{proc}, which effectively forces the application to submit SQEs in allocation order.\footnote{
    248 To remove this requirement, a \gls{thrd} needs the ability to ``yield to a specific \gls{proc}'', \ie, park with the guarantee it unparks on a specific \gls{proc}, \ie the \gls{proc} attached to the correct ring.}
    249 From the subsystem's point of view, the allocation and submission are sequential, greatly simplifying both.
    250 In this design, allocation and submission form a partitioned ring buffer as shown in Figure~\ref{fig:pring}.
    251 Once added to the ring buffer, the attached \gls{proc} has a significant amount of flexibility with regards to when to perform the system call.
    252 Possible options are: when the \gls{proc} runs out of \glspl{thrd} to run, after running a given number of \glspl{thrd}, \etc.
     235Another approach is to simply create one ring instance per \gls{proc}.
     236This alleviates the need for synchronization on the submissions, requiring only that \glspl{thrd} are not interrupted in between two submission steps.
     237This is effectively the same requirement as using @thread_local@ variables.
     238Since SQEs that are allocated must be submitted to the same ring, on the same \gls{proc}, this effectively forces the application to submit SQEs in allocation order
     239\footnote{The actual requirement is that \glspl{thrd} cannot context switch between allocation and submission.
     240This requirement means that from the subsystem's point of view, the allocation and submission are sequential.
     241To remove this requirement, a \gls{thrd} would need the ability to ``yield to a specific \gls{proc}'', \ie, park with the promise that it will be run next on a specific \gls{proc}, the \gls{proc} attached to the correct ring.}
     242, greatly simplifying both allocation and submission.
     243In this design, allocation and submission form a partitioned ring buffer as shown in Figure~\ref{fig:pring}.
     244Once added to the ring buffer, the attached \gls{proc} has a significant amount of flexibility with regards to when to do the system call.
     245Possible options are: when the \gls{proc} runs out of \glspl{thrd} to run, after running a given number of \glspl{thrd}, etc.
    253246
    254247\begin{figure}
     
    261254\end{figure}
    262255
    263 This approach has the advantage that it does not require much of the synchronization needed in a shared approach.
    264 However, this benefit means \glspl{thrd} submitting \io operations have less flexibility: they cannot park or yield, and several exceptional cases are handled poorly.
    265 Instances running out of SQEs cannot run \glspl{thrd} wanting to do \io operations.
    266 In this case, the \io \gls{thrd} needs to be moved to a different \gls{proc}, and the only current way of achieving this is to @yield()@ hoping to be scheduled on a different \gls{proc} with free SQEs, which is not guaranteed.
    267 
    268 A more involved version of this approach tries to solve these problems using a pattern called \newterm{helping}.
    269 \Glspl{thrd} that cannot submit \io operations, either because of an allocation failure or migration to a different \gls{proc} between allocation and submission, create an \io object and add it to a list of pending submissions per \gls{proc} and a list of pending allocations, probably per cluster.
    270 While there is still the strong coupling between \glspl{proc} and @io_uring@ instances, these data structures allow moving \glspl{thrd} to a specific \gls{proc}, when the current \gls{proc} cannot fulfill the \io request.
    271 
    272 Imagine a simple scenario with two \glspl{thrd} on two \glspl{proc}, where one \gls{thrd} submits an \io operation and then sets a flag, while the other \gls{thrd} spins until the flag is set.
    273 Assume both \glspl{thrd} are running on the same \gls{proc}, and the \io \gls{thrd} is preempted between allocation and submission, moved to the second \gls{proc}, and the original \gls{proc} starts running the spinning \gls{thrd}.
    274 In this case, the helping solution has the \io \gls{thrd} append an \io object to the submission list of the first \gls{proc}, where the allocation was made.
     256This approach has the advantage that it does not require much of the synchronization needed in the shared approach.
     257This comes at the cost that \glspl{thrd} submitting \io operations have less flexibility, they cannot park or yield, and several exceptional cases are handled poorly.
     258Instances running out of SQEs cannot run \glspl{thrd} wanting to do \io operations, in such a case the \gls{thrd} needs to be moved to a different \gls{proc}, the only current way of achieving this would be to @yield()@ hoping to be scheduled on a different \gls{proc}, which is not guaranteed.
     259
     260A more involved version of this approach can seem to solve most of these problems, using a pattern called \newterm{helping}.
     261\Glspl{thrd} that wish to submit \io operations but cannot do so
     262\footnote{either because of an allocation failure or because they were migrated to a different \gls{proc} between allocation and submission}
     263create an object representing what they wish to achieve and add it to a list somewhere.
     264For this particular problem, one solution would be to have a list of pending submissions per \gls{proc} and a list of pending allocations, probably per cluster.
     265The problem with these ``solutions'' is that they are still bound by the strong coupling between \glspl{proc} and @io_uring@ instances.
     266These data structures would allow moving \glspl{thrd} to a specific \gls{proc} when the current \gls{proc} cannot fulfill the \io request.
     267
     268Imagine a simple case with two \glspl{thrd} on two \glspl{proc}, one \gls{thrd} submits an \io operation and then sets a flag, the other \gls{thrd} spins until the flag is set.
     269If the first \gls{thrd} is preempted between allocation and submission and moves to the other \gls{proc}, the original \gls{proc} could start running the spinning \gls{thrd}.
     270If this happens, the helping ``solution'' is for the \io \gls{thrd} to append an item to the submission list of the \gls{proc} where the allocation was made.
    275271No other \gls{proc} can help the \gls{thrd} since @io_uring@ instances are strongly coupled to \glspl{proc}.
    276 However, the \io \gls{proc} is unable to help because it is executing the spinning \gls{thrd} resulting in a deadlock.
    277 While this example is artificial, in the presence of many \glspl{thrd}, it is possible for this problem to arise ``in the wild''.
    278 Furthermore, this pattern is difficult to reliably detect and avoid.
    279 Once in this situation, the only escape is to interrupt the spinning \gls{thrd}, either directly or via some regular preemption, \eg time slicing.
    280 Having to interrupt \glspl{thrd} for this purpose is costly, the latency can be large between interrupts, and the situation may be hard to detect.
     272However, in this case, the \gls{proc} is unable to help because it is executing the spinning \gls{thrd} mentioned when first expressing this case
     273\footnote{This particular example is completely artificial, but in the presence of many more \glspl{thrd}, it is not impossible that this problem would arise ``in the wild''.
     274Furthermore, this pattern is difficult to reliably detect and avoid.}
     275resulting in a deadlock.
     276Once in this situation, the only escape is to interrupt the execution of the \gls{thrd}, either directly or due to regular preemption, only then can the \gls{proc} take the time to handle the pending request to help.
     277Interrupting \glspl{thrd} for this purpose is far from desirable, the cost is significant and the situation may be hard to detect.
     278However, a more subtle reason why interrupting the \gls{thrd} is not a satisfying solution is that the \gls{proc} is not actually using the instance it is tied to.
     279If it were to use it, then helping could be done as part of the usage.
    281280Interrupts are needed here entirely because the \gls{proc} is tied to an instance it is not using.
    282 Therefore, a more satisfying solution is for the \gls{thrd} submitting the operation to notice that the instance is unused and simply go ahead and use it.
    283 This approach is presented shortly.
    284 
    285 \subsubsection{Public Instances}
    286 The public approach creates decoupled pools of @io_uring@ instances and processors, \ie without one-to-one coupling.
    287 \Glspl{thrd} attempting an \io operation pick one of the available instances and submit the operation to that instance.
    288 Since there is no coupling between @io_uring@ instances and \glspl{proc} in this approach, \glspl{thrd} running on more than one \gls{proc} can attempt to submit to the same instance concurrently.
    289 Because @io_uring@ effectively sets the amount of sharding needed to avoid contention on its internal locks, performance in this approach is based on two aspects:
    290 \begin{itemize}
    291 \item
    292 The synchronization needed to submit does not induce more contention than @io_uring@ already does.
    293 \item
    294 The scheme to route \io requests to specific @io_uring@ instances does not introduce contention.
    295 This aspect has an oversized importance because it comes into play before the sharding of instances, and as such, all \glspl{hthrd} can contend on the routing algorithm.
    296 \end{itemize}
    297 
    298 Allocation in this scheme is fairly easy.
    299 Free SQEs, \ie, SQEs that are not currently being used to represent a request, can be written to safely and have a field called @user_data@ that the kernel only reads to copy to @cqe@s.
    300 Allocation also requires no ordering guarantee as all free SQEs are interchangeable.
    301 The only added complexity is that the number of SQEs is fixed, which means allocation can fail.
    302 
    303 Allocation failures need to be pushed to a routing algorithm: \glspl{thrd} attempting \io operations must not be directed to @io_uring@ instances without sufficient SQEs available.
    304 Furthermore, the routing algorithm should block operations up-front, if none of the instances have available SQEs.
    305 
    306 Once an SQE is allocated, \glspl{thrd} insert the \io request information, and keep track of the SQE index and the instance it belongs to.
    307 
    308 Once an SQE is filled in, it is added to the submission ring buffer, an operation that is not thread-safe, and then the kernel must be notified using the @io_uring_enter@ system call.
    309 The submission ring buffer is the same size as the pre-allocated SQE buffer, therefore pushing to the ring buffer cannot fail because it would mean a \lstinline{sqe} appears multiple times in the ring buffer, which is undefined behaviour.
    310 However, as mentioned, the system call itself can fail with the expectation that it can be retried once some submitted operations complete.
    311 
    312 Since multiple SQEs can be submitted to the kernel at once, it is important to strike a balance between batching and latency.
    313 Operations that are ready to be submitted should be batched together in few system calls, but at the same time, operations should not be left pending for long periods of time before being submitted.
    314 Balancing submission can be handled by either designating one of the submitting \glspl{thrd} as being responsible for the system call for the current batch of SQEs or by having some other party regularly submitting all ready SQEs, \eg, the poller \gls{thrd} mentioned later in this section.
    315 
    316 Ideally, when multiple \glspl{thrd} attempt to submit operations to the same @io_uring@ instance, all requests should be batched together and one of the \glspl{thrd} is designated to do the system call on behalf of the others, called the \newterm{submitter}.
    317 However, in practice, \io requests must be handled promptly so there is a need to guarantee everything missed by the current submitter is seen by the next one.
    318 Indeed, as long as there is a ``next'' submitter, \glspl{thrd} submitting new \io requests can move on, knowing that some future system call includes their request.
    319 Once the system call is done, the submitter must also free SQEs so that the allocator can reuse them.
    320 
    321 Finally, the completion side is much simpler since the @io_uring@ system-call enforces a natural synchronization point.
    322 Polling simply needs to regularly do the system call, go through the produced CQEs and communicate the result back to the originating \glspl{thrd}.
    323 Since CQEs only own a signed 32 bit result, in addition to the copy of the @user_data@ field, all that is needed to communicate the result is a simple future~\cite{wiki:future}.
    324 If the submission side does not designate submitters, polling can also submit all SQEs as it is polling events.
    325 A simple approach to polling is to allocate a \gls{thrd} per @io_uring@ instance and simply let the poller \glspl{thrd} poll their respective instances when scheduled.
    326 
    327 With the pool of SQE instances approach, the big advantage is that it is fairly flexible.
    328 It does not impose restrictions on what \glspl{thrd} submitting \io operations can and cannot do between allocations and submissions.
    329 It also can gracefully handle running out of resources, SQEs or the kernel returning @EBUSY@.
    330 The down side to this approach is that many of the steps used for submitting need complex synchronization to work properly.
    331 The routing and allocation algorithm needs to keep track of which ring instances have available SQEs, block incoming requests if no instance is available, prevent barging if \glspl{thrd} are already queued up waiting for SQEs and handle SQEs being freed.
    332 The submission side needs to safely append SQEs to the ring buffer, correctly handle chains, make sure no SQE is dropped or left pending forever, notify the allocation side when SQEs can be reused, and handle the kernel returning @EBUSY@.
    333 All this synchronization has a significant cost, and compared to the private-instance approach, this synchronization is entirely overhead.
     281Therefore, a more satisfying solution would be for the \gls{thrd} submitting the operation to notice that the instance is unused and simply go ahead and use it.
     282This is the approach presented next.
    334283
    335284\subsubsection{Instance borrowing}
    336 Both of the prior approaches have undesirable aspects that stem from tight or loose coupling between @io_uring@ and \glspl{proc}.
    337 The first approach suffers from tight coupling causing problems when a \gls{proc} does not benefit from the coupling.
    338 The second approach suffers from loose coupling causing operations to have synchronization overhead, which tighter coupling avoids.
    339 When \glspl{proc} are continuously issuing \io operations, tight coupling is valuable since it avoids synchronization costs.
    340 However, in unlikely failure cases or when \glspl{proc} are not using their instances, tight coupling is no longer advantageous.
    341 A compromise between these approaches is to allow tight coupling but have the option to revoke the coupling dynamically when failure cases arise.
    342 I call this approach \newterm{instance borrowing}.\footnote{
    343 While instance borrowing looks similar to work sharing and stealing, I think it is different enough to warrant a different verb to avoid confusion.}
    344 
    345 In this approach, each cluster, see Figure~\ref{fig:system}, owns a pool of @io_uring@ instances managed by an \newterm{arbiter}.
     285Both of the approaches presented above have undesirable aspects that stem from too loose or too tight coupling between @io_uring@ and \glspl{proc}.
     286In the first approach, loose coupling meant that all operations have synchronization overhead that a tighter coupling can avoid.
     287The second approach, on the other hand, suffers from tight coupling causing problems when a \gls{proc} does not benefit from the coupling.
     288While \glspl{proc} are continuously issuing \io operations, tight coupling is valuable since it avoids synchronization costs.
     289However, in unlikely failure cases or when \glspl{proc} are not making use of their instance, tight coupling is no longer advantageous.
     290A compromise between these approaches would be to allow tight coupling but have the option to revoke this coupling dynamically when failure cases arise.
     291I call this approach ``instance borrowing''\footnote{While it looks similar to work-sharing and work-stealing, I think it is different enough from either to warrant a different verb to avoid confusion.}.
     292
     293In this approach, each cluster owns a pool of @io_uring@ instances managed by an arbiter.
    346294When a \gls{thrd} attempts to issue an \io operation, it asks for an instance from the arbiter and issues requests to that instance.
    347 This instance is now bound to the \gls{proc} the \gls{thrd} is running on.
    348 This binding is kept until the arbiter decides to revoke it, taking back the instance and reverting the \gls{proc} to its initial state with respect to \io.
    349 This tight coupling means that synchronization can be minimal since only one \gls{proc} can use the instance at a time, akin to the private instances approach.
    350 However, it differs in that revocation by the arbiter means this approach does not suffer from the deadlock scenario described above.
     295However, in doing so it ties the instance to the \gls{proc} it is currently running on.
     296This coupling is kept until the arbiter decides to revoke it, taking back the instance and reverting the \gls{proc} to its initial state with respect to \io.
     297This tight coupling means that synchronization can be minimal since only one \gls{proc} can use the instance at any given time, akin to the private instances approach.
     298However, where it differs is that revocation from the arbiter means this approach does not suffer from the deadlock scenario described above.
    351299
    352300Arbitration is needed in the following cases:
    353301\begin{enumerate}
    354         \item The current \gls{proc} does not hold an instance.
     302        \item The current \gls{proc} does not currently hold an instance.
    355303        \item The current instance does not have sufficient SQEs to satisfy the request.
    356         \item The current \gls{proc} has a wrong instance, this happens if the submitting \gls{thrd} context-switched between allocation and submission, called \newterm{external submissions}.
     304        \item The current \gls{proc} has the wrong instance, this happens if the submitting \gls{thrd} context-switched between allocation and submission.
     305        I will refer to these as \newterm{External Submissions}.
    357306\end{enumerate}
    358 However, even when the arbiter is not directly needed, \glspl{proc} need to make sure that their instance ownership is not being revoked, which is accomplished by a lock-\emph{less} handshake.\footnote{
    359 Note the handshake is not lock \emph{free} since it lacks the proper progress guarantee.}
     307However, even when the arbiter is not directly needed, \glspl{proc} need to make sure that their ownership of the instance is not being revoked.
     308This can be accomplished by a lock-less handshake\footnote{Note that the handshake is not Lock-\emph{Free} since it lacks the proper progress guarantee.}.
    360309A \gls{proc} raises a local flag before using its borrowed instance and checks if the instance is marked as revoked or if the arbiter has raised its flag.
    361 If not, it proceeds, otherwise it delegates the operation to the arbiter.
     310If not it proceeds, otherwise it delegates the operation to the arbiter.
    362311Once the operation is completed, the \gls{proc} lowers its local flag.
    363312
    364 Correspondingly, before revoking an instance, the arbiter marks the instance and then waits for the \gls{proc} using it to lower its local flag.
     313Correspondingly, before revoking an instance the arbiter marks the instance and then waits for the \gls{proc} using it to lower its local flag.
    366315Only then does it reclaim the instance and potentially assign it to another \gls{proc}.
    366315
     
    374323
    375324\paragraph{External Submissions} are handled by the arbiter by revoking the appropriate instance and adding the submission to the submission ring.
    376 However, there is no need to immediately revoke the instance.
     325There is no need to immediately revoke the instance however.
    377326External submissions must simply be added to the ring before the next system call, \ie, when the submission ring is flushed.
    378 This means whoever is responsible for the system call, first checks if the instance has any external submissions.
    379 If so, it asks the arbiter to revoke the instance and add the external submissions to the ring.
    380 
    381 \paragraph{Pending Allocations} are handled by the arbiter when it has available instances and can directly hand over the instance and satisfy the request.
    382 Otherwise, it must hold onto the list of threads until SQEs are made available again.
    383 This handling is more complex when an allocation requires multiple SQEs, since the arbiter must make a decision between satisfying requests in FIFO ordering or for fewer SQEs.
    384 
    385 While an arbiter has the potential to solve many of the problems mentioned above, it also introduces a significant amount of complexity.
     327This means that whoever is responsible for the system call first checks if the instance has any external submissions.
     328If it is the case, it asks the arbiter to revoke the instance and add the external submissions to the ring.
     329
     330\paragraph{Pending Allocations} can be more complicated to handle.
     331If the arbiter has available instances, the arbiter can attempt to directly hand over the instance and satisfy the request.
     332Otherwise it must hold onto the list of threads until SQEs are made available again.
     333This handling becomes that much more complex if pending allocations require more than one SQE, since the arbiter must make a decision between satisfying requests in FIFO ordering or satisfying requests for fewer SQEs first.
     334
     335While this arbiter has the potential to solve many of the problems mentioned above, it also introduces a significant amount of complexity.
    386336Tracking which processors are borrowing which instances and which instances have SQEs available ends-up adding a significant synchronization prelude to any I/O operation.
    387337Any submission must start with a handshake that pins the currently borrowed instance, if available.
    388338An attempt to allocate is then made, but the arbiter can concurrently be attempting to allocate from the same instance from a different \gls{hthrd}.
    389 Once the allocation is completed, the submission must check that the instance is still borrowed before attempting to flush.
    390 These synchronization steps turn out to have a similar cost to the multiple shared-instances approach.
     339Once the allocation is completed, the submission must check that the instance is still borrowed before attempting to flush.
     340These extra synchronization steps end-up having a similar cost to the multiple shared instances approach.
    391341Furthermore, if the number of instances does not match the number of processors actively submitting I/O, the system can fall into a state where instances are constantly being revoked and end-up cycling the processors, which leads to significant cache deterioration.
    392 For these reasons, this approach, which sounds promising on paper, does not improve on the private instance approach in practice.
     342Because of these reasons, this approach, which sounds promising on paper, does not improve on the private instance approach in practice.
    393343
    394344\subsubsection{Private Instances V2}
    395345
     346
     347
    396348% Verbs of this design
    397349
    398350% Allocation: obtaining an sqe from which to fill in the io request, enforces the io instance to use since it must be the one which provided the sqe. Must interact with the arbiter if the instance does not have enough sqe for the allocation. (Typical allocation will ask for only one sqe, but chained sqe must be allocated from the same context so chains of sqe must be allocated in bulks)
    399351
    400 % Submission: simply adds the sqe(s) to some data structure to communicate that they are ready to go. This operation can't fail because there are as many spots in the submit buffer than there are sqes. Must interact with the arbiter only if the thread was moved between the allocation and the submission.
     352% Submission: simply adds the sqe(s) to some data structure to communicate that they are ready to go. This operation can't fail because there are as many spots in the submit buffer as there are sqes. Must interact with the arbiter only if the thread was moved between the allocation and the submission.
    401353
    402354% Flushing: Taking all the sqes that were submitted and making them visible to the kernel, also counting them in order to figure out what to_submit should be. Must be thread-safe with submission. Has to interact with the Arbiter if there are external submissions. Can't simply use a protected queue because adding to the array is not safe if the ring is still available for submitters. Flushing must therefore: check if there are external pending requests if so, ask the arbiter to flush otherwise use the fast flush operation.
     
    405357
    406358% Handle: process all the produced cqe. No need to interact with any of the submission operations or the arbiter.
     359
     360
    407361
    408362
     
    450404
    451405\section{Interface}
    452 The last important part of the \io subsystem is its interface.
    453 There are multiple approaches that can be offered to programmers, each with advantages and disadvantages.
    454 The new \io subsystem can replace the C runtime API or extend it, and in the latter case, the interface can go from very similar to vastly different.
    455 The following sections discuss some useful options using @read@ as an example.
    456 The standard Linux interface for C is :
    457 \begin{cfa}
    458 ssize_t read(int fd, void *buf, size_t count);
    459 \end{cfa}
     406Finally, the last important part of the \io subsystem is its interface. There are multiple approaches that can be offered to programmers, each with advantages and disadvantages. The new \io subsystem can replace the C runtime's API or extend it. And in the latter case the interface can go from very similar to vastly different. The following sections discuss some useful options using @read@ as an example. The standard Linux interface for C is :
     407
     408@ssize_t read(int fd, void *buf, size_t count);@
    460409
    461410\subsection{Replacement}
    462411Replacing the C \glsxtrshort{api} is the more intrusive and draconian approach.
    463412The goal is to convince the compiler and linker to replace any calls to @read@ to direct them to the \CFA implementation instead of glibc's.
    464 This rerouting has the advantage of working transparently and supporting existing binaries without needing recompilation.
     413This has the advantage of potentially working transparently and supporting existing binaries without needing recompilation.
    465414It also offers a, presumably, well known and familiar API that C programmers can simply continue to work with.
    466 However, this approach also entails a plethora of subtle technical challenges, which generally boils down to making a perfect replacement.
     415However, this approach also entails a plethora of subtle technical challenges which generally boils down to making a perfect replacement.
    467416If the \CFA interface replaces only \emph{some} of the calls to glibc, then this can easily lead to esoteric concurrency bugs.
    468 Since the gcc ecosystem does not offer a scheme for perfect replacement, this approach was rejected as being laudable but infeasible.
     417Since the gcc ecosystem does not offer a scheme for such perfect replacement, this approach was rejected as being laudable but infeasible.
    469418
    470419\subsection{Synchronous Extension}
    471 Another interface option is to offer an interface different in name only.
    472 For example:
    473 \begin{cfa}
    474 ssize_t cfa_read(int fd, void *buf, size_t count);
    475 \end{cfa}
    476 This approach is feasible and still familiar to C programmers.
    477 It comes with the caveat that any code attempting to use it must be recompiled, which is a problem considering the amount of existing legacy C binaries.
     420Another interface option is to simply offer an interface that is different in name only. For example:
     421
     422@ssize_t cfa_read(int fd, void *buf, size_t count);@
     423
     424\noindent This is much more feasible but still familiar to C programmers.
     425It comes with the caveat that any code attempting to use it must be recompiled, which can be a big problem considering the amount of existing legacy C binaries.
    478426However, it has the advantage of implementation simplicity.
    479 Finally, there is a certain irony to using a blocking synchronous interface for a feature often referred to as ``non-blocking'' \io.
    480427
    481428\subsection{Asynchronous Extension}
    482 A fairly traditional way of providing asynchronous interactions is using a future mechanism~\cite{multilisp}, \eg:
    483 \begin{cfa}
    484 future(ssize_t) read(int fd, void *buf, size_t count);
    485 \end{cfa}
    486 where the generic @future@ is fulfilled when the read completes and it contains the number of bytes read, which may be less than the number of bytes requested.
    487 The data read is placed in @buf@.
    488 The problem is that both the bytes read and data form the synchronization object, not just the bytes read.
    489 Hence, the buffer cannot be reused until the operation completes but the synchronization does not cover the buffer.
    490 A classical asynchronous API is:
    491 \begin{cfa}
    492 future([ssize_t, void *]) read(int fd, size_t count);
    493 \end{cfa}
    494 where the future tuple covers the components that require synchronization.
    495 However, this interface immediately introduces memory lifetime challenges since the call must effectively allocate a buffer to be returned.
    496 Because of the performance implications of this API, the first approach is considered preferable as it is more familiar to C programmers.
    497 
    498 \subsection{Direct \lstinline{io_uring} Interface}
    499 The last interface directly exposes the underlying @io_uring@ interface, \eg:
    500 \begin{cfa}
    501 array(SQE, want) cfa_io_allocate(int want);
    502 void cfa_io_submit( const array(SQE, have) & );
    503 \end{cfa}
    504 where the generic @array@ contains an array of SQEs with a size that may be less than the request.
    505 This offers more flexibility to users wanting to fully utilize all of the @io_uring@ features.
     429It is important to mention that there is a certain irony to using only synchronous, therefore blocking, interfaces for a feature often referred to as ``non-blocking'' \io.
     430A fairly traditional way of doing this is using futures~\cite{wiki:future}.
     431A simple way of doing so is as follows:
     432
     433@future(ssize_t) read(int fd, void *buf, size_t count);@
     434
     435\noindent Note that this approach is not necessarily the most idiomatic usage of futures.
     436The definition of read above ``returns'' the read content through an output parameter which cannot be synchronized on.
     437A more classical asynchronous API could look more like:
     438
     439@future([ssize_t, void *]) read(int fd, size_t count);@
     440
     441\noindent However, this interface immediately introduces memory lifetime challenges since the call must effectively allocate a buffer to be returned.
     442Because of the performance implications of this, the first approach is considered preferable as it is more familiar to C programmers.
     443
     444\subsection{Interface directly to \lstinline{io_uring}}
     445Finally, another interface that can be relevant is to simply expose directly the underlying \texttt{io\_uring} interface. For example:
     446
     447@array(SQE, want) cfa_io_allocate(int want);@
     448
     449@void cfa_io_submit( const array(SQE, have) & );@
     450
     451\noindent This offers more flexibility to users wanting to fully use all of the \texttt{io\_uring} features.
    506452However, it is not the most user-friendly option.
    507 It obviously imposes a strong dependency between user code and @io_uring@ but at the same time restricting users to usages that are compatible with how \CFA internally uses @io_uring@.
     453It obviously imposes a strong dependency between user code and \texttt{io\_uring} but at the same time restricting users to usages that are compatible with how \CFA internally uses \texttt{io\_uring}.
     454
     455
  • doc/theses/thierry_delisle_PhD/thesis/text/practice.tex

    rdef751f r4e2befe3  
    11\chapter{Scheduling in practice}\label{practice}
    2 The scheduling algorithm described in Chapter~\ref{core} addresses scheduling in a stable state.
    3 This chapter addresses problems that occur when the system state changes.
     2The scheduling algorithm described in Chapter~\ref{core} addresses scheduling in a stable state.
     3However, it does not address problems that occur when the system changes state.
    44Indeed the \CFA runtime, supports expanding and shrinking the number of \procs, both manually and, to some extent, automatically.
    5 These changes affect the scheduling algorithm, which must dynamically alter its behaviour.
    6 
    7 In detail, \CFA supports adding \procs using the type @processor@, in both RAII and heap coding scenarios.
    8 \begin{cfa}
    9 {
    10         processor p[4]; // 4 new kernel threads
    11         ... // execute on 4 processors
    12         processor * dp = new( processor, 6 ); // 6 new kernel threads
    13         ... // execute on 10 processors
    14         delete( dp );   // delete 6 kernel threads
    15         ... // execute on 4 processors
    16 } // delete 4 kernel threads
    17 \end{cfa}
    18 Dynamically allocated processors can be deleted at any time, \ie their lifetime exceeds the block of creation.
    19 The consequence is that the scheduler and \io subsystems must know when these \procs come in and out of existence and roll them into the appropriate scheduling algorithms.
     5This entails that the scheduling algorithm must support these transitions.
     6
     7More precisely, \CFA supports adding \procs using the RAII object @processor@.
     8These objects can be created at any time and can be destroyed at any time.
     9They are normally created as automatic stack variables, but this is not a requirement.
     10
     11The consequence is that the scheduler and \io subsystems must support \procs coming in and out of existence.
    2012
    2113\section{Manual Resizing}
    2214Manual resizing is expected to be a rare operation.
    23 Programmers normally create/delete processors on a cluster at startup/teardown.
    24 Therefore, dynamically changing the number of \procs is an appropriate moment to allocate or free resources to match the new state.
    25 As such, all internal scheduling arrays that are sized based on the number of \procs need to be @realloc@ed.
    26 This requirement also means any references into these arrays, \eg pointers or indexes, may need to be updated if elements are moved for compaction or for any other reason.
     15Programmers are mostly expected to resize clusters on startup or teardown.
     16Therefore dynamically changing the number of \procs is an appropriate moment to allocate or free resources to match the new state.
     17As such all internal arrays that are sized based on the number of \procs need to be \texttt{realloc}ed.
     18This also means that any references into these arrays, pointers or indexes, may need to be fixed when shrinking\footnote{Indexes may still need fixing when shrinking because some indexes are expected to refer to dense contiguous resources and there is no guarantee the resource being removed has the highest index.}.
    2719
    2820There are no performance requirements, within reason, for resizing since it is expected to be rare.
    29 However, this operation has strict correctness requirements since updating and idle sleep can easily lead to deadlocks.
     21However, this operation has strict correctness requirements since shrinking and idle sleep can easily lead to deadlocks.
    3022It should also avoid as much as possible any effect on performance when the number of \procs remain constant.
    3123This latter requirement prohibits naive solutions, like simply adding a global lock to the ready-queue arrays.
    3224
    3325\subsection{Read-Copy-Update}
    34 One solution is to use the Read-Copy-Update pattern~\cite{wiki:rcu}.
    35 In this pattern, resizing is done by creating a copy of the internal data structures, \eg see Figure~\ref{fig:base-ts2}, updating the copy with the desired changes, and then attempt an Indiana Jones Switch to replace the original with the copy.
    36 This approach has the advantage that it may not need any synchronization to do the switch.
    37 However, there is a race where \procs still use the original data structure after the copy is switched.
    38 This race not only requires adding a memory-reclamation scheme, it also requires that operations made on the stale original version are eventually moved to the copy.
    39 
    40 Specifically, the original data structure must be kept until all \procs have witnessed the change.
    41 This requirement is the \newterm{memory reclamation challenge} and means every operation needs \emph{some} form of synchronization.
    42 If all operations need synchronization, then the overall cost of this technique is likely to be similar to an uncontended lock approach.
    43 In addition to the classic challenge of memory reclamation, transferring the original data to the copy before reclaiming it poses additional challenges.
     26One solution is to use the Read-Copy-Update\cite{wiki:rcu} pattern.
     27In this pattern, resizing is done by creating a copy of the internal data structures, updating the copy with the desired changes, and then attempting an Indiana Jones Switch to replace the original with the copy.
     28This approach potentially has the advantage that it may not need any synchronization to do the switch.
     29However, there is a race where \procs could still use the previous, original, data structure after the copy was switched in.
     30This race not only requires some added memory reclamation scheme, it also requires that operations made on the stale original version be eventually moved to the copy.
     31
     32For linked-lists, enqueuing is only somewhat problematic: \ats enqueued to the original queues need to be transferred to the new ones, which might not preserve ordering.
     33Dequeuing is more challenging.
     34Dequeuing from the original will not necessarily update the copy, which could lead to multiple \procs dequeuing the same \at.
     35Fixing this requires more synchronization or more indirection on the queues.
     36
     37Another challenge is that the original must be kept until all \procs have witnessed the change.
     38This is a straightforward memory reclamation challenge but it does mean that every operation will need \emph{some} form of synchronization.
     39If every operation needs synchronization, then it is possible that a simpler solution achieves the same performance.
     40Because in addition to the classic challenge of memory reclamation, transferring the original data to the copy before reclaiming it poses additional challenges.
    4441Especially merging subqueues while having a minimal impact on fairness and locality.
    4542
    46 For example, given a linked-list, having a node enqueued onto the original and new list is not necessarily a problem depending on the chosen list structure.
    47 If the list supports arbitrary insertions, then inconsistencies in the tail pointer do not break the list;
    48 however, ordering may not be preserved.
    49 Furthermore, nodes enqueued to the original queues eventually need to be uniquely transferred to the new queues, which may further perturb ordering.
    50 Dequeuing is more challenging when nodes appear on both lists because of pending reclamation: dequeuing a node from one list does not remove it from the other nor is that node in the same place on the other list.
    51 This situation can lead to multiple \procs dequeuing the same \at.
    52 Fixing these challenges requires more synchronization or more indirection to the queues, plus coordinated searching to ensure unique elements.
    53 
    54 \subsection{Readers-Writer Lock}
    55 A simpler approach is to use a \newterm{Readers-Writer Lock}~\cite{wiki:rwlock}, where the resizing requires acquiring the lock as a writer while simply enqueueing/dequeuing \ats requires acquiring the lock as a reader.
     43\subsection{Readers-Writer Lock}
     44A simpler approach would be to use a \newterm{Readers-Writer Lock}\cite{wiki:rwlock} where the resizing requires acquiring the lock as a writer while simply enqueuing/dequeuing \ats requires acquiring the lock as a reader.
    5645Using a Readers-Writer lock solves the problem of dynamically resizing and leaves the challenge of finding or building a lock with sufficiently good read-side performance.
    57 Since this approach is not a very complex challenge and an ad-hoc solution is perfectly acceptable, building a Readers-Writer lock was the path taken.
    58 
    59 To maximize reader scalability, readers should not contend with each other when attempting to acquire and release a critical section.
    60 To achieve this goal requires each reader to have its own memory to mark as locked and unlocked.
    61 The read acquire possibly waits for a writer to finish the critical section and then acquires a reader's local spinlock.
    62 The write acquire acquires the global lock, guaranteeing mutual exclusion among writers, and then acquires each of the local reader locks.
    63 Acquiring all the local read locks guarantees mutual exclusion among the readers and the writer, while the wait on the read side prevents readers from continuously starving the writer.
    64 
    65 Figure~\ref{f:SpecializedReadersWriterLock} shows the outline for this specialized readers-writer lock.
    66 The lock is nonblocking, so both readers and writers spin while the lock is held.
    67 \todo{finish explanation}
    68 
    69 \begin{figure}
    70 \begin{cfa}
     46Since this is not a very complex challenge and an ad-hoc solution is perfectly acceptable, building a Readers-Writer lock was the path taken.
     47
     48To maximize reader scalability, the readers should not contend with each other when attempting to acquire and release the critical sections.
     49This effectively requires that each reader have its own piece of memory to mark as locked and unlocked.
     50Readers acquiring the lock wait for writers to finish the critical section and then acquire their local spinlocks.
     51Writers acquire the global lock, so writers have mutual exclusion among themselves, and then acquire each of the local reader locks.
     52Acquiring all the local locks guarantees mutual exclusion between the readers and the writer, while the wait on the read side prevents readers from continuously starving the writer.
     53\todo{reference listings}
     54
     55\begin{lstlisting}
    7156void read_lock() {
    7257        // Step 1 : make sure no writers in
    7358        while write_lock { Pause(); }
     59
     60        // May need fence here
     61
    7462        // Step 2 : acquire our local lock
    75         while atomic_xchg( tls.lock ) { Pause(); }
    76 }
     63        while atomic_xchg( tls.lock ) {
     64                Pause();
     65        }
     66}
     67
    7768void read_unlock() {
    7869        tls.lock = false;
    7970}
     71\end{lstlisting}
     72
     73\begin{lstlisting}
    8074void write_lock()  {
    8175        // Step 1 : lock global lock
    82         while atomic_xchg( write_lock ) { Pause(); }
     76        while atomic_xchg( write_lock ) {
     77                Pause();
     78        }
     79
    8380        // Step 2 : lock per-proc locks
    8481        for t in all_tls {
    85                 while atomic_xchg( t.lock ) { Pause(); }
    86         }
    87 }
     82                while atomic_xchg( t.lock ) {
     83                        Pause();
     84                }
     85        }
     86}
     87
    8888void write_unlock() {
    8989        // Step 1 : release local locks
    90         for t in all_tls { t.lock = false; }
     90        for t in all_tls {
     91                t.lock = false;
     92        }
     93
    9194        // Step 2 : release global lock
    9295        write_lock = false;
    9396}
    94 \end{cfa}
    95 \caption{Specialized Readers-Writer Lock}
    96 \label{f:SpecializedReadersWriterLock}
    97 \end{figure}
    98 
    99 \section{Idle-Sleep}\label{idlesleep}
    100 While manual resizing of \procs is expected to be rare, the number of \ats can vary significantly over an application's lifetime, which means there are times when there are too few or too many \procs.
    101 For this work, it is the programmer's responsibility to manually create \procs, so if there are too few \procs, the application must address this issue.
    102 This leaves too many \procs when there are not enough \ats for all the \procs to be useful.
    103 These idle \procs cannot be removed because their lifetime is controlled by the application, and only the application knows when the number of \ats may increase or decrease.
    104 While idle \procs can spin until work appears, this approach wastes energy, unnecessarily produces heat and prevents other applications from using the processor.
    105 Therefore, idle \procs are put into an idle state, called \newterm{Idle-Sleep}, where the \gls{kthrd} is blocked until the scheduler deems it is needed.
     97\end{lstlisting}
     98
     99\section{Idle-Sleep}
     100In addition to users manually changing the number of \procs, it is desirable to support ``removing'' \procs when there are not enough \ats for all the \procs to be useful.
     101While manual resizing is expected to be rare, the number of \ats is expected to vary much more which means \procs may need to be ``removed'' for only short periods of time.
     102Furthermore, race conditions that spuriously lead to the impression that no \ats are ready are actually common in practice.
     103Therefore resources associated with \procs should not be freed but \procs simply put into an idle state where the \gls{kthrd} is blocked until more \ats become ready.
     104This state is referred to as \newterm{Idle-Sleep}.
    106105
    107106Idle sleep effectively encompasses several challenges.
    108 First, a data structure needs to keep track of all \procs that are in idle sleep.
    109 Because idle sleep is spurious, this data structure has strict performance requirements, in addition to strict correctness requirements.
    110 Next, some mechanism is needed to block \glspl{kthrd}, \eg @pthread_cond_wait@ on a pthread semaphore.
    111 The complexity here is to support \at parking and unparking, user-level locking, timers, \io operations, and all other \CFA features with minimal complexity.
    112 Finally, the scheduler needs a heuristic to determine when to block and unblock an appropriate number of \procs.
    113 However, this third challenge is outside the scope of this thesis because developing a general heuristic is complex enough to justify its own work.
    114 Therefore, the \CFA scheduler simply follows the ``Race-to-Idle''~\cite{Albers12} approach where a sleeping \proc is woken any time a \at becomes ready and \procs go to idle sleep anytime they run out of work.
    115 An interesting sub-part of this heuristic is what to do with bursts of \ats that become ready.
    116 Since waking up a sleeping \proc can have notable latency, it is possible multiple \ats become ready while a single \proc is waking up.
    117 This fact raises the question: if many \procs are available, how many should be woken?
    118 If the ready \ats will run longer than the wake-up latency, waking one \proc per \at will offer maximum parallelisation.
    119 If the ready \ats will run for a very short time, waking many \procs may be wasteful.
    120 As mentioned, a heuristic to handle these complex cases is outside the scope of this thesis, so the behaviour of the scheduler in this particular case is left unspecified.
     107First some data structure needs to keep track of all \procs that are in idle sleep.
     108Because idle sleep can be spurious, this data structure has strict performance requirements in addition to the strict correctness requirements.
     109Next, some tool must be used to block \glspl{kthrd}, \eg \texttt{pthread\_cond\_wait} or pthread semaphores.
     110The complexity here is to support \at parking and unparking, timers, \io operations and all other \CFA features with minimal complexity.
     111Finally, idle sleep also includes a heuristic to determine the appropriate number of \procs to be in idle sleep at any given time.
     112This third challenge is however outside the scope of this thesis because developing a general heuristic is involved enough to justify its own work.
     113The \CFA scheduler simply follows the ``Race-to-Idle''\cit{https://doi.org/10.1137/1.9781611973099.100} approach where a sleeping \proc is woken any time an \at becomes ready and \procs go to idle sleep anytime they run out of work.
    121114
    122115\section{Sleeping}
    123116As usual, the corner-stone of any feature related to the kernel is the choice of system call.
    124 In terms of blocking a \gls{kthrd} until some event occurs, the Linux kernel has many available options.
    125 
    126 \subsection{\lstinline{pthread_mutex}/\lstinline{pthread_cond}}
    127 The classic option is to use some combination of the pthread mutual exclusion and synchronization locks, allowing a safe park/unpark of a \gls{kthrd} to/from a @pthread_cond@.
    128 While this approach works for \glspl{kthrd} waiting among themselves, \io operations do not provide a mechanism to signal @pthread_cond@s.
    129 For \io results to wake a \proc waiting on a @pthread_cond@ means a different \gls{kthrd} must be woken up first, which then signals the \proc.
    130 
    131 \subsection{\lstinline{io_uring} and Epoll}
    132 An alternative is to flip the problem on its head and block waiting for \io, using @io_uring@ or @epoll@.
    133 This creates the inverse situation, where \io operations directly wake sleeping \procs but waking blocked \procs must use an indirect scheme.
    134 This generally takes the form of creating a file descriptor, \eg, dummy file, pipe, or event fd, and using that file descriptor when \procs need to wake each other.
    135 This leads to additional complexity because there can be a race between these artificial \io and genuine \io operations.
    136 If not handled correctly, this can lead to artificial files getting delayed too long behind genuine files, resulting in longer latency.
     117In terms of blocking a \gls{kthrd} until some event occurs the linux kernel has many available options:
     118
     119\paragraph{\texttt{pthread\_mutex}/\texttt{pthread\_cond}}
     120The most classic option is to use some combination of \texttt{pthread\_mutex} and \texttt{pthread\_cond}.
     121These serve as straightforward mutual exclusion and synchronization tools and allow a \gls{kthrd} to wait on a \texttt{pthread\_cond} until signalled.
     122While this approach is generally perfectly appropriate for \glspl{kthrd} waiting after each other, \io operations do not signal \texttt{pthread\_cond}s.
     123For \io results to wake a \proc waiting on a \texttt{pthread\_cond} means that a different \gls{kthrd} must be woken up first, and then the \proc can be signalled.
     124
     125\subsection{\texttt{io\_uring} and Epoll}
     126An alternative is to flip the problem on its head and block waiting for \io, using \texttt{io\_uring} or even \texttt{epoll}.
     127This creates the inverse situation, where \io operations directly wake sleeping \procs but waking a \proc from a running \gls{kthrd} must use an indirect scheme.
     128This generally takes the form of creating a file descriptor, \eg, a dummy file, a pipe or an event fd, and using that file descriptor when \procs need to wake each other.
     129This leads to additional complexity because there can be a race between these artificial \io operations and genuine \io operations.
     130If not handled correctly, this can lead to the artificial files going out of sync.
    137131
    138132\subsection{Event FDs}
    139133Another interesting approach is to use an event file descriptor\cit{eventfd}.
    140 This Linux feature is a file descriptor that behaves like \io, \ie, uses @read@ and @write@, but also behaves like a semaphore.
    141 Indeed, all reads and writes must use word-sized values, \ie 64 or 32 bits.
    142 Writes \emph{add} their values to a buffer using arithmetic addition versus buffer append, and reads zero out the buffer and return the buffer values so far.\footnote{
    143 This behaviour is without the \lstinline{EFD_SEMAPHORE} flag, which changes the behaviour of \lstinline{read} but is not needed for this work.}
     134This is a Linux feature that is a file descriptor that behaves like \io, \ie, uses \texttt{read} and \texttt{write}, but also behaves like a semaphore.
     135Indeed, all reads and writes must use 64-bit values\footnote{On 64-bit Linux; a 32-bit Linux would use 32-bit values.}.
     136Writes add their values to the buffer, that is arithmetic addition and not buffer append, and reads zero out the buffer and return the buffer values so far\footnote{This is without the \texttt{EFD\_SEMAPHORE} flag. This flag changes the behavior of \texttt{read} but is not needed for this work.}.
    144137If a read is made while the buffer is already 0, the read blocks until a non-0 value is added.
    145 What makes this feature particularly interesting is that @io_uring@ supports the @IORING_REGISTER_EVENTFD@ command to register an event @fd@ to a particular instance.
    146 Once that instance is registered, any \io completion results in @io_uring@ writing to the event @fd@.
    147 This means that a \proc waiting on the event @fd@ can be \emph{directly} woken up by either other \procs or incoming \io.
     138What makes this feature particularly interesting is that \texttt{io\_uring} supports the \texttt{IORING\_REGISTER\_EVENTFD} command, to register an event fd to a particular instance.
     139Once that instance is registered, any \io completion will result in \texttt{io\_uring} writing to the event FD.
     140This means that a \proc waiting on the event FD can be \emph{directly} woken up by either other \procs or incoming \io.
     141
     142\begin{figure}
     143        \centering
     144        \input{idle1.pstex_t}
     145        \caption[Basic Idle Sleep Data Structure]{Basic Idle Sleep Data Structure \smallskip\newline Each idle \proc is put onto a doubly-linked stack protected by a lock.
     146        Each \proc has a private event FD.}
     147        \label{fig:idle1}
     148\end{figure}
     149
    148150
    149151\section{Tracking Sleepers}
    150152Tracking which \procs are in idle sleep requires a data structure holding all the sleeping \procs, but more importantly it requires a concurrent \emph{handshake} so that no \at is stranded on a ready-queue with no active \proc.
    151 The classic challenge occurs when a \at is made ready while a \proc is going to sleep: there is a race where the new \at may not see the sleeping \proc and the sleeping \proc may not see the ready \at.
    152 Since \ats can be made ready by timers, \io operations, or other events outside a cluster, this race can occur even if the \proc going to sleep is the only \proc awake.
    153 As a result, improper handling of this race leads to all \procs going to sleep when there are ready \ats and the system deadlocks.
    154 
    155 The handshake closing the race is done with both the notifier and the idle \proc executing two ordered steps.
    156 The notifier first makes sure the newly ready \at is visible to \procs searching for \ats, and then attempts to notify an idle \proc.
    157 On the other side, \procs make themselves visible as idle \procs and then search for any \ats they may have missed.
    158 Unlike regular work-stealing, this search must be exhaustive to make sure that no pre-existing \at is missed.
    159 These steps from both sides guarantee that if the search misses a newly ready \at, then the notifier is guaranteed to see at least one idle \proc.
    160 Conversely, if the notifier does not see any idle \proc, then a \proc is guaranteed to find the new \at in its exhaustive search.
     153The classic challenge is when a \at is made ready while a \proc is going to sleep, there is a race where the new \at may not see the sleeping \proc and the sleeping \proc may not see the ready \at.
     154Since \ats can be made ready by timers, \io operations or other events outside a cluster, this race can occur even if the \proc going to sleep is the only \proc awake.
     155As a result, improper handling of this race can lead to all \procs going to sleep and the system deadlocking.
    161156
    162157Furthermore, the ``Race-to-Idle'' approach means that there may be contention on the data structure tracking sleepers.
    163 Contention can be tolerated for \procs attempting to sleep or wake-up because these \procs are not doing useful work, and therefore, not contributing to overall performance.
    164 However, notifying, checking if a \proc must be woken-up, and doing so if needed, can significantly affect overall performance and must be low cost.
     158Contention slowing down \procs attempting to sleep or wake-up can be tolerated.
     159These \procs are not doing useful work and therefore not contributing to overall performance.
     160However, notifying, checking if a \proc must be woken-up and doing so if needed, can significantly affect overall performance and must be low cost.
    165161
    166162\subsection{Sleepers List}
    167163Each cluster maintains a list of idle \procs, organized as a stack.
    168 This ordering allows \procs at the tail to stay in idle sleep for extended periods of time while those at the head of the list wake-up for bursts of activity.
    169 Because of unbalanced performance requirements, the algorithm tracking sleepers is designed to have idle \procs handle as much of the work as possible.
    170 The idle \procs maintain the stack of sleepers among themselves and notifying a sleeping \proc takes as little work as possible.
     164This ordering hopefully allows \procs at the tail to stay in idle sleep for extended periods of time.
     165Because of these unbalanced performance requirements, the algorithm tracking sleepers is designed to have idle \procs handle as much of the work as possible.
     166The idle \procs maintain the stack of sleepers among themselves and notifying a sleeping \proc takes as little work as possible.
    171167This approach means that maintaining the list is fairly straightforward.
    172 The list can simply use a single lock per cluster and only \procs that are getting in and out of the idle state contend for that lock.
     168The list can simply use a single lock per cluster and only \procs that are getting in and out of idle state will contend for that lock.
    173169
    174170This approach also simplifies notification.
    175 Indeed, \procs not only need to be notified when a new \at is readied, but also must be notified during manual resizing, so the \gls{kthrd} can be joined.
    176 These requirements mean whichever entity removes idle \procs from the sleeper list must be able to do so in any order.
     171Indeed, \procs need to be notified when a new \at is readied, but they also must be notified during resizing, so the \gls{kthrd} can be joined.
     172This means that whichever entity removes idle \procs from the sleeper list must be able to do so in any order.
    177173Using a simple lock over this data structure makes the removal much simpler than using a lock-free data structure.
    178 The single lock also means the notification process simply needs to wake-up the desired idle \proc, using @pthread_cond_signal@, @write@ on an @fd@, \etc, and the \proc handles the rest.
     174The notification process then simply needs to wake-up the desired idle \proc, using \texttt{pthread\_cond\_signal}, \texttt{write} on an fd, etc., and the \proc will handle the rest.
    179175
    180176\subsection{Reducing Latency}
    181 As mentioned in this section, \procs going to sleep for extremely short periods of time is likely in certain scenarios.
    182 Therefore, the latency of doing a system call to read from and writing to an event @fd@ can negatively affect overall performance in a notable way.
    183 Hence, it is important to reduce latency and contention of the notification as much as possible.
    184 Figure~\ref{fig:idle1} shows the basic idle-sleep data structure.
    185 For the notifiers, this data structure can cause contention on the lock and the event @fd@ syscall can cause notable latency.
    186 
    187 \begin{figure}
    188         \centering
    189         \input{idle1.pstex_t}
    190         \caption[Basic Idle Sleep Data Structure]{Basic Idle Sleep Data Structure \smallskip\newline Each idle \proc is put onto a doubly-linked stack protected by a lock.
    191         Each \proc has a private event \lstinline{fd}.}
    192         \label{fig:idle1}
    193 \end{figure}
    194 
    195 Contention occurs because the idle-list lock must be held to access the idle list, \eg by \procs attempting to go to sleep, \procs waking, or notification attempts.
    196 The contention from the \procs attempting to go to sleep can be mitigated slightly by using @try_acquire@, so the \procs simply busy wait again searching for \ats if the lock is held.
    197 This trick cannot be used when waking \procs since the waker needs to return immediately to what it was doing.
    198 
    199 Interestingly, general notification, \ie waking any idle processor versus a specific one, does not strictly require modifying the list.
    200 Here, contention can be reduced notably by having notifiers avoid the lock entirely by adding a pointer to the event @fd@ of the first idle \proc, as in Figure~\ref{fig:idle2}.
    201 To avoid contention among notifiers, notifiers atomically exchange the pointer with @NULL@.
    202 The first notifier succeeds on the exchange and obtains the @fd@ of an idle \proc;
    203 hence, only one notifier contends on the system call.
    204 This notifier writes to the @fd@ to wake a \proc.
    205 The woken \proc then updates the atomic pointer, while it is updating the head of the list, as it removes itself from the list.
    206 Notifiers that obtained a @NULL@ in the exchange simply move on knowing that another notifier is already waking a \proc.
    207 This behaviour is equivalent to having multiple notifiers write to the @fd@ since reads consume all previous writes.
    208 Note that with and without this atomic pointer, bursts of notification can lead to an unspecified number of \procs being woken up, depending on how the arrival notification compares with the latency of \procs waking up.
    209 As mentioned in section~\ref{idlesleep}, there is no optimal approach to handle these bursts.
    210 It is therefore difficult to justify the cost of any extra synchronization here.
    211 
    212 \begin{figure}[t]
     177As mentioned in this section, \procs going idle for extremely short periods of time is likely in certain common scenarios.
     178Therefore, the latency of doing a system call to read from and writing to the event fd can actually negatively affect overall performance in a notable way.
     179It is important to reduce latency and contention of the notification as much as possible.
     180Figure~\ref{fig:idle1} shows the basic idle sleep data structure.
     181For the notifiers, this data structure can cause contention on the lock and the event fd syscall can cause notable latency.
     182
     183\begin{figure}
    213184        \centering
    214185        \input{idle2.pstex_t}
    215         \caption[Improved Idle-Sleep Data Structure]{Improved Idle-Sleep Data Structure \smallskip\newline An atomic pointer is added to the list pointing to the Event FD of the first \proc on the list.}
     186        \caption[Improved Idle Sleep Data Structure]{Improved Idle Sleep Data Structure \smallskip\newline An atomic pointer is added to the list, pointing to the Event FD of the first \proc on the list.}
    216187        \label{fig:idle2}
    217188\end{figure}
    218189
    219 The next optimization is to avoid the latency of the event @fd@, which can be done by adding what is effectively a binary benaphore\cit{benaphore} in front of the event @fd@.
    220 The benaphore over the event @fd@ logically provides a three state flag to avoid unnecessary system calls, where the states are expressed explicitly in Figure~\ref{fig:idle:state}.
    221 A \proc begins its idle sleep by adding itself to the idle list before searching for an \at.
    222 In the process of adding itself to the idle list, it sets the state flag to @SEARCH@.
    223 If no \ats can be found during the search, the \proc then confirms it is going to sleep by atomically swapping the state to @SLEEP@.
    224 If the previous state is still @SEARCH@, then the \proc does read the event @fd@.
    225 Meanwhile, notifiers atomically exchange the state to @AWAKE@ state.
    226 If the previous state is @SLEEP@, then the notifier must write to the event @fd@.
    227 However, if the notify arrives almost immediately after the \proc marks itself idle, then both reads and writes on the event @fd@ can be omitted, which reduces latency notably.
    228 These extensions lead to the final data structure shown in Figure~\ref{fig:idle}.
     190The contention is mostly due to the lock on the list needing to be held to get to the head \proc.
     191That lock can be contended by \procs attempting to go to sleep, \procs waking or notification attempts.
     192The contention from the \procs attempting to go to sleep can be mitigated slightly by using \texttt{try\_acquire} instead, so the \procs simply continue searching for \ats if the lock is held.
     193This trick cannot be used for waking \procs since they are not in a state where they can run \ats.
     194However, it is worth noting that notification does not strictly require accessing the list or the head \proc.
     195Therefore, contention can be reduced notably by having notifiers avoid the lock entirely and adding a pointer to the event fd of the first idle \proc, as in Figure~\ref{fig:idle2}.
     196To avoid contention between the notifiers, instead of simply reading the atomic pointer, notifiers atomically exchange it to \texttt{null} so only one notifier will contend on the system call.
    229197
    230198\begin{figure}
    231199        \centering
    232200        \input{idle_state.pstex_t}
    233         \caption[Improved Idle-Sleep Latency]{Improved Idle-Sleep Latency \smallskip\newline A three state flag is added to the event \lstinline{fd}.}
     201        \caption[Improved Idle Sleep Latency]{Improved Idle Sleep Latency \smallskip\newline A three state flag is added to the event fd.}
    234202        \label{fig:idle:state}
    235203\end{figure}
     204
     205The next optimization that can be done is to avoid the latency of the event fd when possible.
     206This can be done by adding what is effectively a benaphore\cit{benaphore} in front of the event fd.
     207A simple three state flag is added beside the event fd to avoid unnecessary system calls, as shown in Figure~\ref{fig:idle:state}.
     208The flag starts in state \texttt{SEARCH}, while the \proc is searching for \ats to run.
     209The \proc then confirms the sleep by atomically swapping the state to \texttt{SLEEP}.
     210If the previous state was still \texttt{SEARCH}, then the \proc does read the event fd.
     211Meanwhile, notifiers atomically exchange the state to \texttt{AWAKE} state.
     212If the previous state was \texttt{SLEEP}, then the notifier must write to the event fd.
     213However, if the notify arrives almost immediately after the \proc marks itself idle, then both reads and writes on the event fd can be omitted, which reduces latency notably.
     214This leads to the final data structure shown in Figure~\ref{fig:idle}.
    236215
    237216\begin{figure}
     
    239218        \input{idle.pstex_t}
    240219        \caption[Low-latency Idle Sleep Data Structure]{Low-latency Idle Sleep Data Structure \smallskip\newline Each idle \proc is put onto a doubly-linked stack protected by a lock.
    241         Each \proc has a private event \lstinline{fd} with a benaphore in front of it.
    242         The list also has an atomic pointer to the event \lstinline{fd} and benaphore of the first \proc on the list.}
     220        Each \proc has a private event FD with a benaphore in front of it.
     221        The list also has an atomic pointer to the event fd and benaphore of the first \proc on the list.}
    243222        \label{fig:idle}
    244223\end{figure}
  • doc/theses/thierry_delisle_PhD/thesis/thesis.tex

    rdef751f r4e2befe3  
    108108        citecolor=OliveGreen,   % color of links to bibliography
    109109        filecolor=magenta,      % color of file links
    110         urlcolor=blue,           % color of external links
    111         breaklinks=true
     110        urlcolor=cyan           % color of external links
    112111}
    113112\ifthenelse{\boolean{PrintVersion}}{   % for improved print quality, change some hyperref options
  • libcfa/Makefile.am

    rdef751f r4e2befe3  
    1818ACLOCAL_AMFLAGS  = -I automake
    1919SUBDIRS = prelude src      # order important
    20 
    21 DISTCLEANFILES = config.data
  • libcfa/src/Makefile.am

    rdef751f r4e2befe3  
    216216nobase_cfa_include_HEADERS = ${stdhdr} ${inst_headers_src} ${inst_headers_nosrc} ${inst_thread_headers_src} ${inst_thread_headers_nosrc}
    217217EXTRA_DIST = stdhdr
    218 DISTCLEANFILES = $(libdeps) $(thread_libdeps)
    219218
    220219#----------------------------------------------------------------------------------------------------------------
     
    222221        -rm -rf ${CFA_INCDIR} ${CFA_LIBDIR}
    223222
    224 #distclean-local:
    225 #       find ${builddir} -path '*.Plo' -delete
     223distclean-local:
     224        find ${builddir} -path '*.Plo' -delete
    226225
    227226
  • src/AST/Expr.cpp

    rdef751f r4e2befe3  
    272272        // Adjust the length of the string for the terminator.
    273273        const Expr * strSize = from_ulong( loc, str.size() + 1 );
    274         const Type * strType = new ArrayType( charType, strSize, FixedLen, DynamicDim );
     274        const Type * strType = new ArrayType( charType, strSize, FixedLen, StaticDim );
    275275        const std::string strValue = "\"" + str + "\"";
    276276        return new ConstantExpr( loc, strType, strValue, std::nullopt );
  • src/AST/Pass.hpp

    rdef751f r4e2befe3  
    264264        __pass::result1<ast::Stmt> call_accept_as_compound(const ast::Stmt *);
    265265
    266         // requests type environment to be updated (why is it implemented like this?)
    267         __pass::result1<ast::Expr> call_accept_top(const ast::Expr *);
    268 
    269266        template< template <class...> class container_t >
    270267        __pass::resultNstmt<container_t> call_accept( const container_t< ptr<Stmt> > & );
     
    280277        template<typename node_t, typename parent_t, typename field_t>
    281278        void maybe_accept_as_compound(const node_t * &, field_t parent_t::* field);
    282 
    283         template<typename node_t, typename parent_t, typename field_t>
    284         void maybe_accept_top(const node_t * &, field_t parent_t::* field);
    285279
    286280private:
  • src/AST/Pass.impl.hpp

    rdef751f r4e2befe3  
    155155                __pedantic_pass_assert( expr );
    156156
     157                const ast::TypeSubstitution ** typeSubs_ptr = __pass::typeSubs( core, 0 );
     158                if ( typeSubs_ptr && expr->env ) {
     159                        *typeSubs_ptr = expr->env;
     160                }
     161
    157162                auto nval = expr->accept( *this );
    158163                return { nval != expr, nval };
     
    166171                const ast::Stmt * nval = stmt->accept( *this );
    167172                return { nval != stmt, nval };
    168         }
    169 
    170         template< typename core_t >
    171         __pass::template result1<ast::Expr> ast::Pass< core_t >::call_accept_top( const ast::Expr * expr ) {
    172                 __pedantic_pass_assert( __visit_children() );
    173                 __pedantic_pass_assert( expr );
    174 
    175                 const ast::TypeSubstitution ** typeSubs_ptr = __pass::typeSubs( core, 0 );
    176                 if ( typeSubs_ptr && expr->env ) {
    177                         *typeSubs_ptr = expr->env;
    178                 }
    179 
    180                 auto nval = expr->accept( *this );
    181                 return { nval != expr, nval };
    182173        }
    183174
     
    419410
    420411                auto new_val = call_accept( old_val );
    421 
    422                 static_assert( !std::is_same<const ast::Node *, decltype(new_val)>::value /* || std::is_same<int, decltype(old_val)>::value */, "ERROR");
    423 
    424                 if( new_val.differs ) {
    425                         auto new_parent = __pass::mutate<core_t>(parent);
    426                         new_val.apply(new_parent, field);
    427                         parent = new_parent;
    428                 }
    429         }
    430 
    431         template< typename core_t >
    432         template<typename node_t, typename super_t, typename field_t>
    433         void ast::Pass< core_t >::maybe_accept_top(
    434                 const node_t * & parent,
    435                 field_t super_t::*field
    436         ) {
    437                 static_assert( std::is_base_of<super_t, node_t>::value, "Error deducing member object" );
    438 
    439                 if(__pass::skip(parent->*field)) return;
    440                 const auto & old_val = __pass::get(parent->*field, 0);
    441 
    442                 static_assert( !std::is_same<const ast::Node * &, decltype(old_val)>::value, "ERROR");
    443 
    444                 auto new_val = call_accept_top( old_val );
    445412
    446413                static_assert( !std::is_same<const ast::Node *, decltype(new_val)>::value /* || std::is_same<int, decltype(old_val)>::value */, "ERROR");
     
    789756
    790757        if ( __visit_children() ) {
    791                 maybe_accept_top( node, &StaticAssertDecl::cond );
     758                maybe_accept( node, &StaticAssertDecl::cond );
    792759                maybe_accept( node, &StaticAssertDecl::msg  );
    793760        }
     
    831798
    832799        if ( __visit_children() ) {
    833                 maybe_accept_top( node, &ExprStmt::expr );
     800                maybe_accept( node, &ExprStmt::expr );
    834801        }
    835802
     
    872839                guard_symtab guard { *this };
    873840                maybe_accept( node, &IfStmt::inits    );
    874                 maybe_accept_top( node, &IfStmt::cond     );
     841                maybe_accept( node, &IfStmt::cond     );
    875842                maybe_accept_as_compound( node, &IfStmt::then );
    876843                maybe_accept_as_compound( node, &IfStmt::else_ );
     
    890857                guard_symtab guard { *this };
    891858                maybe_accept( node, &WhileDoStmt::inits );
    892                 maybe_accept_top( node, &WhileDoStmt::cond  );
     859                maybe_accept( node, &WhileDoStmt::cond  );
    893860                maybe_accept_as_compound( node, &WhileDoStmt::body  );
    894861        }
     
    908875                // xxx - old ast does not create WithStmtsToAdd scope for loop inits. should revisit this later.
    909876                maybe_accept( node, &ForStmt::inits );
    910                 maybe_accept_top( node, &ForStmt::cond  );
    911                 maybe_accept_top( node, &ForStmt::inc   );
     877                maybe_accept( node, &ForStmt::cond  );
     878                maybe_accept( node, &ForStmt::inc   );
    912879                maybe_accept_as_compound( node, &ForStmt::body  );
    913880        }
     
    923890
    924891        if ( __visit_children() ) {
    925                 maybe_accept_top( node, &SwitchStmt::cond  );
     892                maybe_accept( node, &SwitchStmt::cond  );
    926893                maybe_accept( node, &SwitchStmt::cases );
    927894        }
     
    937904
    938905        if ( __visit_children() ) {
    939                 maybe_accept_top( node, &CaseClause::cond  );
     906                maybe_accept( node, &CaseClause::cond  );
    940907                maybe_accept( node, &CaseClause::stmts );
    941908        }
     
    959926
    960927        if ( __visit_children() ) {
    961                 maybe_accept_top( node, &ReturnStmt::expr );
     928                maybe_accept( node, &ReturnStmt::expr );
    962929        }
    963930
     
    1004971                guard_symtab guard { *this };
    1005972                maybe_accept( node, &CatchClause::decl );
    1006                 maybe_accept_top( node, &CatchClause::cond );
     973                maybe_accept( node, &CatchClause::cond );
    1007974                maybe_accept_as_compound( node, &CatchClause::body );
    1008975        }
     
    20912058
    20922059        if ( __visit_children() ) {
    2093                 maybe_accept_top( node, &SingleInit::value );
     2060                maybe_accept( node, &SingleInit::value );
    20942061        }
    20952062
  • src/AST/SymbolTable.cpp

    rdef751f r4e2befe3  
    6565
    6666Expr * SymbolTable::IdData::combine( const CodeLocation & loc, ResolvExpr::Cost & cost ) const {
    67         Expr * ret;
    68         if ( baseExpr ) {
    69                 if (baseExpr->env) {
    70                         Expr * base = shallowCopy(baseExpr);
    71                         const TypeSubstitution * subs = baseExpr->env;
    72                         base->env = nullptr;
    73                         ret = new MemberExpr{loc, id, referenceToRvalueConversion( base, cost )};
    74                         ret->env = subs;
    75                 }
    76                 else {
    77                         ret = new MemberExpr{ loc, id, referenceToRvalueConversion( baseExpr, cost ) };
    78                 }
    79         }
    80         else {
    81                 ret = new VariableExpr{ loc, id };
    82         }
     67        Expr * ret = ( baseExpr ) ?
     68                (Expr *)new MemberExpr{ loc, id, referenceToRvalueConversion( baseExpr, cost ) } :
     69                (Expr *)new VariableExpr{ loc, id };
    8370        if ( deleter ) { ret = new DeletedExpr{ loc, ret, deleter }; }
    8471        return ret;
     
    785772                                                && ! dynamic_cast<const UnionInstType *>(rty) ) continue;
    786773                                        ResolvExpr::Cost cost = ResolvExpr::Cost::zero;
    787                                         ast::ptr<ast::TypeSubstitution> tmp = expr->env;
    788                                         expr = mutate_field(expr, &Expr::env, nullptr);
    789774                                        const Expr * base = ResolvExpr::referenceToRvalueConversion( expr, cost );
    790                                         base = mutate_field(base, &Expr::env, tmp);
    791 
    792775                                        addMembers(
    793776                                                rty->aggr(), new MemberExpr{ base->location, dwt, base }, handleConflicts );
  • src/AST/TypeSubstitution.cpp

    rdef751f r4e2befe3  
    9797                TypeSubstitution * newEnv;
    9898                EnvTrimmer( const TypeSubstitution * env, TypeSubstitution * newEnv ) : env( env ), newEnv( newEnv ){}
    99                 void previsit( const FunctionType * ftype ) {
     99                void previsit( FunctionType * ftype ) {
    100100                        // transfer known bindings for seen type variables
    101101                        for (auto & formal : ftype->forall) {
  • src/CodeGen/FixNames.cc

    rdef751f r4e2befe3  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // FixNames.cc -- Adjustments to typed declarations.
     7// FixNames.cc --
    88//
    99// Author           : Richard C. Bilson
    1010// Created On       : Mon May 18 07:44:20 2015
    1111// Last Modified By : Andrew Beach
    12 // Last Modified On : Wed Jul 20 11:49:00 2022
    13 // Update Count     : 24
     12// Last Modified On : Fri Oct 29 15:49:00 2021
     13// Update Count     : 23
    1414//
    1515
     
    8787
    8888/// Does work with the main function and scopeLevels.
    89 class FixNames_new final {
     89class FixNames_new : public ast::WithGuards {
    9090        int scopeLevel = 1;
    9191
     
    103103
    104104        const ast::FunctionDecl *postvisit( const ast::FunctionDecl *functionDecl ) {
     105                // This store is used to ensure a maximum of one call to mutate.
     106                ast::FunctionDecl * mutDecl = nullptr;
     107
     108                if ( shouldSetScopeLevel( functionDecl ) ) {
     109                        mutDecl = ast::mutate( functionDecl );
     110                        mutDecl->scopeLevel = scopeLevel;
     111                }
     112
    105113                if ( FixMain::isMain( functionDecl ) ) {
    106                         auto mutDecl = ast::mutate( functionDecl );
    107 
    108                         if ( shouldSetScopeLevel( mutDecl ) ) {
    109                                 mutDecl->scopeLevel = scopeLevel;
    110                         }
     114                        if ( !mutDecl ) { mutDecl = ast::mutate( functionDecl ); }
    111115
    112116                        int nargs = mutDecl->params.size();
     
    120124                                )
    121125                        );
    122 
    123                         return mutDecl;
    124                 } else if ( shouldSetScopeLevel( functionDecl ) ) {
    125                         return ast::mutate_field( functionDecl, &ast::FunctionDecl::scopeLevel, scopeLevel );
    126                 } else {
    127                         return functionDecl;
    128126                }
     127                return mutDecl ? mutDecl : functionDecl;
    129128        }
    130129
    131130        void previsit( const ast::CompoundStmt * ) {
    132                 scopeLevel += 1;
    133         }
    134 
    135         void postvisit( const ast::CompoundStmt * ) {
    136                 scopeLevel -= 1;
     131                GuardValue( scopeLevel ) += 1;
    137132        }
    138133};
  • src/CodeGen/FixNames.h

    rdef751f r4e2befe3  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // FixNames.h -- Adjustments to typed declarations.
     7// FixNames.h --
    88//
    99// Author           : Richard C. Bilson
     
    2626        /// mangles object and function names
    2727        void fixNames( std::list< Declaration* > & translationUnit );
    28 /// Sets scope levels and fills in main's default return.
    29 void fixNames( ast::TranslationUnit & translationUnit );
     28        void fixNames( ast::TranslationUnit & translationUnit );
    3029} // namespace CodeGen
    3130
  • src/Concurrency/Keywords.h

    rdef751f r4e2befe3  
    2828        void implementThreadStarter( std::list< Declaration * > & translationUnit );
    2929
    30 /// Implement the sue-like keywords and the suspend keyword. Pre-Autogen
     30/// Implement the sue-like keywords and the suspend keyword.
    3131void implementKeywords( ast::TranslationUnit & translationUnit );
    32 /// Implement the mutex parameters and mutex statement. Post-Autogen
     32/// Implement the mutex parameters and mutex statement.
    3333void implementMutex( ast::TranslationUnit & translationUnit );
    34 /// Add the thread starter code to constructors. Post-Autogen
     34/// Add the thread starter code to constructors.
    3535void implementThreadStarter( ast::TranslationUnit & translationUnit );
    3636};
  • src/ControlStruct/ExceptDecl.cc

    rdef751f r4e2befe3  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // ExceptDecl.cc -- Handles declarations of exception types.
     7// ExceptDecl.cc --
    88//
    99// Author           : Henry Xue
  • src/ControlStruct/ExceptDecl.h

    rdef751f r4e2befe3  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // ExceptDecl.h -- Handles declarations of exception types.
     7// ExceptDecl.h --
    88//
    99// Author           : Henry Xue
    1010// Created On       : Tue Jul 20 04:10:50 2021
    11 // Last Modified By : Andrew Beach
    12 // Last Modified On : Tue Jul 12 15:49:00 2022
    13 // Update Count     : 2
     11// Last Modified By : Henry Xue
     12// Last Modified On : Tue Jul 20 04:10:50 2021
     13// Update Count     : 1
    1414//
    1515
     
    2020class Declaration;
    2121
    22 namespace ast {
    23         class TranslationUnit;
     22namespace ControlStruct {
     23        void translateExcept( std::list< Declaration *> & translationUnit );
    2424}
    25 
    26 namespace ControlStruct {
    27 /// Unfold exception declarations into raw structure declarations.
    28 /// Also builds vtable declarations and converts vtable types.
    29 void translateExcept( std::list< Declaration *> & translationUnit );
    30 void translateExcept( ast::TranslationUnit & translationUnit );
    31 }
  • src/ControlStruct/HoistControlDecls.hpp

    rdef751f r4e2befe3  
    2121
    2222namespace ControlStruct {
    23 /// Hoist declarations out of control flow statements into compound statement.
    24 /// Must happen before auto-gen routines are added.
     23// Hoist declarations out of control flow statements into compound statement.
    2524void hoistControlDecls( ast::TranslationUnit & translationUnit );
    2625} // namespace ControlStruct
  • src/ControlStruct/MultiLevelExit.cpp

    rdef751f r4e2befe3  
    149149};
    150150
    151 NullStmt * labelledNullStmt( const CodeLocation & cl, const Label & label ) {
     151NullStmt * labelledNullStmt(
     152        const CodeLocation & cl, const Label & label ) {
    152153        return new NullStmt( cl, vector<Label>{ label } );
    153154}
     
    163164
    164165const CompoundStmt * MultiLevelExitCore::previsit(
    165                 const CompoundStmt * stmt ) {
     166        const CompoundStmt * stmt ) {
    166167        visit_children = false;
    167168
     
    188189}
    189190
    190 size_t getUnusedIndex( const Stmt * stmt, const Label & originalTarget ) {
     191size_t getUnusedIndex(
     192        const Stmt * stmt, const Label & originalTarget ) {
    191193        const size_t size = stmt->labels.size();
    192194
     
    208210}
    209211
    210 const Stmt * addUnused( const Stmt * stmt, const Label & originalTarget ) {
     212const Stmt * addUnused(
     213        const Stmt * stmt, const Label & originalTarget ) {
    211214        size_t i = getUnusedIndex( stmt, originalTarget );
    212215        if ( i == stmt->labels.size() ) {
     
    353356
    354357// Mimic what the built-in push_front would do anyways. It is O(n).
    355 void push_front( vector<ptr<Stmt>> & vec, const Stmt * element ) {
     358void push_front(
     359        vector<ptr<Stmt>> & vec, const Stmt * element ) {
    356360        vec.emplace_back( nullptr );
    357361        for ( size_t i = vec.size() - 1 ; 0 < i ; --i ) {
     
    586590
    587591                ptr<Stmt> else_stmt = nullptr;
    588                 const Stmt * loop_kid = nullptr;
     592                Stmt * loop_kid = nullptr;
    589593                // check if loop node and if so add else clause if it exists
    590                 const WhileDoStmt * whilePtr = kid.as<WhileDoStmt>();
    591                 if ( whilePtr && whilePtr->else_ ) {
     594                const WhileDoStmt * whilePtr = dynamic_cast<const WhileDoStmt *>(kid.get());
     595                if ( whilePtr && whilePtr->else_) {
    592596                        else_stmt = whilePtr->else_;
    593                         loop_kid = mutate_field( whilePtr, &WhileDoStmt::else_, nullptr );
    594                 }
    595                 const ForStmt * forPtr = kid.as<ForStmt>();
    596                 if ( forPtr && forPtr->else_ ) {
     597                        WhileDoStmt * mutate_ptr = mutate(whilePtr);
     598                        mutate_ptr->else_ = nullptr;
     599                        loop_kid = mutate_ptr;
     600                }
     601                const ForStmt * forPtr = dynamic_cast<const ForStmt *>(kid.get());
     602                if ( forPtr && forPtr->else_) {
    597603                        else_stmt = forPtr->else_;
    598                         loop_kid = mutate_field( forPtr, &ForStmt::else_, nullptr );
     604                        ForStmt * mutate_ptr = mutate(forPtr);
     605                        mutate_ptr->else_ = nullptr;
     606                        loop_kid = mutate_ptr;
    599607                }
    600608
  • src/ControlStruct/module.mk

    rdef751f r4e2befe3  
    1717SRC += \
    1818        ControlStruct/ExceptDecl.cc \
    19         ControlStruct/ExceptDeclNew.cpp \
    2019        ControlStruct/ExceptDecl.h \
    2120        ControlStruct/ExceptTranslateNew.cpp \
  • src/GenPoly/Box.cc

    rdef751f r4e2befe3  
    189189                        /// Enters a new scope for type-variables, adding the type variables from ty
    190190                        void beginTypeScope( Type *ty );
     191                        /// Exits the type-variable scope
     192                        void endTypeScope();
    191193                        /// Enters a new scope for knowLayouts and knownOffsets and queues exit calls
    192194                        void beginGenericScope();
     
    196198                        UniqueName bufNamer;                           ///< Namer for VLA buffers
    197199                        Expression * addrMember = nullptr;             ///< AddressExpr argument is MemberExpr?
    198                         bool expect_func_type = false;                 ///< used to avoid recursing too deep in type decls
    199200                };
    200201
     
    14181419                void PolyGenericCalculator::beginGenericScope() {
    14191420                        GuardScope( *this );
    1420                         // We expect the first function type see to be the type relating to this scope
    1421                         // but any further type is probably some unrelated function pointer
    1422                         // keep track of which is the first
    1423                         GuardValue( expect_func_type );
    1424                         expect_func_type = true;
    14251421                }
    14261422
     
    14721468                void PolyGenericCalculator::premutate( FunctionType *funcType ) {
    14731469                        beginTypeScope( funcType );
    1474 
    1475                         GuardValue( expect_func_type );
    1476 
    1477                         if(!expect_func_type) {
    1478                                 GuardAction( [this]() {
    1479                                         knownLayouts.endScope();
    1480                                         knownOffsets.endScope();
    1481                                 });
    1482                                 // If this is the first function type we see
    1483                                 // Then it's the type of the declaration and we care about it
    1484                                 knownLayouts.beginScope();
    1485                                 knownOffsets.beginScope();
    1486                         }
    1487 
    1488                         // The other functions type we will see in this scope are probably functions parameters
    1489                         // they don't help us with the layout and offsets so don't mark them as known in this scope
    1490                         expect_func_type = false;
    14911470
    14921471                        // make sure that any type information passed into the function is accounted for
     
    17671746                                }
    17681747
    1769                                 // std::cout << "TRUE 2" << std::endl;
    1770 
    17711748                                return true;
    17721749                        } else if ( UnionInstType *unionTy = dynamic_cast< UnionInstType* >( ty ) ) {
  • src/GenPoly/Specialize.h

    rdef751f r4e2befe3  
    1717
    1818#include <list>  // for list
    19 #include "AST/TranslationUnit.hpp"
    2019
    2120class Declaration;
     
    2423        /// generates thunks where needed
    2524        void convertSpecializations( std::list< Declaration* >& translationUnit );
    26 
    27         void convertSpecializations( ast::TranslationUnit & translationUnit );
    2825} // namespace GenPoly
    2926
  • src/GenPoly/module.mk

    rdef751f r4e2befe3  
    3434        GenPoly/ScrubTyVars.h \
    3535        GenPoly/Specialize.cc \
    36         GenPoly/SpecializeNew.cpp \
    3736        GenPoly/Specialize.h
    3837
  • src/InitTweak/FixInitNew.cpp

    rdef751f r4e2befe3  
    7373        /// wrap function application expressions as ImplicitCopyCtorExpr nodes so that it is easy to identify which
    7474        /// function calls need their parameters to be copy constructed
    75         struct InsertImplicitCalls : public ast::WithShortCircuiting {
     75        struct InsertImplicitCalls : public ast::WithConstTypeSubstitution, public ast::WithShortCircuiting {
    7676                const ast::Expr * postvisit( const ast::ApplicationExpr * appExpr );
    7777
     
    457457                // is needed to obtain the type of temporary variables so that copy
    458458                // constructor calls can be resolved.
     459                assert( typeSubs );
    459460                expr->env = tmp;
    460461                return expr;
  • src/InitTweak/GenInit.cc

    rdef751f r4e2befe3  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // GenInit.cc -- Generate initializers, and other stuff.
     7// GenInit.cc --
    88//
    99// Author           : Rob Schluntz
  • src/InitTweak/GenInit.h

    rdef751f r4e2befe3  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // GenInit.h -- Generate initializers, and other stuff.
     7// GenInit.h --
    88//
    99// Author           : Rodolfo G. Esteves
     
    2929        void genInit( ast::TranslationUnit & translationUnit );
    3030
    31         /// Converts return statements into copy constructor calls on the hidden return variable.
    32         /// This pass must happen before auto-gen.
     31        /// Converts return statements into copy constructor calls on the hidden return variable
    3332        void fixReturnStatements( std::list< Declaration * > & translationUnit );
    3433        void fixReturnStatements( ast::TranslationUnit & translationUnit );
  • src/ResolvExpr/CandidateFinder.cpp

    rdef751f r4e2befe3  
    12631263                                        newExpr, copy( tenv ), ast::OpenVarSet{}, ast::AssertionSet{}, Cost::zero,
    12641264                                        cost );
    1265 
    1266                                 if (newCand->expr->env) {
    1267                                         newCand->env.add(*newCand->expr->env);
    1268                                         auto mutExpr = newCand->expr.get_and_mutate();
    1269                                         mutExpr->env  = nullptr;
    1270                                         newCand->expr = mutExpr;
    1271                                 }
    1272 
    12731265                                PRINT(
    12741266                                        std::cerr << "decl is ";
  • src/ResolvExpr/Resolver.cc

    rdef751f r4e2befe3  
    15551555                if ( type->dimension ) {
    15561556                        ast::ptr< ast::Type > sizeType = context.global.sizeType;
    1557                         ast::ptr< ast::Expr > dimension = findSingleExpression( type->dimension, sizeType, context );
    1558                         assertf(dimension->env->empty(), "array dimension expr has nonempty env");
    1559                         dimension.get_and_mutate()->env = nullptr;
    15601557                        ast::mutate_field(
    15611558                                type, &PtrType::dimension,
    1562                                 dimension);
     1559                                findSingleExpression( type->dimension, sizeType, context ) );
    15631560                }
    15641561                return type;
     
    20112008                                tmp->accept( *visitor );
    20122009                        }
    2013                         else if (expr->env && expr->env->empty()) {
    2014                                 expr = ast::mutate_field(expr.get(), &ast::Expr::env, nullptr);
    2015                         }
    20162010                }
    20172011        }
  • src/Tuples/Tuples.cc

    rdef751f r4e2befe3  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // Tuples.cc -- A collection of tuple operations.
     7// Tuples.cc --
    88//
    99// Author           : Andrew Beach
  • src/Tuples/Tuples.h

    rdef751f r4e2befe3  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // Tuples.h -- A collection of tuple operations.
     7// Tuples.h --
    88//
    99// Author           : Rodolfo G. Esteves
  • src/Validate/Autogen.hpp

    rdef751f r4e2befe3  
    2222namespace Validate {
    2323
    24 /// Generate routines for all data types in the translation unit.
    25 /// A lot of passes have to happen either before or after this pass.
    2624void autogenerateRoutines( ast::TranslationUnit & translationUnit );
    2725
  • src/Validate/CompoundLiteral.hpp

    rdef751f r4e2befe3  
    2323
    2424/// Use variables to implement compound literals.
    25 /// Must happen after auto-gen routines are added.
    2625void handleCompoundLiterals( ast::TranslationUnit & translationUnit );
    2726
  • src/Validate/EnumAndPointerDecay.cpp

    rdef751f r4e2befe3  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // EnumAndPointerDecay.cpp -- Normalizes enumerations and types in functions.
     7// EnumAndPointerDecay.cpp --
    88//
    99// Author           : Andrew Beach
  • src/Validate/EnumAndPointerDecay.hpp

    rdef751f r4e2befe3  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // EnumAndPointerDecay.hpp -- Normalizes enumerations and types in functions.
     7// EnumAndPointerDecay.hpp --
    88//
    99// Author           : Andrew Beach
     
    2222namespace Validate {
    2323
    24 /// Fix the parameter and return types of functions. Also assigns types to
    25 /// enumeration values. This must happen before Link Reference to Types,
    26 /// it needs correct types for mangling, and before auto-gen.
    2724void decayEnumsAndPointers( ast::TranslationUnit & translationUnit );
    2825
  • src/Validate/FindSpecialDecls.h

    rdef751f r4e2befe3  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // FindSpecialDeclarations.h -- Find special declarations used in the compiler.
     7// FindSpecialDeclarations.h --
    88//
    99// Author           : Rob Schluntz
     
    4343        void findSpecialDecls( std::list< Declaration * > & translationUnit );
    4444
    45 /// Find and remember some of the special declarations that are useful for
     45/// find and remember some of the special declarations that are useful for
    4646/// generating code, so that they do not have to be discovered multiple times.
    4747void findGlobalDecls( ast::TranslationUnit & translationUnit );
  • src/Validate/FixQualifiedTypes.cpp

    rdef751f r4e2befe3  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // FixQualifiedTypes.cpp -- Replace the qualified type with a direct type.
     7// FixQualifiedTypes.cpp --
    88//
    99// Author           : Andrew Beach
     
    7676                                                        ret->qualifiers = type->qualifiers;
    7777                                                        ast::TypeSubstitution sub( aggr->params, instp->params );
     78                                                        // = parent->genericSubstitution();
    7879                                                        auto result = sub.apply(ret);
    7980                                                        return result.node.release();
  • src/Validate/FixQualifiedTypes.hpp

    rdef751f r4e2befe3  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // FixQualifiedTypes.hpp -- Replace the qualified type with a direct type.
     7// FixQualifiedTypes.hpp --
    88//
    99// Author           : Andrew Beach
     
    2222namespace Validate {
    2323
    24 /// Replaces qualified types with an unqualified NamedTypeDecl.
    25 /// Must happen after Link References To Types,
    26 /// because aggregate members are accessed.
    2724void fixQualifiedTypes( ast::TranslationUnit & translationUnit );
    2825
  • src/Validate/FixReturnTypes.cpp

    rdef751f r4e2befe3  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // FixReturnTypes.cpp -- Unifies the representation of return types.
     7// FixReturnTypes.cpp --
    88//
    99// Author           : Andrew Beach
  • src/Validate/FixReturnTypes.hpp

    rdef751f r4e2befe3  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // FixReturnTypes.hpp -- Unifies the representation of return types.
     7// FixReturnTypes.hpp --
    88//
    99// Author           : Andrew Beach
     
    2222namespace Validate {
    2323
    24 /// This pass needs to happen early so that other passes can find tuple types
    25 /// in the right places, especially for function return types.
    26 /// Must happen before auto-gen.
     24// This pass needs to happen early so that other passes can find tuple types
     25// in the right places, especially for function return types.
    2726void fixReturnTypes( ast::TranslationUnit & translationUnit );
    2827
  • src/Validate/ForallPointerDecay.hpp

    rdef751f r4e2befe3  
    2929/// Also checks that operator names are used properly on functions and
    3030/// assigns unique IDs. This is a "legacy" pass.
    31 /// Must be after implement concurrent keywords; because uniqueIds must be
    32 /// set on declaration before resolution.
    33 /// Must happen before auto-gen routines are added.
    3431void decayForallPointers( ast::TranslationUnit & transUnit );
    3532
  • src/Validate/GenericParameter.cpp

    rdef751f r4e2befe3  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // GenericParameter.hpp -- Generic parameter related passes.
     7// GenericParameter.hpp --
    88//
    99// Author           : Andrew Beach
  • src/Validate/GenericParameter.hpp

    rdef751f r4e2befe3  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // GenericParameter.hpp -- Generic parameter related passes.
     7// GenericParameter.hpp --
    88//
    99// Author           : Andrew Beach
     
    2323
    2424/// Perform substitutions for generic parameters and fill in defaults.
    25 /// Check as early as possible, but it can't happen before Link References to
    26 /// Types and observed failing when attempted before eliminate typedef.
    2725void fillGenericParameters( ast::TranslationUnit & translationUnit );
    2826
  • src/Validate/HoistStruct.hpp

    rdef751f r4e2befe3  
    2222namespace Validate {
    2323
    24 /// Flattens nested type declarations. (Run right after Fix Qualified Types.)
     24/// Flattens nested type declarations.
    2525void hoistStruct( ast::TranslationUnit & translationUnit );
    2626
  • src/Validate/HoistTypeDecls.cpp

    rdef751f r4e2befe3  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // HoistTypeDecls.cpp -- Hoists declarations of implicitly declared types.
     7// HoistTypeDecls.cpp --
    88//
    99// Author           : Andrew Beach
  • src/Validate/HoistTypeDecls.hpp

    rdef751f r4e2befe3  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // HoistTypeDecls.hpp -- Hoists declarations of implicitly declared types.
     7// HoistTypeDecls.hpp --
    88//
    99// Author           : Andrew Beach
     
    2222namespace Validate {
    2323
    24 /// There are some places where a type can be declared but are usually only
    25 /// referenced (with an *InstType). This inserts the declarations before
    26 /// they are referenced.
    2724void hoistTypeDecls( ast::TranslationUnit & translationUnit );
    2825
  • src/Validate/LabelAddressFixer.cpp

    rdef751f r4e2befe3  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // LabelAddressFixer.cpp -- Create label address expressions.
     7// LabelAddressFixer.cpp --
    88//
    99// Author           : Andrew Beach
  • src/Validate/LabelAddressFixer.hpp

    rdef751f r4e2befe3  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // LabelAddressFixer.hpp -- Create label address expressions.
     7// LabelAddressFixer.hpp --
    88//
    99// Author           : Andrew Beach
     
    2020namespace Validate {
    2121
    22 /// Label addresses are not actually created in the parser, this pass finds
    23 /// the patterns that represent the label address expression.
    2422void fixLabelAddresses( ast::TranslationUnit & translationUnit );
    2523
  • src/Validate/LinkReferenceToTypes.hpp

    rdef751f r4e2befe3  
    2222namespace Validate {
    2323
    24 /// Fills in the base value of various instance types, and some related
    25 /// adjustments, such as setting the sized flag.
    26 /// Because of the sized flag, it must happen before auto-gen.
    2724void linkReferenceToTypes( ast::TranslationUnit & translationUnit );
    2825
  • src/Validate/ReplaceTypedef.cpp

    rdef751f r4e2befe3  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // ReplaceTypedef.cpp -- Fill in all typedefs with the underlying type.
     7// ReplaceTypedef.cpp --
    88//
    99// Author           : Andrew Beach
    1010// Created On       : Tue Jun 29 14:59:00 2022
    1111// Last Modified By : Andrew Beach
    12 // Last Modified On : Wed Jul 13 14:45:00 2022
    13 // Update Count     : 1
     12// Last Modified On : Mon Jul 12 14:17:00 2022
     13// Update Count     : 0
    1414//
    1515
     
    6363        void previsit( ast::TraitDecl const * );
    6464
     65        void previsit( ast::FunctionType const * );
     66
    6567        template<typename AggrDecl>
    6668        void addImplicitTypedef( AggrDecl * aggDecl );
     
    7678        CodeLocation const * nearestLocation = nullptr;
    7779        int scopeLevel;
    78         bool isAtFunctionTop = false;
     80        bool inFunctionType = false;
    7981};
    8082
     
    103105                ast::Type * ret = ast::deepCopy( def->second.first->base );
    104106                ret->qualifiers |= type->qualifiers;
    105                 // We ignore certain attributes on function parameters if they arrive
    106                 // by typedef. GCC appears to do the same thing.
    107                 if ( isAtFunctionTop ) {
     107                // GCC ignores certain attributes if they arrive by typedef,
     108                // this mimics that.
     109                // TODO: This might cover too much, it should just cover arguments
     110                //   and return values of a function.
     111                if ( visitor->isInFunction() ) {
    108112                        erase_if( ret->attributes, isNonParameterAttribute );
    109113                }
     
    203207        GuardScope( typedefNames );
    204208        GuardScope( typedeclNames );
    205         GuardValue( isAtFunctionTop ) = true;
    206209}
    207210
     
    259262        GuardScope( typedefNames );
    260263        GuardScope( typedeclNames );
    261         GuardValue( isAtFunctionTop ) = false;
    262264        scopeLevel += 1;
    263265}
     
    290292        GuardScope( typedefNames );
    291293        GuardScope( typedeclNames );
     294}
     295
     296void ReplaceTypedefCore::previsit( ast::FunctionType const * ) {
     297        GuardValue( inFunctionType ) = true;
    292298}
    293299
  • src/Validate/ReplaceTypedef.hpp

    rdef751f r4e2befe3  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // ReplaceTypedef.hpp -- Fill in all typedefs with the underlying type.
     7// ReplaceTypedef.hpp --
    88//
    99// Author           : Andrew Beach
     
    2222namespace Validate {
    2323
    24 /// Uses of typedef are replaced with the type in the typedef.
    2524void replaceTypedef( ast::TranslationUnit & translationUnit );
    2625
  • src/Validate/VerifyCtorDtorAssign.cpp

    rdef751f r4e2befe3  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // VerifyCtorDtorAssign.cpp -- Check the form of operators.
     7// VerifyCtorDtorAssign.cpp --
    88//
    99// Author           : Andrew Beach
  • src/Validate/VerifyCtorDtorAssign.hpp

    rdef751f r4e2befe3  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // VerifyCtorDtorAssign.hpp -- Check the form of operators.
     7// VerifyCtorDtorAssign.hpp --
    88//
    99// Author           : Andrew Beach
     
    2222namespace Validate {
    2323
    24 /// Check that constructors, destructors and assignments all have the correct
    25 /// form. Must happen before auto-gen or anything that examines operators.
    2624void verifyCtorDtorAssign( ast::TranslationUnit & translationUnit );
    2725
  • src/Virtual/Tables.h

    rdef751f r4e2befe3  
    1919#include "AST/Fwd.hpp"
    2020class Declaration;
     21class StructDecl;
    2122class Expression;
    22 class FunctionDecl;
    23 class Initializer;
    24 class ObjectDecl;
    25 class StructDecl;
    26 class StructInstType;
    27 class Type;
    2823
    2924namespace Virtual {
  • src/main.cc

    rdef751f r4e2befe3  
    1010// Created On       : Fri May 15 23:12:02 2015
    1111// Last Modified By : Andrew Beach
    12 // Last Modified On : Mon Jul 18 11:08:00 2022
    13 // Update Count     : 676
     12// Last Modified On : Tue Jul 12 12:02:00 2022
     13// Update Count     : 675
    1414//
    1515
     
    330330                Stats::Time::StopBlock();
    331331
     332                PASS( "Translate Exception Declarations", ControlStruct::translateExcept( translationUnit ) );
     333                if ( exdeclp ) {
     334                        dump( translationUnit );
     335                        return EXIT_SUCCESS;
     336                } // if
     337
     338                CodeTools::fillLocations( translationUnit );
     339
    332340                if( useNewAST ) {
     341                        CodeTools::fillLocations( translationUnit );
     342
    333343                        if (Stats::Counters::enabled) {
    334344                                ast::pass_visitor_stats.avg = Stats::Counters::build<Stats::Counters::AverageCounter<double>>("Average Depth - New");
     
    339349                        forceFillCodeLocations( transUnit );
    340350
    341                         PASS( "Translate Exception Declarations", ControlStruct::translateExcept( transUnit ) );
    342                         if ( exdeclp ) {
    343                                 dump( move( transUnit ) );
    344                                 return EXIT_SUCCESS;
    345                         }
    346 
     351                        // Must happen before auto-gen, or anything that examines ops.
    347352                        PASS( "Verify Ctor, Dtor & Assign", Validate::verifyCtorDtorAssign( transUnit ) );
     353
    348354                        PASS( "Hoist Type Decls", Validate::hoistTypeDecls( transUnit ) );
    349355                        // Hoist Type Decls pulls some declarations out of contexts where
     
    353359
    354360                        PASS( "Replace Typedefs", Validate::replaceTypedef( transUnit ) );
     361
     362                        // Must happen before auto-gen.
    355363                        PASS( "Fix Return Types", Validate::fixReturnTypes( transUnit ) );
     364
     365                        // Must happen before Link Reference to Types, it needs correct
     366                        // types for mangling.
    356367                        PASS( "Enum and Pointer Decay", Validate::decayEnumsAndPointers( transUnit ) );
    357368
     369                        // Must happen before auto-gen, because it uses the sized flag.
    358370                        PASS( "Link Reference To Types", Validate::linkReferenceToTypes( transUnit ) );
    359371
     372                        // Must happen after Link References To Types,
     373                        // because aggregate members are accessed.
    360374                        PASS( "Fix Qualified Types", Validate::fixQualifiedTypes( transUnit ) );
     375
    361376                        PASS( "Hoist Struct", Validate::hoistStruct( transUnit ) );
    362377                        PASS( "Eliminate Typedef", Validate::eliminateTypedef( transUnit ) );
     378
     379                        // Check as early as possible. Can't happen before
     380                        // LinkReferenceToType, observed failing when attempted
     381                        // before eliminateTypedef
    363382                        PASS( "Validate Generic Parameters", Validate::fillGenericParameters( transUnit ) );
     383
    364384                        PASS( "Translate Dimensions", Validate::translateDimensionParameters( transUnit ) );
    365385                        PASS( "Check Function Returns", Validate::checkReturnStatements( transUnit ) );
     386
     387                        // Must happen before Autogen.
    366388                        PASS( "Fix Return Statements", InitTweak::fixReturnStatements( transUnit ) );
     389
    367390                        PASS( "Implement Concurrent Keywords", Concurrency::implementKeywords( transUnit ) );
     391
     392                        // Must be after implement concurrent keywords; because uniqueIds
     393                        //   must be set on declaration before resolution.
     394                        // Must happen before autogen routines are added.
    368395                        PASS( "Forall Pointer Decay", Validate::decayForallPointers( transUnit ) );
     396
     397                        // Must happen before autogen routines are added.
    369398                        PASS( "Hoist Control Declarations", ControlStruct::hoistControlDecls( transUnit ) );
    370399
     400                        // Must be after enum and pointer decay.
     401                        // Must be before compound literals.
    371402                        PASS( "Generate Autogen Routines", Validate::autogenerateRoutines( transUnit ) );
    372403
     
    439470                        PASS( "Translate Tries", ControlStruct::translateTries( transUnit ) );
    440471                        PASS( "Gen Waitfor", Concurrency::generateWaitFor( transUnit ) );
    441                         PASS( "Convert Specializations",  GenPoly::convertSpecializations( transUnit ) ); // needs to happen before tuple types are expanded
    442 
    443472
    444473                        translationUnit = convert( move( transUnit ) );
    445474                } else {
    446                         PASS( "Translate Exception Declarations", ControlStruct::translateExcept( translationUnit ) );
    447                         if ( exdeclp ) {
    448                                 dump( translationUnit );
    449                                 return EXIT_SUCCESS;
    450                         } // if
    451 
    452475                        // add the assignment statement after the initialization of a type parameter
    453476                        PASS( "Validate", SymTab::validate( translationUnit ) );
     
    515538                        PASS( "Translate Tries", ControlStruct::translateTries( translationUnit ) );
    516539                        PASS( "Gen Waitfor", Concurrency::generateWaitFor( translationUnit ) );
    517                         PASS( "Convert Specializations",  GenPoly::convertSpecializations( translationUnit ) ); // needs to happen before tuple types are expanded
    518 
    519540                }
    520541
    521 
    522                 // PASS( "Convert Specializations",  GenPoly::convertSpecializations( translationUnit ) ); // needs to happen before tuple types are expanded
     542                PASS( "Convert Specializations",  GenPoly::convertSpecializations( translationUnit ) ); // needs to happen before tuple types are expanded
    523543
    524544                PASS( "Expand Tuples", Tuples::expandTuples( translationUnit ) ); // xxx - is this the right place for this?
  • tests/alloc2.cfa

    rdef751f r4e2befe3  
    1111typedef struct S1 T1;
    1212
    13 void test_base( void * ip, size_t size, size_t align ) {
     13void test_base( void * ip, size_t size, size_t align) {
    1414        tests_total += 1;
    15 //      printf( "DEBUG: starting test %d\n", tests_total);
    16         bool passed = (malloc_size( ip ) == size) && (malloc_usable_size( ip ) >= size) && (malloc_alignment( ip ) == align) && ((uintptr_t)ip % align  == 0);
    17         if ( ! passed ) {
    18                 printf( "failed test %3d: %4zu %4zu but got %4zu ( %3zu ) %4zu\n", tests_total, size, align, malloc_size( ip ), malloc_usable_size( ip ), malloc_alignment( ip ) );
     15//      printf("DEBUG: starting test %d\n", tests_total);
     16        bool passed = (malloc_size(ip) == size) && (malloc_usable_size(ip) >= size) && (malloc_alignment(ip) == align) && ((uintptr_t)ip % align  == 0);
     17        if (!passed) {
     18                printf("failed test %3d: %4zu %4zu but got %4zu ( %3zu ) %4zu\n", tests_total, size, align, malloc_size(ip), malloc_usable_size(ip), malloc_alignment(ip));
    1919                tests_failed += 1;
    20         } // if
    21 //      printf( "DEBUG: done test %d\n", tests_total);
     20        }
     21//      printf("DEBUG: done test %d\n", tests_total);
    2222}
    2323
    24 void test_fill( void * ip_, size_t start, size_t end, char fill ) {
     24void test_fill( void * ip_, size_t start, size_t end, char fill) {
    2525        tests_total += 1;
    26 //      printf( "DEBUG: starting test %d\n", tests_total );
     26//      printf("DEBUG: starting test %d\n", tests_total);
    2727        bool passed = true;
    2828        char * ip = (char *) ip_;
    29         for ( i; start ~ end ) passed = passed && (ip[i] == fill);
    30         if ( ! passed ) {
    31                 printf( "failed test %3d: fill C\n", tests_total );
     29        for (i; start ~ end) passed = passed && (ip[i] == fill);
     30        if (!passed) {
     31                printf("failed test %3d: fill C\n", tests_total);
    3232                tests_failed += 1;
    33         } // if
    34 //      printf( "DEBUG: done test %d\n", tests_total );
     33        }
     34//      printf("DEBUG: done test %d\n", tests_total);
    3535}
    3636
    37 void test_fill( void * ip_, size_t start, size_t end, int fill ) {
     37void test_fill( void * ip_, size_t start, size_t end, int fill) {
    3838        tests_total += 1;
    39 //      printf( "DEBUG: starting test %d\n", tests_total );
     39//      printf("DEBUG: starting test %d\n", tests_total);
    4040        bool passed = true;
    41         int * ip = (int *)ip_;
    42         for (i; start ~ end ) passed = passed && (ip[i] == fill);
    43         if ( ! passed ) {
    44                 printf( "failed test %3d: fill int\n", tests_total );
     41        int * ip = (int *) ip_;
     42        for (i; start ~ end) passed = passed && (ip[i] == fill);
     43        if (!passed) {
     44                printf("failed test %3d: fill int\n", tests_total);
    4545                tests_failed += 1;
    46         } // if
    47 //      printf( "DEBUG: done test %d\n", tests_total );
     46        }
     47//      printf("DEBUG: done test %d\n", tests_total);
    4848}
    4949
    50 void test_fill( void * ip_, size_t start, size_t end, int * fill ) {
     50void test_fill( void * ip_, size_t start, size_t end, int * fill) {
    5151        tests_total += 1;
    52 //      printf( "DEBUG: starting test %d\n", tests_total );
    53         bool passed = memcmp((void*)((uintptr_t )ip_ + start ), (void*)fill, end ) == 0;
    54         if ( ! passed ) {
    55                 printf( "failed test %3d: fill int A\n", tests_total );
     52//      printf("DEBUG: starting test %d\n", tests_total);
     53        bool passed = (memcmp((void*)((uintptr_t)ip_ + start), (void*)fill, end) == 0);
     54        if (!passed) {
     55                printf("failed test %3d: fill int A\n", tests_total);
    5656                tests_failed += 1;
    57         } // if
    58 //      printf( "DEBUG: done test %d\n", tests_total );
     57        }
     58//      printf("DEBUG: done test %d\n", tests_total);
    5959}
    6060
    61 void test_fill( void * ip_, size_t start, size_t end, T1 fill ) {
     61void test_fill( void * ip_, size_t start, size_t end, T1 fill) {
    6262        tests_total += 1;
    63 //      printf( "DEBUG: starting test %d\n", tests_total );
     63//      printf("DEBUG: starting test %d\n", tests_total);
    6464        bool passed = true;
    6565        T1 * ip = (T1 *) ip_;
    66         for ( i; start ~ end ) passed = passed && (ip[i].data == fill.data );
    67         if ( ! passed ) {
    68                 printf( "failed test %3d: fill T1\n", tests_total );
     66        for (i; start ~ end) passed = passed && (ip[i].data == fill.data);
     67        if (!passed) {
     68                printf("failed test %3d: fill T1\n", tests_total);
    6969                tests_failed += 1;
    70         } // if
    71 //      printf( "DEBUG: done test %d\n", tests_total );
     70        }
     71//      printf("DEBUG: done test %d\n", tests_total);
    7272}
    7373
    74 void test_fill( void * ip_, size_t start, size_t end, T1 * fill ) {
     74void test_fill( void * ip_, size_t start, size_t end, T1 * fill) {
    7575        tests_total += 1;
    76 //      printf( "DEBUG: starting test %d\n", tests_total );
    77         bool passed = memcmp( (void*)((uintptr_t )ip_ + start ), (void*)fill, end ) == 0;
    78         if ( ! passed ) {
    79                 printf( "failed test %3d: fill T1 A\n", tests_total );
     76//      printf("DEBUG: starting test %d\n", tests_total);
     77        bool passed = (memcmp((void*)((uintptr_t)ip_ + start), (void*)fill, end) == 0);
     78        if (!passed) {
     79                printf("failed test %3d: fill T1 A\n", tests_total);
    8080                tests_failed += 1;
    81         } // if
    82 //      printf( "DEBUG: done test %d\n", tests_total );
     81        }
     82//      printf("DEBUG: done test %d\n", tests_total);
    8383}
    8484
    85 void test_use( int * ip, size_t dim ) {
     85void test_use( int * ip, size_t dim) {
    8686        tests_total += 1;
    87 //      printf( "DEBUG: starting test %d\n", tests_total );
     87//      printf("DEBUG: starting test %d\n", tests_total);
    8888        bool passed = true;
    89         for ( i; 0 ~ dim ) ip[i] = 0xdeadbeef;
    90         for ( i; 0 ~ dim ) passed = passed &&  (ip[i] == 0xdeadbeef);
    91         if ( ! passed ) {
    92                 printf( "failed test %3d: use int\n", tests_total );
     89        for (i; 0 ~ dim) ip[i] = 0xdeadbeef;
     90        for (i; 0 ~ dim) passed = passed &&  (ip[i] == 0xdeadbeef);
     91        if (!passed) {
     92                printf("failed test %3d: use int\n", tests_total);
    9393                tests_failed += 1;
    94         } // if
    95 //      printf( "DEBUG: done test %d\n", tests_total );
     94        }
     95//      printf("DEBUG: done test %d\n", tests_total);
    9696}
    9797
    98 void test_use( T1 * ip, size_t dim ) {
     98void test_use( T1 * ip, size_t dim) {
    9999        tests_total += 1;
    100 //      printf( "DEBUG: starting test %d\n", tests_total );
     100//      printf("DEBUG: starting test %d\n", tests_total);
    101101        bool passed = true;
    102         for ( i; 0 ~ dim ) ip[i].data = 0xdeadbeef;
    103         for ( i; 0 ~ dim ) passed = passed &&  (ip[i].data == 0xdeadbeef);
    104         if ( ! passed ) {
    105                 printf( "failed test %3d: use T1\n", tests_total );
     102        for (i; 0 ~ dim) ip[i].data = 0xdeadbeef;
     103        for (i; 0 ~ dim) passed = passed &&  (ip[i].data == 0xdeadbeef);
     104        if (!passed) {
     105                printf("failed test %3d: use T1\n", tests_total);
    106106                tests_failed += 1;
    107         } // if
    108 //      printf( "DEBUG: done test %d\n", tests_total );
     107        }
     108//      printf("DEBUG: done test %d\n", tests_total);
    109109}
    110110
    111111int main( void ) {
    112         enum { dim = 8, align = 64, libAlign = libAlign() };
    113112        size_t elemSize = sizeof(int);
     113        size_t dim = 8;
    114114        size_t size = dim * elemSize;
    115 
    116         int FillT = 9;
    117         char FillC = 'a';
    118         int * FillA = calloc( dim / 4 );
    119         T1 FillT1 = { FillT };
    120         T1 * FillT1A = (T1 *)(void *) malloc( (dim / 4) * sizeof(T1) );
    121         for ( i; 0 ~ (dim / 4) ) FillT1A[i] = FillT1;
    122 
    123         int * ip;
    124         int * op;
    125         double * dp;
    126         T1 * t1p;
    127         T1 * t1op;
     115        size_t align = 64;
     116        const size_t libAlign = libAlign();
     117
     118        int     FillT = 9;
     119        char    FillC = 'a';
     120        int   * FillA = calloc(dim / 4);
     121        T1          FillT1 = { FillT };
     122        T1        * FillT1A = (T1 *)(void *) malloc( (dim / 4) * sizeof(T1) );
     123        for (i; 0 ~ (dim / 4) ) FillT1A[i] = FillT1;
     124
     125        int             * ip;
     126        int     * op;
     127        double  * dp;
     128        T1      * t1p;
     129        T1          * t1op;
    128130
    129131        // testing alloc
     
    134136
    135137        ip = alloc();
    136         test_base( ip, elemSize, libAlign );
    137         test_use( ip, elemSize / elemSize );
    138         free( ip );
     138        test_base(ip, elemSize, libAlign);
     139        test_use(ip, elemSize / elemSize);
     140        free(ip);
    139141
    140142        ip = alloc( dim );
    141         test_base( ip, size, libAlign );
    142         test_use( ip, size / elemSize );
    143         free( ip );
     143        test_base(ip, size, libAlign);
     144        test_use(ip, size / elemSize);
     145        free(ip);
    144146
    145147        ip = alloc( 0 );
    146         test_base( ip, 0, libAlign );
    147         free( ip );
     148        test_base(ip, 0, libAlign);
     149        free(ip);
    148150
    149151        dp = alloc( dim );
    150152        ip = alloc( dp`resize );
    151         test_base( ip, elemSize, libAlign );
    152         test_use( ip, elemSize / elemSize );
    153         free( ip );
    154 
    155         ip = alloc( ((double *)0p)`resize );
    156         test_base( ip, elemSize, libAlign );
    157         test_use( ip, elemSize / elemSize );
    158         free( ip );
     153        test_base(ip, elemSize, libAlign);
     154        test_use(ip, elemSize / elemSize);
     155        free(ip);
     156
     157        ip = alloc( ((double*)0p)`resize );
     158        test_base(ip, elemSize, libAlign);
     159        test_use(ip, elemSize / elemSize);
     160        free(ip);
    159161
    160162        dp = alloc( dim );
    161163        ip = alloc( dim, dp`resize );
    162         test_base( ip, size, libAlign );
    163         test_use( ip, size / elemSize );
    164         free( ip );
     164        test_base(ip, size, libAlign);
     165        test_use(ip, size / elemSize);
     166        free(ip);
    165167
    166168        dp = alloc( dim );
    167169        ip = alloc( 0, dp`resize );
    168         test_base( ip, 0, libAlign );
    169         free( ip );
    170 
    171         ip = alloc( dim, 0p`resize );
    172         test_base( ip, size, libAlign );
    173         test_use( ip, size / elemSize );
    174         free( ip );
    175 
    176         ip = alloc( 0, 0p`resize );
    177         test_base( ip, 0, libAlign );
    178         free( ip );
    179 
    180         op = alloc( dim, 0xdeadbeefN`fill );
     170        test_base(ip, 0, libAlign);
     171        free(ip);
     172
     173        ip = alloc( dim, ((double*)0p)`resize );
     174        test_base(ip, size, libAlign);
     175        test_use(ip, size / elemSize);
     176        free(ip);
     177
     178        ip = alloc( 0, ((double*)0p)`resize );
     179        test_base(ip, 0, libAlign);
     180        free(ip);
     181
     182        op = alloc( dim, ((int)0xdeadbeef)`fill );
    181183        ip = alloc( dim, op`realloc );
    182         test_base( ip, size, libAlign );
    183         test_fill( ip, 0, dim, 0xdeadbeefN );
    184         test_use( ip, size / elemSize );
    185         free( ip );
    186 
    187         op = alloc( dim, 0xdeadbeefN`fill );
     184        test_base(ip, size, libAlign);
     185        test_fill(ip, 0, dim, (int)0xdeadbeef);
     186        test_use(ip, size / elemSize);
     187        free(ip);
     188
     189        op = alloc( dim, ((int)0xdeadbeef)`fill );
    188190        ip = alloc( 0, op`realloc );
    189         test_base( ip, 0, libAlign );
    190         free( ip );
    191 
    192         ip = alloc( dim, 0p`realloc );
    193         test_base( ip, size, libAlign );
    194         test_use( ip, size / elemSize );
    195         free( ip );
    196 
    197         ip = alloc( 0, 0p`realloc );
    198         test_base( ip, 0, libAlign );
    199         free( ip );
    200 
    201         op = alloc( dim, 0xdeadbeefN`fill );
     191        test_base(ip, 0, libAlign);
     192        free(ip);
     193
     194        ip = alloc( dim, ((int*)0p)`realloc );
     195        test_base(ip, size, libAlign);
     196        test_use(ip, size / elemSize);
     197        free(ip);
     198
     199        ip = alloc( 0, ((int*)0p)`realloc );
     200        test_base(ip, 0, libAlign);
     201        free(ip);
     202
     203        op = alloc( dim, ((int)0xdeadbeef)`fill );
    202204        ip = alloc( dim, op`resize );
    203         test_base( ip, size, libAlign );
    204         test_use( ip, size / elemSize );
    205         free( ip );
     205        test_base(ip, size, libAlign);
     206        test_use(ip, size / elemSize);
     207        free(ip);
    206208
    207209        ip = alloc( FillC`fill );
    208         test_base( ip, elemSize, libAlign );
    209         test_fill( ip, 0, elemSize, FillC );
    210         test_use( ip, elemSize / elemSize );
    211         free( ip );
     210        test_base(ip, elemSize, libAlign);
     211        test_fill(ip, 0, elemSize, FillC);
     212        test_use(ip, elemSize / elemSize);
     213        free(ip);
    212214
    213215        ip = alloc( FillT`fill );
    214         test_base( ip, elemSize, libAlign );
    215         test_fill( ip, 0, 1, FillT );
    216         test_use( ip, elemSize / elemSize );
    217         free( ip );
     216        test_base(ip, elemSize, libAlign);
     217        test_fill(ip, 0, 1, FillT);
     218        test_use(ip, elemSize / elemSize);
     219        free(ip);
    218220
    219221        ip = alloc( dim, FillC`fill );
    220         test_base( ip, size, libAlign );
    221         test_fill( ip, 0, size, FillC );
    222         test_use( ip, size / elemSize );
    223         free( ip );
     222        test_base(ip, size, libAlign);
     223        test_fill(ip, 0, size, FillC);
     224        test_use(ip, size / elemSize);
     225        free(ip);
    224226
    225227        ip = alloc( 0, FillC`fill );
    226         test_base( ip, 0, libAlign );
    227         free( ip );
     228        test_base(ip, 0, libAlign);
     229        free(ip);
    228230
    229231        ip = alloc( dim, FillT`fill );
    230         test_base( ip, size, libAlign );
    231         test_fill( ip, 0, dim, FillT );
    232         test_use( ip, size / elemSize );
    233         free( ip );
     232        test_base(ip, size, libAlign);
     233        test_fill(ip, 0, dim, FillT);
     234        test_use(ip, size / elemSize);
     235        free(ip);
    234236
    235237        ip = alloc( 0, FillT`fill );
    236         test_base( ip, 0, libAlign );
    237         free( ip );
     238        test_base(ip, 0, libAlign);
     239        free(ip);
    238240
    239241        ip = alloc( dim, [FillA, dim/4]`fill );
    240         test_base( ip, size, libAlign );
    241         test_fill( ip, 0, size/4, FillA );
    242         test_use( ip, size / elemSize );
    243         free( ip );
     242        test_base(ip, size, libAlign);
     243        test_fill(ip, 0, size/4, FillA);
     244        test_use(ip, size / elemSize);
     245        free(ip);
    244246
    245247        ip = alloc( 0, [FillA, dim/4]`fill );
    246         test_base( ip, 0, libAlign );
    247         free( ip );
    248 
    249         op = alloc( dim, 0xdeadbeefN`fill );
     248        test_base(ip, 0, libAlign);
     249        free(ip);
     250
     251        op = alloc( dim, ((int)0xdeadbeef)`fill );
    250252        ip = alloc( dim, op`realloc, FillC`fill );
    251         test_base( ip, size, libAlign );
    252         test_fill( ip, 0, dim, 0xdeadbeefN );
    253         test_use( ip, size / elemSize );
    254         free( ip );
    255 
    256         op = alloc( dim, 0xdeadbeefN`fill );
     253        test_base(ip, size, libAlign);
     254        test_fill(ip, 0, dim, (int)0xdeadbeef);
     255        test_use(ip, size / elemSize);
     256        free(ip);
     257
     258        op = alloc( dim, ((int)0xdeadbeef)`fill );
    257259        ip = alloc( dim / 4, op`realloc, FillC`fill );
    258         test_base( ip, size / 4, libAlign );
    259         test_fill( ip, 0, dim / 4, 0xdeadbeefN );
    260         test_use( ip, size / 4 / elemSize );
    261         free( ip );
    262 
    263         op = alloc( dim, 0xdeadbeefN`fill );
     260        test_base(ip, size / 4, libAlign);
     261        test_fill(ip, 0, dim / 4, (int)0xdeadbeef);
     262        test_use(ip, size / 4 / elemSize);
     263        free(ip);
     264
     265        op = alloc( dim, ((int)0xdeadbeef)`fill );
    264266        ip = alloc( dim * 4, op`realloc, FillC`fill );
    265         test_base( ip, size * 4, libAlign );
    266         test_fill( ip, 0, dim, 0xdeadbeefN );
    267         test_fill( ip, size, size * 4, FillC );
    268         test_use( ip, size * 4 / elemSize );
    269         free( ip );
    270 
    271         op = alloc( dim, 0xdeadbeefN`fill );
     267        test_base(ip, size * 4, libAlign);
     268        test_fill(ip, 0, dim, (int)0xdeadbeef);
     269        test_fill(ip, size, size * 4, FillC);
     270        test_use(ip, size * 4 / elemSize);
     271        free(ip);
     272
     273        op = alloc( dim, ((int)0xdeadbeef)`fill );
    272274        ip = alloc( 0, op`realloc, FillC`fill );
    273         test_base( ip, 0, libAlign );
    274         free( ip );
    275 
    276         ip = alloc( dim, 0p`realloc, FillC`fill );
    277         test_base( ip, size, libAlign );
    278         test_fill( ip, 0, size, FillC );
    279         test_use( ip, size / elemSize );
    280         free( ip );
    281 
    282         ip = alloc( 0, 0p`realloc, FillC`fill );
    283         test_base( ip, 0, libAlign );
    284         free( ip );
    285 
    286         op = alloc( dim, 0xdeadbeefN`fill );
     275        test_base(ip, 0, libAlign);
     276        free(ip);
     277
     278        ip = alloc( dim, ((int*)0p)`realloc, FillC`fill );
     279        test_base(ip, size, libAlign);
     280        test_fill(ip, 0, size, FillC);
     281        test_use(ip, size / elemSize);
     282        free(ip);
     283
     284        ip = alloc( 0, ((int*)0p)`realloc, FillC`fill );
     285        test_base(ip, 0, libAlign);
     286        free(ip);
     287
     288        op = alloc( dim, ((int)0xdeadbeef)`fill );
    287289        ip = alloc( dim, op`realloc, FillT`fill );
    288         test_base( ip, size, libAlign );
    289         test_fill( ip, 0, dim, 0xdeadbeefN );
    290         test_use( ip, size / elemSize );
    291         free( ip );
    292 
    293         op = alloc( dim, 0xdeadbeefN`fill );
     290        test_base(ip, size, libAlign);
     291        test_fill(ip, 0, dim, (int)0xdeadbeef);
     292        test_use(ip, size / elemSize);
     293        free(ip);
     294
     295        op = alloc( dim, ((int)0xdeadbeef)`fill );
    294296        ip = alloc( dim / 4, op`realloc, FillT`fill );
    295         test_base( ip, size / 4, libAlign );
    296         test_fill( ip, 0, dim / 4, 0xdeadbeefN );
    297         test_use( ip, size / 4 / elemSize );
    298         free( ip );
    299 
    300         op = alloc( dim, 0xdeadbeefN`fill );
     297        test_base(ip, size / 4, libAlign);
     298        test_fill(ip, 0, dim / 4, (int)0xdeadbeef);
     299        test_use(ip, size / 4 / elemSize);
     300        free(ip);
     301
     302        op = alloc( dim, ((int)0xdeadbeef)`fill );
    301303        ip = alloc( dim * 4, op`realloc, FillT`fill );
    302         test_base( ip, size * 4, libAlign );
    303         test_fill( ip, 0, dim, 0xdeadbeefN );
    304         test_fill( ip, dim, dim * 4, FillT );
    305         test_use( ip, size * 4 / elemSize );
    306         free( ip );
    307 
    308         op = alloc( dim, 0xdeadbeefN`fill );
     304        test_base(ip, size * 4, libAlign);
     305        test_fill(ip, 0, dim, (int)0xdeadbeef);
     306        test_fill(ip, dim, dim * 4, FillT);
     307        test_use(ip, size * 4 / elemSize);
     308        free(ip);
     309
     310        op = alloc( dim, ((int)0xdeadbeef)`fill );
    309311        ip = alloc( 0, op`realloc, FillT`fill );
    310         test_base( ip, 0, libAlign );
    311         free( ip );
    312 
    313         ip = alloc( dim, 0p`realloc, FillT`fill );
    314         test_base( ip, size, libAlign );
    315         test_fill( ip, 0, dim, FillT );
    316         test_use( ip, size / elemSize );
    317         free( ip );
    318 
    319         ip = alloc( 0, 0p`realloc, FillT`fill );
    320         test_base( ip, 0, libAlign );
    321         free( ip );
     312        test_base(ip, 0, libAlign);
     313        free(ip);
     314
     315        ip = alloc( dim, ((int*)0p)`realloc, FillT`fill );
     316        test_base(ip, size, libAlign);
     317        test_fill(ip, 0, dim, FillT);
     318        test_use(ip, size / elemSize);
     319        free(ip);
     320
     321        ip = alloc( 0, ((int*)0p)`realloc, FillT`fill );
     322        test_base(ip, 0, libAlign);
     323        free(ip);
    322324
    323325        ip = alloc( align`align );
    324         test_base( ip, elemSize, align );
    325         test_use( ip, elemSize / elemSize );
    326         free( ip );
     326        test_base(ip, elemSize, align);
     327        test_use(ip, elemSize / elemSize);
     328        free(ip);
    327329
    328330        ip = alloc( dim, align`align );
    329         test_base( ip, size, align );
    330         test_use( ip, size / elemSize );
    331         free( ip );
     331        test_base(ip, size, align);
     332        test_use(ip, size / elemSize);
     333        free(ip);
    332334
    333335        ip = alloc( 0, align`align );
    334         test_base( ip, 0, libAlign );
    335         free( ip );
    336 
    337         op = alloc( dim, 0xdeadbeefN`fill );
     336        test_base(ip, 0, libAlign);
     337        free(ip);
     338
     339        op = alloc( dim, ((int)0xdeadbeef)`fill );
    338340        ip = alloc( op`realloc, align`align );
    339         test_base( ip, elemSize, align );
    340         test_fill( ip, 0, 1, 0xdeadbeefN );
    341         test_use( ip, elemSize / elemSize );
    342         free( ip );
    343 
    344         ip = alloc( 0p`realloc, align`align );
    345         test_base( ip, elemSize, align );
    346         test_use( ip, elemSize / elemSize );
    347         free( ip );
     341        test_base(ip, elemSize, align);
     342        test_fill(ip, 0, 1, (int)0xdeadbeef);
     343        test_use(ip, elemSize / elemSize);
     344        free(ip);
     345
     346        ip = alloc( ((int*)0p)`realloc, align`align );
     347        test_base(ip, elemSize, align);
     348        test_use(ip, elemSize / elemSize);
     349        free(ip);
    348350
    349351        dp = alloc( dim );
    350352        ip = alloc( dp`resize, align`align );
    351         test_base( ip, elemSize, align );
    352         test_use( ip, elemSize / elemSize );
    353         free( ip );
    354 
    355         ip = alloc( 0p`resize, align`align );
    356         test_base( ip, elemSize, align );
    357         test_use( ip, elemSize / elemSize );
    358         free( ip );
    359 
    360         op = alloc( dim, 0xdeadbeefN`fill );
     353        test_base(ip, elemSize, align);
     354        test_use(ip, elemSize / elemSize);
     355        free(ip);
     356
     357        ip = alloc( ((double*)0p)`resize, align`align );
     358        test_base(ip, elemSize, align);
     359        test_use(ip, elemSize / elemSize);
     360        free(ip);
     361
     362        op = alloc( dim, ((int)0xdeadbeef)`fill);
    361363        ip = alloc( dim, op`realloc, align`align );
    362         test_base( ip, size, align );
    363         test_fill( ip, 0, dim, 0xdeadbeefN );
    364         test_use( ip, size / elemSize );
    365         free( ip );
    366 
    367         op = alloc( dim, 0xdeadbeefN`fill );
     364        test_base(ip, size, align);
     365        test_fill(ip, 0, dim, (int)0xdeadbeef);
     366        test_use(ip, size / elemSize);
     367        free(ip);
     368
     369        op = alloc( dim, ((int)0xdeadbeef)`fill );
    368370        ip = alloc( 0, op`realloc, align`align );
    369         test_base( ip, 0, libAlign );
    370         free( ip );
    371 
    372         ip = alloc( dim, 0p`realloc, align`align );
    373         test_base( ip, size, align );
    374         test_use( ip, size / elemSize );
    375         free( ip );
    376 
    377         ip = alloc( 0, 0p`realloc, align`align );
    378         test_base( ip, 0, libAlign );
    379         free( ip );
     371        test_base(ip, 0, libAlign);
     372        free(ip);
     373
     374        ip = alloc( dim, ((int*)0p)`realloc, align`align );
     375        test_base(ip, size, align);
     376        test_use(ip, size / elemSize);
     377        free(ip);
     378
     379        ip = alloc( 0, ((int*)0p)`realloc, align`align );
     380        test_base(ip, 0, libAlign);
     381        free(ip);
    380382
    381383        ip = alloc( align`align, FillC`fill );
    382         test_base( ip, elemSize, align );
    383         test_fill( ip, 0, elemSize, FillC );
    384         test_use( ip, elemSize / elemSize );
    385         free( ip );
     384        test_base(ip, elemSize, align);
     385        test_fill(ip, 0, elemSize, FillC);
     386        test_use(ip, elemSize / elemSize);
     387        free(ip);
    386388
    387389        ip = alloc( align`align, FillT`fill );
    388         test_base( ip, elemSize, align );
    389         test_fill( ip, 0, 1, FillT );
    390         test_use( ip, elemSize / elemSize );
    391         free( ip );
     390        test_base(ip, elemSize, align);
     391        test_fill(ip, 0, 1, FillT);
     392        test_use(ip, elemSize / elemSize);
     393        free(ip);
    392394
    393395        ip = alloc( dim, align`align, FillC`fill );
    394         test_base( ip, size, align );
    395         test_fill( ip, 0, size, FillC );
    396         test_use( ip, size / elemSize );
    397         free( ip );
     396        test_base(ip, size, align);
     397        test_fill(ip, 0, size, FillC);
     398        test_use(ip, size / elemSize);
     399        free(ip);
    398400
    399401        ip = alloc( 0, align`align, FillC`fill );
    400         test_base( ip, 0, libAlign );
    401         free( ip );
     402        test_base(ip, 0, libAlign);
     403        free(ip);
    402404
    403405        ip = alloc( dim, align`align, FillT`fill );
    404         test_base( ip, size, align );
    405         test_fill( ip, 0, dim, FillT );
    406         test_use( ip, size / elemSize );
    407         free( ip );
     406        test_base(ip, size, align);
     407        test_fill(ip, 0, dim, FillT);
     408        test_use(ip, size / elemSize);
     409        free(ip);
    408410
    409411        ip = alloc( 0, align`align, FillT`fill );
    410         test_base( ip, 0, libAlign );
    411         free( ip );
     412        test_base(ip, 0, libAlign);
     413        free(ip);
    412414
    413415        ip = alloc( dim, align`align, [FillA, dim/4]`fill );
    414         test_base( ip, size, align );
    415         test_fill( ip, 0, size/4, FillA );
    416         test_use( ip, size / elemSize );
    417         free( ip );
     416        test_base(ip, size, align);
     417        test_fill(ip, 0, size/4, FillA);
     418        test_use(ip, size / elemSize);
     419        free(ip);
    418420
    419421        ip = alloc( 0, align`align, [FillA, dim/4]`fill );
    420         test_base( ip, 0, libAlign );
    421         free( ip );
    422 
    423         op = alloc( dim, 0xdeadbeefN`fill );
     422        test_base(ip, 0, libAlign);
     423        free(ip);
     424
     425        op = alloc( dim, ((int)0xdeadbeef)`fill );
    424426        ip = alloc( dim, op`realloc, align`align, FillC`fill );
    425         test_base( ip, size, align );
    426         test_fill( ip, 0, dim, 0xdeadbeefN );
    427         test_use( ip, size / elemSize );
    428         free( ip );
    429 
    430         op = alloc( dim, 0xdeadbeefN`fill );
     427        test_base(ip, size, align);
     428        test_fill(ip, 0, dim, (int)0xdeadbeef);
     429        test_use(ip, size / elemSize);
     430        free(ip);
     431
     432        op = alloc( dim, ((int)0xdeadbeef)`fill );
    431433        ip = alloc( dim / 4, op`realloc, align`align, FillC`fill );
    432         test_base( ip, size / 4, align );
    433         test_fill( ip, 0, dim / 4, 0xdeadbeefN );
    434         test_use( ip, size / 4 / elemSize );
    435         free( ip );
    436 
    437         op = alloc( dim, 0xdeadbeefN`fill );
     434        test_base(ip, size / 4, align);
     435        test_fill(ip, 0, dim / 4, (int)0xdeadbeef);
     436        test_use(ip, size / 4 / elemSize);
     437        free(ip);
     438
     439        op = alloc( dim, ((int)0xdeadbeef)`fill );
    438440        ip = alloc( dim * 4, op`realloc, align`align, FillC`fill );
    439         test_base( ip, size * 4, align );
    440         test_fill( ip, 0, dim, 0xdeadbeefN );
    441         test_fill( ip, size, size * 4, FillC );
    442         test_use( ip, size * 4 / elemSize );
    443         free( ip );
    444 
    445         op = alloc( dim, 0xdeadbeefN`fill );
     441        test_base(ip, size * 4, align);
     442        test_fill(ip, 0, dim, (int)0xdeadbeef);
     443        test_fill(ip, size, size * 4, FillC);
     444        test_use(ip, size * 4 / elemSize);
     445        free(ip);
     446
     447        op = alloc( dim, ((int)0xdeadbeef)`fill );
    446448        ip = alloc( 0, op`realloc, align`align, FillC`fill );
    447         test_base( ip, 0, libAlign );
    448         free( ip );
    449 
    450         ip = alloc( dim, 0p`realloc, align`align, FillC`fill );
    451         test_base( ip, size, align );
    452         test_fill( ip, 0, size, FillC );
    453         test_use( ip, size / elemSize );
    454         free( ip );
    455 
    456         ip = alloc( 0, 0p`realloc, align`align, FillC`fill );
    457         test_base( ip, 0, libAlign );
    458         free( ip );
    459 
    460         op = alloc( dim, 0xdeadbeefN`fill );
     449        test_base(ip, 0, libAlign);
     450        free(ip);
     451
     452        ip = alloc( dim, ((int*)0p)`realloc, align`align, FillC`fill );
     453        test_base(ip, size, align);
     454        test_fill(ip, 0, size, FillC);
     455        test_use(ip, size / elemSize);
     456        free(ip);
     457
     458        ip = alloc( 0, ((int*)0p)`realloc, align`align, FillC`fill );
     459        test_base(ip, 0, libAlign);
     460        free(ip);
     461
     462        op = alloc( dim, ((int)0xdeadbeef)`fill );
    461463        ip = alloc( dim, op`realloc, align`align, FillT`fill );
    462         test_base( ip, size, align );
    463         test_fill( ip, 0, dim, 0xdeadbeefN );
    464         test_use( ip, size / elemSize );
    465         free( ip );
    466 
    467         op = alloc( dim, 0xdeadbeefN`fill );
     464        test_base(ip, size, align);
     465        test_fill(ip, 0, dim, (int)0xdeadbeef);
     466        test_use(ip, size / elemSize);
     467        free(ip);
     468
     469        op = alloc( dim, ((int)0xdeadbeef)`fill );
    468470        ip = alloc( dim / 4, op`realloc, align`align, FillT`fill );
    469         test_base( ip, size / 4, align );
    470         test_fill( ip, 0, dim / 4, 0xdeadbeefN );
    471         test_use( ip, size / 4 / elemSize );
    472         free( ip );
    473 
    474         op = alloc( dim, 0xdeadbeefN`fill );
     471        test_base(ip, size / 4, align);
     472        test_fill(ip, 0, dim / 4, (int)0xdeadbeef);
     473        test_use(ip, size / 4 / elemSize);
     474        free(ip);
     475
     476        op = alloc( dim, ((int)0xdeadbeef)`fill );
    475477        ip = alloc( dim * 4, op`realloc, align`align, FillT`fill );
    476         test_base( ip, size * 4, align );
    477         test_fill( ip, 0, dim, 0xdeadbeefN );
    478         test_fill( ip, dim, dim * 4, FillT );
    479         test_use( ip, size * 4 / elemSize );
    480         free( ip );
    481 
    482         op = alloc( dim, 0xdeadbeefN`fill );
     478        test_base(ip, size * 4, align);
     479        test_fill(ip, 0, dim, (int)0xdeadbeef);
     480        test_fill(ip, dim, dim * 4, FillT);
     481        test_use(ip, size * 4 / elemSize);
     482        free(ip);
     483
     484        op = alloc( dim, ((int)0xdeadbeef)`fill );
    483485        ip = alloc( 0, op`realloc, align`align, FillT`fill );
    484         test_base( ip, 0, libAlign );
    485         free( ip );
    486 
    487         ip = alloc( dim, 0p`realloc, align`align, FillT`fill );
    488         test_base( ip, size, align );
    489         test_fill( ip, 0, dim, FillT );
    490         test_use( ip, size / elemSize );
    491         free( ip );
    492 
    493         ip = alloc( 0, 0p`realloc, align`align, FillT`fill );
    494         test_base( ip, 0, libAlign );
    495         free( ip );
    496 
    497         if ( tests_failed == 0 ) printf( "PASSED alloc tests\n\n" );
    498         else printf( "failed alloc tests : %d/%d\n\n", tests_failed, tests_total );
    499 
    500         // testing alloc ( aligned struct )
     486        test_base(ip, 0, libAlign);
     487        free(ip);
     488
     489        ip = alloc( dim, ((int*)0p)`realloc, align`align, FillT`fill );
     490        test_base(ip, size, align);
     491        test_fill(ip, 0, dim, FillT);
     492        test_use(ip, size / elemSize);
     493        free(ip);
     494
     495        ip = alloc( 0, ((int*)0p)`realloc, align`align, FillT`fill );
     496        test_base(ip, 0, libAlign);
     497        free(ip);
     498
     499        if (tests_failed == 0) printf("PASSED alloc tests\n\n");
     500        else printf("failed alloc tests : %d/%d\n\n", tests_failed, tests_total);
     501
     502        // testing alloc (aligned struct)
    501503
    502504        elemSize = sizeof(T1);
     
    507509
    508510        t1p = alloc();
    509         test_base( t1p, elemSize, tAlign );
    510         test_use( t1p, elemSize / elemSize );
    511         free( t1p );
     511        test_base(t1p, elemSize, tAlign);
     512        test_use(t1p, elemSize / elemSize);
     513        free(t1p);
    512514
    513515        t1p = alloc( dim );
    514         test_base( t1p, size, tAlign );
    515         test_use( t1p, size / elemSize );
    516         free( t1p );
     516        test_base(t1p, size, tAlign);
     517        test_use(t1p, size / elemSize);
     518        free(t1p);
    517519
    518520        t1p = alloc( 0 );
    519         test_base( t1p, 0, libAlign );
    520         free( t1p );
     521        test_base(t1p, 0, libAlign);
     522        free(t1p);
    521523
    522524        dp = alloc( dim );
    523525        t1p = alloc( dp`resize );
    524         test_base( t1p, elemSize, tAlign );
    525         test_use( t1p, elemSize / elemSize );
    526         free( t1p );
    527 
    528         t1p = alloc( 0p`resize );
    529         test_base( t1p, elemSize, tAlign );
    530         test_use( t1p, elemSize / elemSize );
    531         free( t1p );
     526        test_base(t1p, elemSize, tAlign);
     527        test_use(t1p, elemSize / elemSize);
     528        free(t1p);
     529
     530        t1p = alloc( ((double*)0p)`resize );
     531        test_base(t1p, elemSize, tAlign);
     532        test_use(t1p, elemSize / elemSize);
     533        free(t1p);
    532534
    533535        dp = alloc( dim );
    534536        t1p = alloc( dim, dp`resize );
    535         test_base( t1p, size, tAlign );
    536         test_use( t1p, size / elemSize );
    537         free( t1p );
     537        test_base(t1p, size, tAlign);
     538        test_use(t1p, size / elemSize);
     539        free(t1p);
    538540
    539541        dp = alloc( dim );
    540542        t1p = alloc( 0, dp`resize );
    541         test_base( t1p, 0, libAlign );
    542         free( t1p );
    543 
    544         t1p = alloc( dim, 0p`resize );
    545         test_base( t1p, size, tAlign );
    546         test_use( t1p, size / elemSize );
    547         free( t1p );
    548 
    549         t1p = alloc( 0, 0p`resize );
    550         test_base( t1p, 0, libAlign );
    551         free( t1p );
     543        test_base(t1p, 0, libAlign);
     544        free(t1p);
     545
     546        t1p = alloc( dim, ((double*)0p)`resize );
     547        test_base(t1p, size, tAlign);
     548        test_use(t1p, size / elemSize);
     549        free(t1p);
     550
     551        t1p = alloc( 0, ((double*)0p)`resize );
     552        test_base(t1p, 0, libAlign);
     553        free(t1p);
    552554
    553555        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    554556        t1p = alloc( dim, t1op`realloc );
    555         test_base( t1p, size, tAlign );
    556         test_fill( t1p, 0, dim, (T1){0xdeadbeef});
    557         test_use( t1p, size / elemSize );
    558         free( t1p );
     557        test_base(t1p, size, tAlign);
     558        test_fill(t1p, 0, dim, (T1){0xdeadbeef});
     559        test_use(t1p, size / elemSize);
     560        free(t1p);
    559561
    560562        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    561563        t1p = alloc( 0, t1op`realloc );
    562         test_base( t1p, 0, libAlign );
    563         free( t1p );
    564 
    565         t1p = alloc( dim, 0p`realloc );
    566         test_base( t1p, size, tAlign );
    567         test_use( t1p, size / elemSize );
    568         free( t1p );
    569 
    570         t1p = alloc( 0, 0p`realloc );
    571         test_base( t1p, 0, libAlign );
    572         free( t1p );
     564        test_base(t1p, 0, libAlign);
     565        free(t1p);
     566
     567        t1p = alloc( dim, ((T1*)0p)`realloc );
     568        test_base(t1p, size, tAlign);
     569        test_use(t1p, size / elemSize);
     570        free(t1p);
     571
     572        t1p = alloc( 0, ((T1*)0p)`realloc );
     573        test_base(t1p, 0, libAlign);
     574        free(t1p);
    573575
    574576        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    575577        t1p = alloc( dim, t1op`resize );
    576         test_base( t1p, size, tAlign );
    577         test_use( t1p, size / elemSize );
    578         free( t1p );
     578        test_base(t1p, size, tAlign);
     579        test_use(t1p, size / elemSize);
     580        free(t1p);
    579581
    580582        t1p = alloc( FillC`fill );
    581         test_base( t1p, elemSize, tAlign );
    582         test_fill( t1p, 0, elemSize, FillC );
    583         test_use( t1p, elemSize / elemSize );
    584         free( t1p );
     583        test_base(t1p, elemSize, tAlign);
     584        test_fill(t1p, 0, elemSize, FillC);
     585        test_use(t1p, elemSize / elemSize);
     586        free(t1p);
    585587
    586588        t1p = alloc( FillT1`fill );
    587         test_base( t1p, elemSize, tAlign );
    588         test_fill( t1p, 0, 1, FillT1);
    589         test_use( t1p, elemSize / elemSize );
    590         free( t1p );
     589        test_base(t1p, elemSize, tAlign);
     590        test_fill(t1p, 0, 1, FillT1);
     591        test_use(t1p, elemSize / elemSize);
     592        free(t1p);
    591593
    592594        t1p = alloc( dim, FillC`fill );
    593         test_base( t1p, size, tAlign );
    594         test_fill( t1p, 0, size, FillC );
    595         test_use( t1p, size / elemSize );
    596         free( t1p );
     595        test_base(t1p, size, tAlign);
     596        test_fill(t1p, 0, size, FillC);
     597        test_use(t1p, size / elemSize);
     598        free(t1p);
    597599
    598600        t1p = alloc( 0, FillC`fill );
    599         test_base( t1p, 0, libAlign );
    600         free( t1p );
     601        test_base(t1p, 0, libAlign);
     602        free(t1p);
    601603
    602604        t1p = alloc( dim, FillT1`fill );
    603         test_base( t1p, size, tAlign );
    604         test_fill( t1p, 0, dim, FillT1);
    605         test_use( t1p, size / elemSize );
    606         free( t1p );
     605        test_base(t1p, size, tAlign);
     606        test_fill(t1p, 0, dim, FillT1);
     607        test_use(t1p, size / elemSize);
     608        free(t1p);
    607609
    608610        t1p = alloc( 0, FillT1`fill );
    609         test_base( t1p, 0, libAlign );
    610         free( t1p );
     611        test_base(t1p, 0, libAlign);
     612        free(t1p);
    611613
    612614        t1p = alloc( dim, [FillT1A, dim / 4]`fill );
    613         test_base( t1p, size, tAlign );
    614         test_fill( t1p, 0, size/4, FillT1A );
    615         test_use( t1p, size / elemSize );
    616         free( t1p );
     615        test_base(t1p, size, tAlign);
     616        test_fill(t1p, 0, size/4, FillT1A);
     617        test_use(t1p, size / elemSize);
     618        free(t1p);
    617619
    618620        t1p = alloc( 0, [FillT1A, dim / 4]`fill );
    619         test_base( t1p, 0, libAlign );
    620         free( t1p );
     621        test_base(t1p, 0, libAlign);
     622        free(t1p);
    621623
    622624        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    623625        t1p = alloc( dim, t1op`realloc, FillC`fill );
    624         test_base( t1p, size, tAlign );
    625         test_fill( t1p, 0, dim, (T1){0xdeadbeef});
    626         test_use( t1p, size / elemSize );
    627         free( t1p );
     626        test_base(t1p, size, tAlign);
     627        test_fill(t1p, 0, dim, (T1){0xdeadbeef});
     628        test_use(t1p, size / elemSize);
     629        free(t1p);
    628630
    629631        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    630632        t1p = alloc( dim / 4, t1op`realloc, FillC`fill );
    631         test_base( t1p, size / 4, tAlign );
    632         test_fill( t1p, 0, dim / 4, (T1){0xdeadbeef});
    633         test_use( t1p, size / 4 / elemSize );
    634         free( t1p );
     633        test_base(t1p, size / 4, tAlign);
     634        test_fill(t1p, 0, dim / 4, (T1){0xdeadbeef});
     635        test_use(t1p, size / 4 / elemSize);
     636        free(t1p);
    635637
    636638        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    637639        t1p = alloc( dim * 4, t1op`realloc, FillC`fill );
    638         test_base( t1p, size * 4, tAlign );
    639         test_fill( t1p, 0, dim, (T1){0xdeadbeef});
    640         test_fill( t1p, size, size * 4, FillC );
    641         test_use( t1p, size * 4 / elemSize );
    642         free( t1p );
     640        test_base(t1p, size * 4, tAlign);
     641        test_fill(t1p, 0, dim, (T1){0xdeadbeef});
     642        test_fill(t1p, size, size * 4, FillC);
     643        test_use(t1p, size * 4 / elemSize);
     644        free(t1p);
    643645
    644646        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    645647        t1p = alloc( 0, t1op`realloc, FillC`fill );
    646         test_base( t1p, 0, libAlign );
    647         free( t1p );
    648 
    649         t1p = alloc( dim, 0p`realloc, FillC`fill );
    650         test_base( t1p, size, tAlign );
    651         test_fill( t1p, 0, size, FillC );
    652         test_use( t1p, size / elemSize );
    653         free( t1p );
    654 
    655         t1p = alloc( 0, 0p`realloc, FillC`fill );
    656         test_base( t1p, 0, libAlign );
    657         free( t1p );
     648        test_base(t1p, 0, libAlign);
     649        free(t1p);
     650
     651        t1p = alloc( dim, ((T1*)0p)`realloc, FillC`fill );
     652        test_base(t1p, size, tAlign);
     653        test_fill(t1p, 0, size, FillC);
     654        test_use(t1p, size / elemSize);
     655        free(t1p);
     656
     657        t1p = alloc( 0, ((T1*)0p)`realloc, FillC`fill );
     658        test_base(t1p, 0, libAlign);
     659        free(t1p);
    658660
    659661        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    660662        t1p = alloc( dim, t1op`realloc, FillT1`fill );
    661         test_base( t1p, size, tAlign );
    662         test_fill( t1p, 0, dim, (T1){0xdeadbeef});
    663         test_use( t1p, size / elemSize );
    664         free( t1p );
     663        test_base(t1p, size, tAlign);
     664        test_fill(t1p, 0, dim, (T1){0xdeadbeef});
     665        test_use(t1p, size / elemSize);
     666        free(t1p);
    665667
    666668        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    667669        t1p = alloc( dim / 4, t1op`realloc, FillT1`fill );
    668         test_base( t1p, size / 4, tAlign );
    669         test_fill( t1p, 0, dim / 4, (T1){0xdeadbeef});
    670         test_use( t1p, size / 4 / elemSize );
    671         free( t1p );
     670        test_base(t1p, size / 4, tAlign);
     671        test_fill(t1p, 0, dim / 4, (T1){0xdeadbeef});
     672        test_use(t1p, size / 4 / elemSize);
     673        free(t1p);
    672674
    673675        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    674676        t1p = alloc( dim * 4, t1op`realloc, FillT1`fill );
    675         test_base( t1p, size * 4, tAlign );
    676         test_fill( t1p, 0, dim, (T1){0xdeadbeef});
    677         test_fill( t1p, dim, dim * 4, FillT1);
    678         test_use( t1p, size * 4 / elemSize );
    679         free( t1p );
     677        test_base(t1p, size * 4, tAlign);
     678        test_fill(t1p, 0, dim, (T1){0xdeadbeef});
     679        test_fill(t1p, dim, dim * 4, FillT1);
     680        test_use(t1p, size * 4 / elemSize);
     681        free(t1p);
    680682
    681683        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    682684        t1p = alloc( 0, t1op`realloc, FillT1`fill );
    683         test_base( t1p, 0, libAlign );
    684         free( t1p );
    685 
    686         t1p = alloc( dim, 0p`realloc, FillT1`fill );
    687         test_base( t1p, size, tAlign );
    688         test_fill( t1p, 0, dim, FillT1);
    689         test_use( t1p, size / elemSize );
    690         free( t1p );
    691 
    692         t1p = alloc( 0, 0p`realloc, FillT1`fill );
    693         test_base( t1p, 0, libAlign );
    694         free( t1p );
     685        test_base(t1p, 0, libAlign);
     686        free(t1p);
     687
     688        t1p = alloc( dim, ((T1*)0p)`realloc, FillT1`fill );
     689        test_base(t1p, size, tAlign);
     690        test_fill(t1p, 0, dim, FillT1);
     691        test_use(t1p, size / elemSize);
     692        free(t1p);
     693
     694        t1p = alloc( 0, ((T1*)0p)`realloc, FillT1`fill );
     695        test_base(t1p, 0, libAlign);
     696        free(t1p);
    695697
    696698        t1p = alloc( align`align );
    697         test_base( t1p, elemSize, align );
    698         test_use( t1p, elemSize / elemSize );
    699         free( t1p );
     699        test_base(t1p, elemSize, align);
     700        test_use(t1p, elemSize / elemSize);
     701        free(t1p);
    700702
    701703        t1p = alloc( dim, align`align );
    702         test_base( t1p, size, align );
    703         test_use( t1p, size / elemSize );
    704         free( t1p );
     704        test_base(t1p, size, align);
     705        test_use(t1p, size / elemSize);
     706        free(t1p);
    705707
    706708        t1p = alloc( 0, align`align );
    707         test_base( t1p, 0, libAlign );
    708         free( t1p );
     709        test_base(t1p, 0, libAlign);
     710        free(t1p);
    709711
    710712        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    711713        t1p = alloc( t1op`realloc, align`align );
    712         test_base( t1p, elemSize, align );
    713         test_fill( t1p, 0, 1, (T1){0xdeadbeef});
    714         test_use( t1p, elemSize / elemSize );
    715         free( t1p );
    716 
    717         t1p = alloc( 0p`realloc, align`align );
    718         test_base( t1p, elemSize, align );
    719         test_use( t1p, elemSize / elemSize );
    720         free( t1p );
     714        test_base(t1p, elemSize, align);
     715        test_fill(t1p, 0, 1, (T1){0xdeadbeef});
     716        test_use(t1p, elemSize / elemSize);
     717        free(t1p);
     718
     719        t1p = alloc( ((T1*)0p)`realloc, align`align );
     720        test_base(t1p, elemSize, align);
     721        test_use(t1p, elemSize / elemSize);
     722        free(t1p);
    721723
    722724        dp = alloc( dim );
    723725        t1p = alloc( dp`resize, align`align );
    724         test_base( t1p, elemSize, align );
    725         test_use( t1p, elemSize / elemSize );
    726         free( t1p );
    727 
    728         t1p = alloc( 0p`resize, align`align );
    729         test_base( t1p, elemSize, align );
    730         test_use( t1p, elemSize / elemSize );
    731         free( t1p );
     726        test_base(t1p, elemSize, align);
     727        test_use(t1p, elemSize / elemSize);
     728        free(t1p);
     729
     730        t1p = alloc( ((double*)0p)`resize, align`align );
     731        test_base(t1p, elemSize, align);
     732        test_use(t1p, elemSize / elemSize);
     733        free(t1p);
    732734
    733735        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    734736        t1p = alloc( dim, t1op`realloc, align`align );
    735         test_base( t1p, size, align );
    736         test_fill( t1p, 0, dim, (T1){0xdeadbeef});
    737         test_use( t1p, size / elemSize );
    738         free( t1p );
     737        test_base(t1p, size, align);
     738        test_fill(t1p, 0, dim, (T1){0xdeadbeef});
     739        test_use(t1p, size / elemSize);
     740        free(t1p);
    739741
    740742        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    741743        t1p = alloc( 0, t1op`realloc, align`align );
    742         test_base( t1p, 0, libAlign );
    743         free( t1p );
    744 
    745         t1p = alloc( dim, 0p`realloc, align`align );
    746         test_base( t1p, size, align );
    747         test_use( t1p, size / elemSize );
    748         free( t1p );
    749 
    750         t1p = alloc( 0, 0p`realloc, align`align );
    751         test_base( t1p, 0, libAlign );
    752         free( t1p );
     744        test_base(t1p, 0, libAlign);
     745        free(t1p);
     746
     747        t1p = alloc( dim, ((T1*)0p)`realloc, align`align );
     748        test_base(t1p, size, align);
     749        test_use(t1p, size / elemSize);
     750        free(t1p);
     751
     752        t1p = alloc( 0, ((T1*)0p)`realloc, align`align );
     753        test_base(t1p, 0, libAlign);
     754        free(t1p);
    753755
    754756        t1p = alloc( align`align, FillC`fill );
    755         test_base( t1p, elemSize, align );
    756         test_fill( t1p, 0, elemSize, FillC );
    757         test_use( t1p, elemSize / elemSize );
    758         free( t1p );
     757        test_base(t1p, elemSize, align);
     758        test_fill(t1p, 0, elemSize, FillC);
     759        test_use(t1p, elemSize / elemSize);
     760        free(t1p);
    759761
    760762        t1p = alloc( align`align, FillT1`fill );
    761         test_base( t1p, elemSize, align );
    762         test_fill( t1p, 0, 1, FillT1);
    763         test_use( t1p, elemSize / elemSize );
    764         free( t1p );
     763        test_base(t1p, elemSize, align);
     764        test_fill(t1p, 0, 1, FillT1);
     765        test_use(t1p, elemSize / elemSize);
     766        free(t1p);
    765767
    766768        t1p = alloc( dim, align`align, FillC`fill );
    767         test_base( t1p, size, align );
    768         test_fill( t1p, 0, size, FillC );
    769         test_use( t1p, size / elemSize );
    770         free( t1p );
     769        test_base(t1p, size, align);
     770        test_fill(t1p, 0, size, FillC);
     771        test_use(t1p, size / elemSize);
     772        free(t1p);
    771773
    772774        t1p = alloc( 0, align`align, FillC`fill );
    773         test_base( t1p, 0, libAlign );
    774         free( t1p );
     775        test_base(t1p, 0, libAlign);
     776        free(t1p);
    775777
    776778        t1p = alloc( dim, align`align, FillT1`fill );
    777         test_base( t1p, size, align );
    778         test_fill( t1p, 0, dim, FillT1);
    779         test_use( t1p, size / elemSize );
    780         free( t1p );
     779        test_base(t1p, size, align);
     780        test_fill(t1p, 0, dim, FillT1);
     781        test_use(t1p, size / elemSize);
     782        free(t1p);
    781783
    782784        t1p = alloc( 0, align`align, FillT1`fill );
    783         test_base( t1p, 0, libAlign );
    784         free( t1p );
     785        test_base(t1p, 0, libAlign);
     786        free(t1p);
    785787
    786788        t1p = alloc( dim, align`align, [FillT1A, dim / 4]`fill );
    787         test_base( t1p, size, align );
    788         test_fill( t1p, 0, size/4, FillT1A );
    789         test_use( t1p, size / elemSize );
    790         free( t1p );
     789        test_base(t1p, size, align);
     790        test_fill(t1p, 0, size/4, FillT1A);
     791        test_use(t1p, size / elemSize);
     792        free(t1p);
    791793
    792794        t1p = alloc( 0, align`align, [FillT1A, dim / 4]`fill );
    793         test_base( t1p, 0, libAlign );
    794         free( t1p );
     795        test_base(t1p, 0, libAlign);
     796        free(t1p);
    795797
    796798        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    797799        t1p = alloc( dim, t1op`realloc, align`align, FillC`fill );
    798         test_base( t1p, size, align );
    799         test_fill( t1p, 0, dim, (T1){0xdeadbeef});
    800         test_use( t1p, size / elemSize );
    801         free( t1p );
     800        test_base(t1p, size, align);
     801        test_fill(t1p, 0, dim, (T1){0xdeadbeef});
     802        test_use(t1p, size / elemSize);
     803        free(t1p);
    802804
    803805        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    804806        t1p = alloc( dim / 4, t1op`realloc, align`align, FillC`fill );
    805         test_base( t1p, size / 4, align );
    806         test_fill( t1p, 0, dim / 4, (T1){0xdeadbeef});
    807         test_use( t1p, size / 4 / elemSize );
    808         free( t1p );
     807        test_base(t1p, size / 4, align);
     808        test_fill(t1p, 0, dim / 4, (T1){0xdeadbeef});
     809        test_use(t1p, size / 4 / elemSize);
     810        free(t1p);
    809811
    810812        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    811813        t1p = alloc( dim * 4, t1op`realloc, align`align, FillC`fill );
    812         test_base( t1p, size * 4, align );
    813         test_fill( t1p, 0, dim, (T1){0xdeadbeef});
    814         test_fill( t1p, size, size * 4, FillC );
    815         test_use( t1p, size * 4 / elemSize );
    816         free( t1p );
     814        test_base(t1p, size * 4, align);
     815        test_fill(t1p, 0, dim, (T1){0xdeadbeef});
     816        test_fill(t1p, size, size * 4, FillC);
     817        test_use(t1p, size * 4 / elemSize);
     818        free(t1p);
    817819
    818820        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    819821        t1p = alloc( 0, t1op`realloc, align`align, FillC`fill );
    820         test_base( t1p, 0, libAlign );
    821         free( t1p );
    822 
    823         t1p = alloc( dim, 0p`realloc, align`align, FillC`fill );
    824         test_base( t1p, size, align );
    825         test_fill( t1p, 0, size, FillC );
    826         test_use( t1p, size / elemSize );
    827         free( t1p );
    828 
    829         t1p = alloc( 0, 0p`realloc, align`align, FillC`fill );
    830         test_base( t1p, 0, libAlign );
    831         free( t1p );
    832 
    833         t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
     822        test_base(t1p, 0, libAlign);
     823        free(t1p);
     824
     825        t1p = alloc( dim, ((T1*)0p)`realloc, align`align, FillC`fill );
     826        test_base(t1p, size, align);
     827        test_fill(t1p, 0, size, FillC);
     828        test_use(t1p, size / elemSize);
     829        free(t1p);
     830
     831        t1p = alloc( 0, ((T1*)0p)`realloc, align`align, FillC`fill );
     832        test_base(t1p, 0, libAlign);
     833        free(t1p);
     834
     835        t1op = alloc( dim, ((T1){0xdeadbeef})`fill);
    834836        t1p = alloc( dim, t1op`realloc, align`align, FillT1`fill );
    835         test_base( t1p, size, align );
    836         test_fill( t1p, 0, dim, (T1){0xdeadbeef});
    837         test_use( t1p, size / elemSize );
    838         free( t1p );
     837        test_base(t1p, size, align);
     838        test_fill(t1p, 0, dim, (T1){0xdeadbeef});
     839        test_use(t1p, size / elemSize);
     840        free(t1p);
    839841
    840842        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    841843        t1p = alloc( dim / 4, t1op`realloc, align`align, FillT1`fill );
    842         test_base( t1p, size / 4, align );
    843         test_fill( t1p, 0, dim / 4, (T1){0xdeadbeef});
    844         test_use( t1p, size / 4 / elemSize );
    845         free( t1p );
     844        test_base(t1p, size / 4, align);
     845        test_fill(t1p, 0, dim / 4, (T1){0xdeadbeef});
     846        test_use(t1p, size / 4 / elemSize);
     847        free(t1p);
    846848
    847849        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    848850        t1p = alloc( dim * 4, t1op`realloc, align`align, FillT1`fill );
    849         test_base( t1p, size * 4, align );
    850         test_fill( t1p, 0, dim, (T1){0xdeadbeef});
    851         test_fill( t1p, dim, dim * 4, FillT1);
    852         test_use( t1p, size * 4 / elemSize );
    853         free( t1p );
     851        test_base(t1p, size * 4, align);
     852        test_fill(t1p, 0, dim, (T1){0xdeadbeef});
     853        test_fill(t1p, dim, dim * 4, FillT1);
     854        test_use(t1p, size * 4 / elemSize);
     855        free(t1p);
    854856
    855857        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    856858        t1p = alloc( 0, t1op`realloc, align`align, FillT1`fill );
    857         test_base( t1p, 0, libAlign );
    858         free( t1p );
    859 
    860         t1p = alloc( dim, 0p`realloc, align`align, FillT1`fill );
    861         test_base( t1p, size, align );
    862         test_fill( t1p, 0, dim, FillT1);
    863         test_use( t1p, size / elemSize );
    864         free( t1p );
    865 
    866         t1p = alloc( 0, 0p`realloc, align`align, FillT1`fill );
    867         test_base( t1p, 0, libAlign );
    868         free( t1p );
    869 
    870         if ( tests_failed == 0) printf( "PASSED alloc tests (aligned struct)\n\n");
    871         else printf( "failed alloc tests ( aligned struct ) : %d/%d\n\n", tests_failed, tests_total );
    872 
    873         printf( "(if applicable) alignment error below indicates memory trashing caused by test_use.\n\n");
    874         free( FillA );
    875         free( FillT1A );
     859        test_base(t1p, 0, libAlign);
     860        free(t1p);
     861
     862        t1p = alloc( dim, ((T1*)0p)`realloc, align`align, FillT1`fill );
     863        test_base(t1p, size, align);
     864        test_fill(t1p, 0, dim, FillT1);
     865        test_use(t1p, size / elemSize);
     866        free(t1p);
     867
     868        t1p = alloc( 0, ((T1*)0p)`realloc, align`align, FillT1`fill );
     869        test_base(t1p, 0, libAlign);
     870        free(t1p);
     871
     872        if (tests_failed == 0) printf("PASSED alloc tests (aligned struct)\n\n");
     873        else printf("failed alloc tests (aligned struct) : %d/%d\n\n", tests_failed, tests_total);
     874
     875        printf("(if applicable) alignment error below indicates memory trashing caused by test_use.\n\n");
     876        free(FillA);
     877        free(FillT1A);
     878        return 0;
    876879} // main
Note: See TracChangeset for help on using the changeset viewer.