Changeset ffec1bf


Timestamp: Jul 25, 2022, 2:23:28 PM
Author: Fangren Yu <f37yu@…>
Branches: ADT, ast-experimental, master, pthread-emulation, qualifiedEnum
Children: 4c48be0, 5cf1228, def751f
Parents: 9e23b446 (diff), 1f950c3b (diff)
Note: this is a merge changeset; the changes displayed below correspond to the merge itself.
Use the (diff) links above to see all the changes relative to each parent.
Message: Merge branch 'master' of plg.uwaterloo.ca:software/cfa/cfa-cc

Files: 20 added, 1 deleted, 106 edited

  • Jenkins/FullBuild

    r9e23b446 rffec1bf
             <p>${result}</p>
     
    -        <p>- Performance ---------------------------------------------------------</p>
    -
    -        <img src="https://cforall.uwaterloo.ca/jenkins/job/Cforall/job/master/plot/Compilation/getPlot?index=0" >
    -        <img src="https://cforall.uwaterloo.ca/jenkins/job/Cforall/job/master/plot/Compilation/getPlot?index=1" >
    -
             <p>- Logs ----------------------------------------------------------------</p>
             """
  • Jenkinsfile

    r9e23b446 rffec1bf
     
                     if( Settings.Publish && !Settings.RunBenchmark ) { echo 'No results to publish!!!' }
    -
    -                def groupCompile = new PlotGroup('Compilation', 'duration (s) - lower is better', true)
    -                def groupConcurrency = new PlotGroup('Concurrency', 'duration (n) - lower is better', false)
    -
    -                //Then publish the results
    -                do_plot(Settings.RunBenchmark && Settings.Publish, 'compile'        , groupCompile    , false, 'Compilation')
    -                do_plot(Settings.RunBenchmark && Settings.Publish, 'compile.diff'   , groupCompile    , true , 'Compilation (relative)')
    -                do_plot(Settings.RunBenchmark && Settings.Publish, 'ctxswitch'      , groupConcurrency, false, 'Context Switching')
    -                do_plot(Settings.RunBenchmark && Settings.Publish, 'ctxswitch.diff' , groupConcurrency, true , 'Context Switching (relative)')
    -                do_plot(Settings.RunBenchmark && Settings.Publish, 'mutex'          , groupConcurrency, false, 'Mutual Exclusion')
    -                do_plot(Settings.RunBenchmark && Settings.Publish, 'mutex.diff'     , groupConcurrency, true , 'Mutual Exclusion (relative)')
    -                do_plot(Settings.RunBenchmark && Settings.Publish, 'scheduling'     , groupConcurrency, false, 'Internal and External Scheduling')
    -                do_plot(Settings.RunBenchmark && Settings.Publish, 'scheduling.diff', groupConcurrency, true , 'Internal and External Scheduling (relative)')
             }
     }
    …
                     this.GitNewRef = ''
                     this.GitOldRef = ''
    -        }
    -}
    -
    -class PlotGroup implements Serializable {
    -        public String name
    -        public String unit
    -        public boolean log
    -
    -        PlotGroup(String name, String unit, boolean log) {
    -                this.name = name
    -                this.unit = unit
    -                this.log = log
             }
     }
    …
             }
     }
    -
    -def do_plot(boolean new_data, String file, PlotGroup group, boolean relative, String title) {
    -
    -        if(new_data) {
    -                echo "Publishing new data"
    -        }
    -
    -        def series = new_data ? [[
    -                                file: "${file}.csv",
    -                                exclusionValues: '',
    -                                displayTableFlag: false,
    -                                inclusionFlag: 'OFF',
    -                                url: ''
    -                        ]] : [];
    -
    -        echo "file is ${BuildDir}/benchmark/${file}.csv, group ${group}, title ${title}"
    -        dir("${BuildDir}/benchmark/") {
    -                plot csvFileName: "cforall-${env.BRANCH_NAME}-${file}.csv",
    -                        csvSeries: series,
    -                        group: "${group.name}",
    -                        title: "${title}",
    -                        style: 'lineSimple',
    -                        exclZero: false,
    -                        keepRecords: false,
    -                        logarithmic: !relative && group.log,
    -                        numBuilds: '120',
    -                        useDescr: true,
    -                        yaxis: group.unit,
    -                        yaxisMaximum: '',
    -                        yaxisMinimum: ''
    -        }
    -}
  • Makefile.am

    r9e23b446 rffec1bf
             @find libcfa -name config.status -printf "\n%h\n\t" -exec {} --config \; | sed "s/ /\n\t/g; s/\t'/\t/g; s/'\n/\n/g; s/^'//g; s/'$$//g"
     
    -mostlyclean-local: @LIBCFA_TARGET_MAKEFILES@
    -	for dir in @LIBCFA_TARGET_DIRS@; do \
    -		$(MAKE) -C $${dir} mostlyclean; \
    -	done
    +@LIBCFA_TARGET_DIRS@::
    +	$(MAKE) -C $@ $(MAKECMDGOALS)
     
    -clean-local: @LIBCFA_TARGET_MAKEFILES@
    -	for dir in @LIBCFA_TARGET_DIRS@; do \
    -		$(MAKE) -C $${dir} clean; \
    -	done
    -
    -distclean-local: @LIBCFA_TARGET_MAKEFILES@
    -	for dir in @LIBCFA_TARGET_DIRS@; do \
    -		$(MAKE) -C $${dir} distclean; \
    -		rm $${dir}/config.data; \
    -	done
    +mostlyclean clean distclean maintainer-clean: @LIBCFA_TARGET_DIRS@
  • benchmark/readyQ/churn.cfa

    r9e23b446 rffec1bf
     
                            threads_left = nthreads;
    -                       BThrd * threads[nthreads];
    +                       BThrd ** threads = alloc(nthreads);
                            for(i; nthreads ) {
                                    BThrd & t = *(threads[i] = malloc());
    …
     
                            free(spots);
    +                       free(threads);
                    }
     
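
    Note: this hunk (and the matching ones in the other benchmarks below) replaces a C variable-length array on the stack with a heap allocation, so a large nthreads from the command line can no longer overflow the stack; the added free(threads) releases the block at scope exit. A minimal C++ sketch of the same pattern, with a stand-in Worker type (hypothetical, not the benchmark's BThrd):

        #include <cstdlib>

        struct Worker { /* stand-in for the benchmark's thread type */ };

        void run(unsigned nthreads) {
            // Before: `Worker * threads[nthreads];` -- a VLA whose size is bounded
            // only by the command line, so a large nthreads can exhaust the stack.
            // After: one heap block holding the nthreads pointers.
            Worker** threads = static_cast<Worker**>(std::malloc(nthreads * sizeof(Worker*)));
            for (unsigned i = 0; i < nthreads; i++)
                threads[i] = new Worker();      // each worker is still heap-allocated
            // ... start and join the workers ...
            for (unsigned i = 0; i < nthreads; i++)
                delete threads[i];
            std::free(threads);                 // mirrors the added free(threads)
        }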
  • benchmark/readyQ/cycle.cfa

    r9e23b446 rffec1bf
                    {
                            threads_left = tthreads;
    -                       BThrd * threads[tthreads];
    -                       Partner thddata[tthreads];
    +                       BThrd **  threads = alloc(tthreads);
    +                       Partner * thddata = alloc(tthreads);
                            for(i; tthreads) {
    +                               (thddata[i]){};
                                    unsigned pi = (i + nthreads) % tthreads;
                                    thddata[i].next = &thddata[pi].self;
    …
                                    delete(threads[i]);
                            }
    +                       free(threads);
    +                       free(thddata);
                    }
     
  • benchmark/readyQ/cycle.cpp

    r9e23b446 rffec1bf
                    {
                            threads_left = tthreads;
    -                       Fibre * threads[tthreads];
    -                       Partner thddata[tthreads];
    +                       Fibre ** threads = new Fibre *[tthreads]();
    +                       Partner* thddata = new Partner[tthreads]();
                            for(unsigned i = 0; i < tthreads; i++) {
                                    unsigned pi = (i + nthreads) % tthreads;
    …
                                    global_blocks  += thddata[i].blocks;
                            }
    +
    +                       delete[](threads);
    +                       delete[](thddata);
                    }
     
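
    Note: in the C++ benchmarks the replacement is `new Fibre *[tthreads]()`; the trailing () value-initializes the array so every slot starts as nullptr, and the added delete[] calls pair with new[]. A small self-contained illustration of that detail (assumed types, not the benchmark code):

        #include <cassert>

        int main() {
            unsigned n = 8;
            int** slots = new int*[n]();        // () zero-initializes all n pointers
            for (unsigned i = 0; i < n; i++)
                assert(slots[i] == nullptr);    // without (), the values are indeterminate
            delete[] slots;                     // new[] must be released with delete[]
        }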
  • benchmark/readyQ/locality.cfa

    r9e23b446 rffec1bf
                    threads_left = nprocs;
                    {
    -                       MyThread * threads[nthreads];
    +                       MyThread ** threads = alloc(nthreads);
                            for(i; nthreads) {
                                    threads[i] = malloc();
    …
                                    free( threads[i] );
                            }
    +                       free( threads );
                    }
     
  • benchmark/readyQ/locality.cpp

    r9e23b446 rffec1bf
            {
                    FibreInit(1, nprocs);
    -               MyData * data_arrays[nthreads];
    +               MyData ** data_arrays = new MyData *[nthreads]();
                    for(size_t i = 0; i < nthreads; i++) {
                            data_arrays[i] = new MyData( i, wsize );
    …
     
                    threads_left = nthreads - nspots;
    -               Fibre * threads[nthreads];
    -               MyCtx * thddata[nthreads];
    +               Fibre ** threads = new Fibre *[nthreads]();
    +               MyCtx ** thddata = new MyCtx *[nthreads]();
                    {
                            for(size_t i = 0; i < nthreads; i++) {
    …
                                            i
                                    );
    -                               threads[i] = new Fibre( reinterpret_cast<void (*)(void *)>(thread_main), thddata[i] );
    +                               threads[i] = new Fibre();
    +                               threads[i]->run( reinterpret_cast<void (*)(MyCtx*)>(thread_main), thddata[i] );
                            }
     
    …
                            delete( data_arrays[i] );
                    }
    +               delete[](data_arrays);
     
                    for(size_t i = 0; i < nspots; i++) {
                            delete( spots[i] );
                    }
    +
    +               delete[](threads);
    +               delete[](thddata);
            }
     
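
    Note: besides the heap arrays, this hunk splits fibre start-up into two phases -- default-construct the Fibre, then hand it an entry point via run() -- and tightens the entry-point cast to the actual argument type. A hedged sketch of that construct-then-run shape, using a hypothetical Task wrapper over std::thread (the real Fibre API is only what the diff itself shows):

        #include <thread>

        struct Ctx { int id; };

        // Hypothetical stand-in: construction creates the object, run() binds
        // the entry point and argument and actually starts execution.
        class Task {
            std::thread t;
          public:
            Task() = default;                        // phase 1: create, not yet running
            void run(void (*fn)(Ctx*), Ctx* arg) {   // phase 2: start
                t = std::thread(fn, arg);
            }
            ~Task() { if (t.joinable()) t.join(); }
        };

        static void body(Ctx*) { /* work */ }

        int main() {
            Ctx c{0};
            Task task;           // threads[i] = new Fibre();
            task.run(body, &c);  // threads[i]->run(..., thddata[i]);
        }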
  • benchmark/readyQ/yield.cfa

    r9e23b446 rffec1bf
                    {
                            threads_left = nthreads;
    -                       Yielder threads[nthreads];
    +                       Yielder * threads = alloc(nthreads);
    +                       for(i; nthreads) {
    +                               (threads[i]){};
    +                       }
    +
                            printf("Starting\n");
     
    …
                                    Yielder & y = join( threads[i] );
                                    global_counter += y.count;
    +                               ^(threads[i]){};
                            }
    +                       free(threads);
                    }
     
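
    Note: Cforall's alloc() returns raw storage, so the patch adds explicit constructor calls (threads[i]){} after allocation and destructor calls ^(threads[i]){} before free(). The closest C++ analogue is placement new plus an explicit destructor call; a sketch only, with Yielder as a stand-in type:

        #include <cstdlib>
        #include <new>

        struct Yielder { unsigned long long count = 0; };

        int main() {
            unsigned n = 4;
            void* raw = std::malloc(n * sizeof(Yielder));
            Yielder* threads = static_cast<Yielder*>(raw);
            for (unsigned i = 0; i < n; i++)
                new (&threads[i]) Yielder();    // Cforall: (threads[i]){}
            // ... run and join the yielders ...
            for (unsigned i = 0; i < n; i++)
                threads[i].~Yielder();          // Cforall: ^(threads[i]){}
            std::free(raw);                     // Cforall: free(threads)
        }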
  • benchmark/readyQ/yield.cpp

    r9e23b446 rffec1bf
                    {
                            threads_left = nthreads;
    -                       Fibre * threads[nthreads];
    +                       Fibre ** threads = new Fibre *[nthreads]();
                            for(unsigned i = 0; i < nthreads; i++) {
                                    threads[i] = new Fibre();
    …
                                    fibre_join( threads[i], nullptr );
                            }
    +                       delete[] threads;
                    }
     
  • doc/bibliography/pl.bib

    r9e23b446 rffec1bf
     @manual{C++20Coroutine19,
         keywords    = {coroutine},
    +    key         = {Coroutines},
         contributer = {pabuhr@plg},
         title       = {Coroutines (C++20)},
         organization= {cppreference.com},
    -    month       = apr,
    -    year        = 2019,
    +    month       = jun,
    +    year        = 2022,
         note        = {\href{https://en.cppreference.com/w/cpp/language/coroutines}{https://\-en.cppreference.com/\-w/\-cpp/\-language/\-coroutines}},
     }
    …
     % S
     
    +@inproceedings{Imam14,
    +    keywords    = {actor model, performance comparison, java actor libraries, benchmark suite},
    +    contributer = {pabuhr@plg},
    +    author      = {Shams M. Imam and Vivek Sarkar},
    +    title       = {Savina - An Actor Benchmark Suite: Enabling Empirical Evaluation of Actor Libraries},
    +    year        = {2014},
    +    publisher   = {ACM},
    +    address     = {New York, NY, USA},
    +    booktitle   = {Proceedings of the 4th International Workshop on Programming Based on Actors Agents \& Decentralized Control},
    +    pages       = {67-80},
    +    numpages    = {14},
    +    location    = {Portland, Oregon, USA},
    +    series      = {AGERE! '14}
    +}
    +
     @manual{Scala,
         keywords    = {Scala programming language},
  • doc/theses/mike_brooks_MMath/array.tex

    r9e23b446 rffec1bf
     \CFA's array is also the first extension of C to use its tracked bounds to generate the pointer arithmetic implied by advanced allocation patterns.  Other bound-tracked extensions of C either forbid certain C patterns entirely, or address the problem of \emph{verifying} that the user's provided pointer arithmetic is self-consistent.  The \CFA array, applied to accordion structures [TOD: cross-reference] \emph{implies} the necessary pointer arithmetic, generated automatically, and not appearing at all in a user's program.
     
    -\subsction{Safety in a padded room}
    +\subsection{Safety in a padded room}
     
     Java's array [todo:cite] is a straightforward example of assuring safety against undefined behaviour, at a cost of expressiveness for more applied properties.  Consider the array parameter declarations in:
  • doc/theses/thierry_delisle_PhD/thesis/.gitignore

    r9e23b446 rffec1bf
     back_text/
    +SAVE.fig
  • doc/theses/thierry_delisle_PhD/thesis/Makefile

    r9e23b446 rffec1bf
             base \
             base_avg \
    +        base_ts2 \
             cache-share \
             cache-noshare \
    …
             emptytls \
             emptytree \
    +        executionStates \
             fairness \
             idle \
    …
             io_uring \
             pivot_ring \
    +        MQMS \
    +        MQMSG \
             system \
             cycle \
    …
             result.memcd.rate.qps \
             result.memcd.rate.99th \
    +        SQMS \
     }
     
  • doc/theses/thierry_delisle_PhD/thesis/fig/base.fig

    r9e23b446 rffec1bf
    [xfig source diff omitted: ready-queue diagram redrawn; the processor hexagons become rectangles, the dashed separators are shortened, and the Threads / Ready / TS / Array of Queues / Processors labels are repositioned]
  • doc/theses/thierry_delisle_PhD/thesis/fig/base_avg.fig

    r9e23b446 rffec1bf
    [xfig source diff omitted: same redraw as base.fig, with the moving-average (MA) and TS labels repositioned]
  • doc/theses/thierry_delisle_PhD/thesis/fig/cache-noshare.fig

    r9e23b446 rffec1bf
    [xfig source diff omitted: CPU0–CPU3 relabelled CORE$_0$–CORE$_3$, and the per-core L1/L2 boxes with split L3 caches repositioned]
  • doc/theses/thierry_delisle_PhD/thesis/fig/cache-share.fig

    r9e23b446 rffec1bf
    [xfig source diff omitted: CPU0–CPU3 relabelled CORE$_0$–CORE$_3$, and the per-core L1/L2 boxes redrawn above a single shared L3]
  • doc/theses/thierry_delisle_PhD/thesis/fig/cycle.fig

    r9e23b446 rffec1bf
    [xfig source diff omitted: Thread 1–Thread 5 relabelled Thread$_1$–Thread$_5$, and the circular Unpark arrows redrawn]
  • doc/theses/thierry_delisle_PhD/thesis/fig/idle.fig

    r9e23b446 rffec1bf
    [xfig source diff omitted: idle-sleep diagram redrawn; the Idle List/Lock box, the per-processor Benaphore and Event FD elements, and the Atomic Pointer are repositioned]
  • doc/theses/thierry_delisle_PhD/thesis/fig/idle1.fig

    r9e23b446 rffec1bf
    [xfig source diff omitted: idle-sleep variant redrawn; the Idle List/Lock box and the per-processor Event FD elements are repositioned]
     804 0 0 50 -1 0 11 0.0000 2 120 690 4049 3074 Event FD\001
     814 0 0 50 -1 0 11 0.0000 2 120 690 5699 3074 Event FD\001
     824 0 0 50 -1 0 11 0.0000 2 120 690 7349 3074 Event FD\001
  • doc/theses/thierry_delisle_PhD/thesis/fig/idle2.fig

    r9e23b446 rffec1bf  
        [xfig coordinate diff elided: figure redrawn at new coordinates and sizes; figure labels: "Idle List", "Lock", "Atomic", "Pointer", "Idle Processor" (x3), "Event FD" (x3)]
  • doc/theses/thierry_delisle_PhD/thesis/fig/idle_state.fig

    r9e23b446 rffec1bf  
        [xfig coordinate diff elided: state-machine figure rescaled and repositioned; states: "AWAKE", "SLEEP", "SEARCH"; transition labels: "WAKE", "CANCEL", "CONFIRM"]
  • doc/theses/thierry_delisle_PhD/thesis/fig/io_uring.fig

    r9e23b446 rffec1bf  
        [xfig coordinate diff elided: ring-buffer figure rescaled and repositioned; labels: "Submission Ring" (slots S0-S3, "Head", "Tail", "Push", "Pop"), "Completion Ring" (slots C0-C2, "Head", "Tail", "Push", "Pop"), "Kernel Line", "Kernel", "Application"]
  • doc/theses/thierry_delisle_PhD/thesis/fig/system.fig

    r9e23b446 rffec1bf  
        [xfig coordinate diff elided: runtime-overview figure updated; legend entries: "thread", "processor", "cluster", "generator/coroutine"; labels: "Discrete-event Manager", "preemption", "timeout", "Processors", "Ready Threads", "Blocked Threads", "User Cluster", "Other Cluster(s)"]
  • doc/theses/thierry_delisle_PhD/thesis/local.bib

    r9e23b446 rffec1bf  
    22% Cforall
    33@misc{cfa:frontpage,
    4   url = {https://cforall.uwaterloo.ca/}
     4  howpublished = {\href{https://cforall.uwaterloo.ca}{https://\-cforall.uwaterloo.ca}}
    55}
    66@article{cfa:typesystem,
     
    481481@misc{MAN:linux/cfs,
    482482  title = {{CFS} Scheduler - The Linux Kernel documentation},
    483   url = {https://www.kernel.org/doc/html/latest/scheduler/sched-design-CFS.html}
     483  howpublished = {\href{https://www.kernel.org/doc/html/latest/scheduler/sched-design-CFS.html}{https://\-www.kernel.org/\-doc/\-html/\-latest/\-scheduler/\-sched-design-CFS.html}}
    484484}
    485485
     
    489489  year = {2019},
    490490  month = {February},
    491   url = {https://opensource.com/article/19/2/fair-scheduling-linux}
     491  howpublished = {\href{https://opensource.com/article/19/2/fair-scheduling-linux}{https://\-opensource.com/\-article/\-19/2\-/\-fair-scheduling-linux}}
    492492}
    493493
     
    499499}
    500500
    501 @article{MAN:linux/cfs/balancing,
     501@misc{MAN:linux/cfs/balancing,
    502502  title={Reworking {CFS} load balancing},
    503   journal={LWN article, available at: https://lwn.net/Articles/793427/},
    504   year={2013}
      503  note = {LWN article},
     504  year={2019},
     505  howpublished = {\href{https://lwn.net/Articles/793427}{https://\-lwn.net/\-Articles/\-793427}},
    505506}
    506507
     
    523524  title = {Mach Scheduling and Thread Interfaces - Kernel Programming Guide},
    524525  organization = {Apple Inc.},
    525   url = {https://developer.apple.com/library/archive/documentation/Darwin/Conceptual/KernelProgramming/scheduler/scheduler.html}
      526  howpublished = {\href{https://developer.apple.com/library/archive/documentation/Darwin/Conceptual/KernelProgramming/scheduler/scheduler.html}{https://developer.apple.com/library/archive/documentation/Darwin/Conceptual/KernelProgramming/scheduler/scheduler.html}}
    526527}
    527528
     
    536537  month = {June},
    537538  series = {Developer Reference},
    538   url = {https://www.microsoftpressstore.com/articles/article.aspx?p=2233328&seqNum=7#:~:text=Overview\%20of\%20Windows\%20Scheduling,a\%20phenomenon\%20called\%20processor\%20affinity}
    539 }
    540 
    541 @online{GITHUB:go,
      539  howpublished = {\href{https://www.microsoftpressstore.com/articles/article.aspx?p=2233328&seqNum=7#:~:text=Overview\%20of\%20Windows\%20Scheduling,a\%20phenomenon\%20called\%20processor\%20affinity}{https://\-www.microsoftpressstore.com/\-articles/\-article.aspx?p=2233328\&seqNum=7\#:~:text=Overview\%20of\%20Windows\%20Scheduling,a\%20phenomenon\%20called\%20processor\%20affinity}}
     540}
     541
     542@misc{GITHUB:go,
    542543  title = {GitHub - The Go Programming Language},
    543544  author = {The Go Programming Language},
    544   url = {https://github.com/golang/go},
     545  howpublished = {\href{https://github.com/golang/go}{https://\-github.com/\-golang/\-go}},
    545546  version = {Change-Id: If07f40b1d73b8f276ee28ffb8b7214175e56c24d}
    546547}
     
    551552  year = {2019},
    552553  booktitle = {Hydra},
    553   url = {https://www.youtube.com/watch?v=-K11rY57K7k&ab_channel=Hydra}
      554  howpublished = {\href{https://www.youtube.com/watch?v=-K11rY57K7k&ab_channel=Hydra}{https://\-www.youtube.com/\-watch?v=-K11rY57K7k\&ab\_channel=Hydra}}
    554555}
    555556
     
    559560  year = {2008},
    560561  booktitle = {Erlang User Conference},
    561   url = {http://www.erlang.se/euc/08/euc_smp.pdf}
    562 }
    563 
    564 
     562  howpublished = {\href{http://www.erlang.se/euc/08/euc_smp.pdf}{http://\-www.erlang.se/\-euc/\-08/\-euc_smp.pdf}}
     563}
    565564
    566565@manual{MAN:tbb/scheduler,
    567566  title = {Scheduling Algorithm - Intel{\textregistered} Threading Building Blocks Developer Reference},
    568567  organization = {Intel{\textregistered}},
    569   url = {https://www.threadingbuildingblocks.org/docs/help/reference/task_scheduler/scheduling_algorithm.html}
     568  howpublished = {\href{https://www.threadingbuildingblocks.org/docs/help/reference/task_scheduler/scheduling_algorithm.html}{https://\-www.threadingbuildingblocks.org/\-docs/\-help/\-reference/\-task\_scheduler/\-scheduling\_algorithm.html}}
    570569}
    571570
     
    573572  title = {Quasar Core - Quasar User Manual},
    574573  organization = {Parallel Universe},
    575   url = {https://docs.paralleluniverse.co/quasar/}
     574  howpublished = {\href{https://docs.paralleluniverse.co/quasar}{https://\-docs.paralleluniverse.co/\-quasar}}
    576575}
    577576@misc{MAN:project-loom,
    578   url = {https://www.baeldung.com/openjdk-project-loom}
     577  howpublished = {\href{https://www.baeldung.com/openjdk-project-loom}{https://\-www.baeldung.com/\-openjdk-project-loom}}
    579578}
    580579
    581580@misc{MAN:java/fork-join,
    582   url = {https://www.baeldung.com/java-fork-join}
     581  howpublished = {\href{https://www.baeldung.com/java-fork-join}{https://\-www.baeldung.com/\-java-fork-join}}
    583582}
    584583
     
    633632  month   = "March",
    634633  version = {0,4},
    635   howpublished = {\url{https://kernel.dk/io_uring.pdf}}
     634  howpublished = {\href{https://kernel.dk/io_uring.pdf}{https://\-kernel.dk/\-io\_uring.pdf}}
    636635}
    637636
     
    642641  title = "Control theory --- {W}ikipedia{,} The Free Encyclopedia",
    643642  year = "2020",
    644   url = "https://en.wikipedia.org/wiki/Task_parallelism",
     643  howpublished = {\href{https://en.wikipedia.org/wiki/Task_parallelism}{https://\-en.wikipedia.org/\-wiki/\-Task\_parallelism}},
    645644  note = "[Online; accessed 22-October-2020]"
    646645}
     
    650649  title = "Task parallelism --- {W}ikipedia{,} The Free Encyclopedia",
    651650  year = "2020",
    652   url = "https://en.wikipedia.org/wiki/Control_theory",
      651  howpublished = "\href{https://en.wikipedia.org/wiki/Task_parallelism}{https://\-en.wikipedia.org/\-wiki/\-Task\_parallelism}",
    653652  note = "[Online; accessed 22-October-2020]"
    654653}
     
    658657  title = "Implicit parallelism --- {W}ikipedia{,} The Free Encyclopedia",
    659658  year = "2020",
    660   url = "https://en.wikipedia.org/wiki/Implicit_parallelism",
     659  howpublished = "\href{https://en.wikipedia.org/wiki/Implicit_parallelism}{https://\-en.wikipedia.org/\-wiki/\-Implicit\_parallelism}",
    661660  note = "[Online; accessed 23-October-2020]"
    662661}
     
    666665  title = "Explicit parallelism --- {W}ikipedia{,} The Free Encyclopedia",
    667666  year = "2017",
    668   url = "https://en.wikipedia.org/wiki/Explicit_parallelism",
     667  howpublished = "\href{https://en.wikipedia.org/wiki/Explicit_parallelism}{https://\-en.wikipedia.org/\-wiki/\-Explicit\_parallelism}",
    669668  note = "[Online; accessed 23-October-2020]"
    670669}
     
    674673  title = "Linear congruential generator --- {W}ikipedia{,} The Free Encyclopedia",
    675674  year = "2020",
    676   url = "https://en.wikipedia.org/wiki/Linear_congruential_generator",
     675  howpublished = "\href{https://en.wikipedia.org/wiki/Linear_congruential_generator}{https://en.wikipedia.org/wiki/Linear\_congruential\_generator}",
    677676  note = "[Online; accessed 2-January-2021]"
    678677}
     
    682681  title = "Futures and promises --- {W}ikipedia{,} The Free Encyclopedia",
    683682  year = "2020",
    684   url = "https://en.wikipedia.org/wiki/Futures_and_promises",
     683  howpublished = "\href{https://en.wikipedia.org/wiki/Futures_and_promises}{https://\-en.wikipedia.org/\-wiki/Futures\_and\_promises}",
    685684  note = "[Online; accessed 9-February-2021]"
    686685}
     
    690689  title = "Read-copy-update --- {W}ikipedia{,} The Free Encyclopedia",
    691690  year = "2022",
    692   url = "https://en.wikipedia.org/wiki/Linear_congruential_generator",
      691  howpublished = "\href{https://en.wikipedia.org/wiki/Read-copy-update}{https://\-en.wikipedia.org/\-wiki/\-Read-copy-update}",
    693692  note = "[Online; accessed 12-April-2022]"
    694693}
     
    698697  title = "Readers-writer lock --- {W}ikipedia{,} The Free Encyclopedia",
    699698  year = "2021",
    700   url = "https://en.wikipedia.org/wiki/Readers%E2%80%93writer_lock",
     699  howpublished = "\href{https://en.wikipedia.org/wiki/Readers-writer_lock}{https://\-en.wikipedia.org/\-wiki/\-Readers-writer\_lock}",
    701700  note = "[Online; accessed 12-April-2022]"
     701}
     702
     703@misc{wiki:binpak,
     704  author = "{Wikipedia contributors}",
     705  title = "Bin packing problem --- {W}ikipedia{,} The Free Encyclopedia",
     706  year = "2022",
     707  howpublished = "\href{https://en.wikipedia.org/wiki/Bin_packing_problem}{https://\-en.wikipedia.org/\-wiki/\-Bin\_packing\_problem}",
     708  note = "[Online; accessed 29-June-2022]"
    702709}
    703710
     
    705712% [05/04, 12:36] Trevor Brown
    706713%     i don't know where rmr complexity was first introduced, but there are many many many papers that use the term and define it
    707 % [05/04, 12:37] Trevor Brown
     714% [05/04, 12:37] Trevor Brown
    708715%     here's one paper that uses the term a lot and links to many others that use it... might trace it to something useful there https://drops.dagstuhl.de/opus/volltexte/2021/14832/pdf/LIPIcs-DISC-2021-30.pdf
    709 % [05/04, 12:37] Trevor Brown
     716% [05/04, 12:37] Trevor Brown
    710717%     another option might be to cite a textbook
    711 % [05/04, 12:42] Trevor Brown
     718% [05/04, 12:42] Trevor Brown
    712719%     but i checked two textbooks in the area i'm aware of and i don't see a definition of rmr complexity in either
    713 % [05/04, 12:42] Trevor Brown
     720% [05/04, 12:42] Trevor Brown
     714721%     this one has a nice statement about the prevalence of rmr complexity, as well as some rough definition
    715 % [05/04, 12:42] Trevor Brown
     722% [05/04, 12:42] Trevor Brown
    716723%     https://dl.acm.org/doi/pdf/10.1145/3465084.3467938
    717724
     
    721728%
    722729% https://doi.org/10.1137/1.9781611973099.100
     730
     731
     732@misc{AIORant,
     733  author = "Linus Torvalds",
     734  title = "Re: [PATCH 09/13] aio: add support for async openat()",
     735  year = "2016",
     736  month = jan,
     737  howpublished = "\href{https://lwn.net/Articles/671657}{https://\-lwn.net/\-Articles/671657}",
     738  note = "[Online; accessed 6-June-2022]"
     739}
     740
     741@misc{apache,
     742  key = {Apache Software Foundation},
     743  title = {{T}he {A}pache Web Server},
     744  howpublished = {\href{http://httpd.apache.org}{http://\-httpd.apache.org}},
     745  note = "[Online; accessed 6-June-2022]"
     746}
     747
     748@misc{SeriallyReusable,
     749    author      = {IBM},
     750    title       = {Serially reusable programs},
     751    month       = mar,
     752    howpublished= {\href{https://www.ibm.com/docs/en/ztpf/1.1.0.15?topic=structures-serially-reusable-programs}{https://www.ibm.com/\-docs/\-en/\-ztpf/\-1.1.0.15?\-topic=structures\--serially\--reusable-programs}},
     753    year        = 2021,
     754}
     755
     756@inproceedings{Albers12,
     757    author      = {Susanne Albers and Antonios Antoniadis},
     758    title       = {Race to Idle: New Algorithms for Speed Scaling with a Sleep State},
     759    booktitle   = {Proceedings of the 2012  Annual ACM-SIAM Symposium on Discrete Algorithms (SODA)},
     760    doi         = {10.1137/1.9781611973099.100},
     761    URL         = {https://epubs.siam.org/doi/abs/10.1137/1.9781611973099.100},
     762    eprint      = {https://epubs.siam.org/doi/pdf/10.1137/1.9781611973099.100},
     763    year        = 2012,
     764    month       = jan,
     765    pages       = {1266-1285},
     766}
  • doc/theses/thierry_delisle_PhD/thesis/text/core.tex

    r9e23b446 rffec1bf  
    11\chapter{Scheduling Core}\label{core}
    22
    3 Before discussing scheduling in general, where it is important to address systems that are changing states, this document discusses scheduling in a somewhat ideal scenario, where the system has reached a steady state. For this purpose, a steady state is loosely defined as a state where there are always \glspl{thrd} ready to run and the system has the resources necessary to accomplish the work, \eg, enough workers. In short, the system is neither overloaded nor underloaded.
    4 
    5 It is important to discuss the steady state first because it is the easiest case to handle and, relatedly, the case in which the best performance is to be expected. As such, when the system is either overloaded or underloaded, a common approach is to try to adapt the system to this new load and return to the steady state, \eg, by adding or removing workers. Therefore, flaws in scheduling the steady state tend to be pervasive in all states.
      3Before discussing scheduling in general, which must address systems whose state is changing, this document first discusses scheduling in a somewhat ideal scenario, where the system has reached a steady state.
     4For this purpose, a steady state is loosely defined as a state where there are always \glspl{thrd} ready to run and the system has the resources necessary to accomplish the work, \eg, enough workers.
     5In short, the system is neither overloaded nor underloaded.
     6
     7It is important to discuss the steady state first because it is the easiest case to handle and, relatedly, the case in which the best performance is to be expected.
     8As such, when the system is either overloaded or underloaded, a common approach is to try to adapt the system to this new load and return to the steady state, \eg, by adding or removing workers.
     9Therefore, flaws in scheduling the steady state tend to be pervasive in all states.
    610
    711\section{Design Goals}
    8 As with most of the design decisions behind \CFA, an important goal is to match the expectation of the programmer according to their execution mental-model. To match expectations, the design must offer the programmer sufficient guarantees so that, as long as they respect the execution mental-model, the system also respects this model.
      12As with most of the design decisions behind \CFA, an important goal is to match the expectations of the programmer according to their execution mental-model.
     13To match expectations, the design must offer the programmer sufficient guarantees so that, as long as they respect the execution mental-model, the system also respects this model.
    914
     1015For threading, a simple and common execution mental-model is the ``Ideal multi-tasking CPU'':
     
    1722Applied to threads, this model states that every ready \gls{thrd} immediately runs in parallel with all other ready \glspl{thrd}. While a strict implementation of this model is not feasible, programmers still have expectations about scheduling that come from this model.
    1823
    19 In general, the expectation at the center of this model is that ready \glspl{thrd} do not interfere with each other but simply share the hardware. This assumption makes it easier to reason about threading because ready \glspl{thrd} can be thought of in isolation and the effect of the scheduler can be virtually ignored. This expectation of \gls{thrd} independence means the scheduler is expected to offer two guarantees:
     24In general, the expectation at the center of this model is that ready \glspl{thrd} do not interfere with each other but simply share the hardware.
     25This assumption makes it easier to reason about threading because ready \glspl{thrd} can be thought of in isolation and the effect of the scheduler can be virtually ignored.
     26This expectation of \gls{thrd} independence means the scheduler is expected to offer two guarantees:
    2027\begin{enumerate}
     2128        \item A fairness guarantee: a \gls{thrd} that is ready to run is not prevented from running by another thread.
     
    2330\end{enumerate}
    2431
    25 It is important to note that these guarantees are expected only up to a point. \Glspl{thrd} that are ready to run should not be prevented to do so, but they still share the limited hardware resources. Therefore, the guarantee is considered respected if a \gls{thrd} gets access to a \emph{fair share} of the hardware resources, even if that share is very small.
    26 
    27 Similarly the performance guarantee, the lack of interference among threads, is only relevant up to a point. Ideally, the cost of running and blocking should be constant regardless of contention, but the guarantee is considered satisfied if the cost is not \emph{too high} with or without contention. How much is an acceptable cost is obviously highly variable. For this document, the performance experimentation attempts to show the cost of scheduling is at worst equivalent to existing algorithms used in popular languages. This demonstration can be made by comparing applications built in \CFA to applications built with other languages or other models. Recall programmer expectation is that the impact of the scheduler can be ignored. Therefore, if the cost of scheduling is compatitive to other popular languages, the guarantee will be consider achieved.
    28 
     32It is important to note that these guarantees are expected only up to a point.
      33\Glspl{thrd} that are ready to run should not be prevented from doing so, but they still share the limited hardware resources.
     34Therefore, the guarantee is considered respected if a \gls{thrd} gets access to a \emph{fair share} of the hardware resources, even if that share is very small.
     35
      36Similarly, the performance guarantee, \ie the lack of interference among threads, is only relevant up to a point.
     37Ideally, the cost of running and blocking should be constant regardless of contention, but the guarantee is considered satisfied if the cost is not \emph{too high} with or without contention.
      38What constitutes an acceptable cost is obviously highly variable.
     39For this document, the performance experimentation attempts to show the cost of scheduling is at worst equivalent to existing algorithms used in popular languages.
     40This demonstration can be made by comparing applications built in \CFA to applications built with other languages or other models.
     41Recall programmer expectation is that the impact of the scheduler can be ignored.
      42Therefore, if the cost of scheduling is competitive with other popular languages, the guarantee is considered achieved.
     2943More precisely, the scheduler should be:
    3044\begin{itemize}
     
    3448
    3549\subsection{Fairness Goals}
    36 For this work fairness will be considered as having two strongly related requirements: true starvation freedom and ``fast'' load balancing.
    37 
    38 \paragraph{True starvation freedom} is more easily defined: As long as at least one \proc continues to dequeue \ats, all read \ats should be able to run eventually.
    39 In any running system, \procs can stop dequeing \ats if they start running a \at that will simply never park.
    40 Traditional workstealing schedulers do not have starvation freedom in these cases.
     50For this work, fairness is considered to have two strongly related requirements: true starvation freedom and ``fast'' load balancing.
     51
     52\paragraph{True starvation freedom} means as long as at least one \proc continues to dequeue \ats, all ready \ats should be able to run eventually, \ie, eventual progress.
     53In any running system, a \proc can stop dequeuing \ats if it starts running a \at that never blocks.
     54Without preemption, traditional work-stealing schedulers do not have starvation freedom in this case.
     4155Now this requirement raises the question: what about preemption?
     4256Generally speaking, preemption happens on the timescale of several milliseconds, which brings us to the next requirement: ``fast'' load balancing.
    4357
    4458\paragraph{Fast load balancing} means that load balancing should happen faster than preemption would normally allow.
    45 For interactive applications that need to run at 60, 90, 120 frames per second, \ats having to wait for several millseconds to run are effectively starved.
     59For interactive applications that need to run at 60, 90, 120 frames per second, \ats having to wait for several milliseconds to run are effectively starved.
    4660Therefore load-balancing should be done at a faster pace, one that can detect starvation at the microsecond scale.
    4761With that said, this is a much fuzzier requirement since it depends on the number of \procs, the number of \ats and the general load of the system.
    4862
    4963\subsection{Fairness vs Scheduler Locality} \label{fairnessvlocal}
    50 An important performance factor in modern architectures is cache locality. Waiting for data at lower levels or not present in the cache can have a major impact on performance. Having multiple \glspl{hthrd} writing to the same cache lines also leads to cache lines that must be waited on. It is therefore preferable to divide data among each \gls{hthrd}\footnote{This partitioning can be an explicit division up front or using data structures where different \glspl{hthrd} are naturally routed to different cache lines.}.
    51 
    52 For a scheduler, having good locality\footnote{This section discusses \emph{internal locality}, \ie, the locality of the data used by the scheduler versus \emph{external locality}, \ie, how the data used by the application is affected by scheduling. External locality is a much more complicated subject and is discussed in the next section.}, \ie, having the data local to each \gls{hthrd}, generally conflicts with fairness. Indeed, good locality often requires avoiding the movement of cache lines, while fairness requires dynamically moving a \gls{thrd}, and as consequence cache lines, to a \gls{hthrd} that is currently available.
    53 
    54 However, I claim that in practice it is possible to strike a balance between fairness and performance because these goals do not necessarily overlap temporally, where Figure~\ref{fig:fair} shows a visual representation of this behaviour. As mentioned, some unfairness is acceptable; therefore it is desirable to have an algorithm that prioritizes cache locality as long as thread delay does not exceed the execution mental-model.
     64An important performance factor in modern architectures is cache locality.
      65Waiting for data from lower levels of the cache hierarchy, or for data not present in any cache, can have a major impact on performance.
      66Having multiple \glspl{hthrd} write to the same cache lines also forces waiting on those cache lines.
      67It is therefore preferable to divide the data among the \glspl{hthrd}\footnote{This partitioning can be an explicit division up front or using data structures where different \glspl{hthrd} are naturally routed to different cache lines.}.
     68
     69For a scheduler, having good locality, \ie, having the data local to each \gls{hthrd}, generally conflicts with fairness.
     70Indeed, good locality often requires avoiding the movement of cache lines, while fairness requires dynamically moving a \gls{thrd}, and as consequence cache lines, to a \gls{hthrd} that is currently available.
     71Note that this section discusses \emph{internal locality}, \ie, the locality of the data used by the scheduler versus \emph{external locality}, \ie, how the data used by the application is affected by scheduling.
     72External locality is a much more complicated subject and is discussed in the next section.
     73
     74However, I claim that in practice it is possible to strike a balance between fairness and performance because these goals do not necessarily overlap temporally.
     75Figure~\ref{fig:fair} shows a visual representation of this behaviour.
      76As mentioned, some unfairness is acceptable; therefore it is desirable to have an algorithm that prioritizes cache locality as long as threads are not delayed enough to break the execution mental-model.
    5577
    5678\begin{figure}
     
    5880        \input{fairness.pstex_t}
    5981        \vspace*{-10pt}
    60         \caption[Fairness vs Locality graph]{Rule of thumb Fairness vs Locality graph \smallskip\newline The importance of Fairness and Locality while a ready \gls{thrd} awaits running is shown as the time the ready \gls{thrd} waits increases, Ready Time, the chances that its data is still in cache, Locality, decreases. At the same time, the need for fairness increases since other \glspl{thrd} may have the chance to run many times, breaking the fairness model. Since the actual values and curves of this graph can be highly variable, the graph is an idealized representation of the two opposing goals.}
      82        \caption[Fairness vs Locality graph]{Rule-of-thumb Fairness vs Locality graph \smallskip\newline As the time a ready \gls{thrd} waits to run (Ready Time) increases, the chance that its data is still in cache (Locality) decreases.
     83        At the same time, the need for fairness increases since other \glspl{thrd} may have the chance to run many times, breaking the fairness model.
     84        Since the actual values and curves of this graph can be highly variable, the graph is an idealized representation of the two opposing goals.}
    6185        \label{fig:fair}
    6286\end{figure}
    6387
    6488\subsection{Performance Challenges}\label{pref:challenge}
    65 While there exists a multitude of potential scheduling algorithms, they generally always have to contend with the same performance challenges. Since these challenges are recurring themes in the design of a scheduler it is relevant to describe the central ones here before looking at the design.
      89While a multitude of potential scheduling algorithms exist, they all have to contend with the same performance challenges.
     90Since these challenges are recurring themes in the design of a scheduler it is relevant to describe the central ones here before looking at the design.
    6691
    6792\subsubsection{Scalability}
     
     6994Given a large number of \procs and an even larger number of \ats, scalability measures how fast \procs can enqueue and dequeue \ats.
     7095One could expect that doubling the number of \procs would double the rate at which \ats are dequeued, but contention on the internal data structures of the scheduler can greatly diminish this improvement.
    71 While the ready-queue itself can be sharded to alleviate the main source of contention, auxillary scheduling features, \eg counting ready \ats, can also be sources of contention.
     96While the ready-queue itself can be sharded to alleviate the main source of contention, auxiliary scheduling features, \eg counting ready \ats, can also be sources of contention.
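
    To make this concrete, a common mitigation is to shard the auxiliary counter itself; the following is a minimal C sketch under assumed names and a fixed shard count (not taken from the CFA runtime), where updates never share a cache line and only readers pay an O(shards) sum:

        #include <stdatomic.h>

        #define NSHARD 64                          /* assumed: >= number of procs */

        /* One counter per shard, each on its own cache line, so that
         * concurrent increments by different procs never contend. */
        struct shard { _Alignas(64) _Atomic long cnt; };
        static struct shard ready_cnt[NSHARD];

        /* Writers touch only their own shard: no shared cache-line traffic. */
        static inline void ready_inc(int proc_id) {
            atomic_fetch_add_explicit(&ready_cnt[proc_id % NSHARD].cnt, 1,
                                      memory_order_relaxed);
        }

        /* Readers pay instead: a racy but adequate sum over all shards. */
        static long ready_total(void) {
            long sum = 0;
            for (int i = 0; i < NSHARD; i += 1)
                sum += atomic_load_explicit(&ready_cnt[i].cnt, memory_order_relaxed);
            return sum;
        }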
    7297
    7398\subsubsection{Migration Cost}
    74 Another important source of latency in scheduling is migration.
    75 An \at is said to have migrated if it is executed by two different \proc consecutively, which is the process discussed in \ref{fairnessvlocal}.
    76 Migrations can have many different causes, but it certain programs it can be all but impossible to limit migrations.
    77 Chapter~\ref{microbench} for example, has a benchmark where any \at can potentially unblock any other \at, which can leat to \ats migrating more often than not.
    78 Because of this it is important to design the internal data structures of the scheduler to limit the latency penalty from migrations.
     99Another important source of scheduling latency is migration.
     100A \at migrates if it executes on two different \procs consecutively, which is the process discussed in \ref{fairnessvlocal}.
     101Migrations can have many different causes, but in certain programs, it can be impossible to limit migration.
     102Chapter~\ref{microbench} has a benchmark where any \at can potentially unblock any other \at, which can lead to \ats migrating frequently.
     103Hence, it is important to design the internal data structures of the scheduler to limit any latency penalty from migrations.
    79104
    80105
    81106\section{Inspirations}
    82 In general, a na\"{i}ve \glsxtrshort{fifo} ready-queue does not scale with increased parallelism from \glspl{hthrd}, resulting in decreased performance. The problem is adding/removing \glspl{thrd} is a single point of contention. As shown in the evaluation sections, most production schedulers do scale when adding \glspl{hthrd}. The solution to this problem is to shard the ready-queue : create multiple sub-ready-queues that multiple \glspl{hthrd} can access and modify without interfering.
    83 
    84 Before going into the design of \CFA's scheduler proper, it is relevant to discuss two sharding solutions which served as the inspiration scheduler in this thesis.
     107In general, a na\"{i}ve \glsxtrshort{fifo} ready-queue does not scale with increased parallelism from \glspl{hthrd}, resulting in decreased performance.
     108The problem is a single point of contention when adding/removing \ats.
     109As shown in the evaluation sections, most production schedulers do scale when adding \glspl{hthrd}.
      110The solution to this problem is to shard the ready-queue: create multiple \emph{subqueues} that together form the logical ready-queue, which multiple \glspl{hthrd} can then access without interfering.
     111
      112Before going into the design of \CFA's scheduler, it is relevant to discuss two sharding solutions that served as inspiration for the scheduler in this thesis.
    85113
    86114\subsection{Work-Stealing}
    87115
    88 As mentioned in \ref{existing:workstealing}, a popular pattern shard the ready-queue is work-stealing.
    89 In this pattern each \gls{proc} has its own local ready-queue and \glspl{proc} only access each other's ready-queue if they run out of work on their local ready-queue.
    90 The interesting aspect of workstealing happen in easier scheduling cases, \ie enough work for everyone but no more and no load balancing needed.
    91 In these cases, work-stealing is close to optimal scheduling: it can achieve perfect locality and have no contention.
     116As mentioned in \ref{existing:workstealing}, a popular sharding approach for the ready-queue is work-stealing.
      117In this approach, each \gls{proc} has its own local subqueue and \glspl{proc} only access each other's subqueue if they run out of work on their own.
     118The interesting aspect of work stealing happens in the steady-state scheduling case, \ie all \glspl{proc} have work and no load balancing is needed.
     119In this case, work stealing is close to optimal scheduling: it can achieve perfect locality and have no contention.
    92120On the other hand, work-stealing schedulers only attempt to do load-balancing when a \gls{proc} runs out of work.
    93121This means that the scheduler never balances unfair loads unless they result in a \gls{proc} running out of work.
    94 Chapter~\ref{microbench} shows that in pathological cases this problem can lead to indefinite starvation.
    95 
    96 
    97 Based on these observation, the conclusion is that a \emph{perfect} scheduler should behave very similarly to work-stealing in the easy cases, but should have more proactive load-balancing if the need arises.
    98 
    99 \subsection{Relaxed-Fifo}
    100 An entirely different scheme is to create a ``relaxed-FIFO'' queue as in \todo{cite Trevor's paper}. This approach forgos any ownership between \gls{proc} and ready-queue, and simply creates a pool of ready-queues from which the \glspl{proc} can pick from.
    101 \Glspl{proc} choose ready-queus at random, but timestamps are added to all elements of the queue and dequeues are done by picking two queues and dequeing the oldest element.
    102 All subqueues are protected by TryLocks and \procs simply pick a different subqueue if they fail to acquire the TryLock.
    103 The result is a queue that has both decent scalability and sufficient fairness.
    104 The lack of ownership means that as long as one \gls{proc} is still able to repeatedly dequeue elements, it is unlikely that any element will stay on the queue for much longer than any other element.
    105 This contrasts with work-stealing, where \emph{any} \gls{proc} busy for an extended period of time results in all the elements on its local queue to have to wait. Unless another \gls{proc} runs out of work.
      122Chapter~\ref{microbench} shows that in pathological cases work stealing can lead to indefinite starvation.
     123
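    As a point of reference for this observation, the work-stealing policy reduces to a loop of the following shape; this is only a C sketch with assumed primitives (pop_local, steal_from), not the actual scheduler code:

        struct at;                                     /* a ready user-level task */
        struct proc { int id; /* ... local subqueue ... */ };

        extern struct at * pop_local(struct proc *);   /* assumed queue primitive */
        extern struct at * steal_from(struct proc *);  /* assumed queue primitive */

        /* Other subqueues are consulted only once the local one is empty,
         * which is why an unfair load persists until some proc runs dry. */
        struct at * next_at(struct proc * self, struct proc * procs[], int nprocs) {
            struct at * a = pop_local(self);
            for (int i = 1; a == NULL && i < nprocs; i += 1)    /* out of work */
                a = steal_from(procs[(self->id + i) % nprocs]);
            return a;                                /* NULL: spin, park, or retry */
        }
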
      124Based on these observations, the conclusion is that a \emph{perfect} scheduler should behave similarly to work-stealing in the steady-state case, but load balance proactively when the need arises.
     125
     126\subsection{Relaxed-FIFO}
     127A different scheduling approach is to create a ``relaxed-FIFO'' queue, as in \todo{cite Trevor's paper}.
     128This approach forgoes any ownership between \gls{proc} and subqueue, and simply creates a pool of ready-queues from which \glspl{proc} pick.
     129Scheduling is performed as follows:
     130\begin{itemize}
     131\item
     132All subqueues are protected by TryLocks.
     133\item
     134Timestamps are added to each element of a subqueue.
     135\item
     136A \gls{proc} randomly tests ready queues until it has acquired one or two queues.
     137\item
      138If two queues are acquired, the older of the two \ats at the front of the acquired queues is dequeued.
     139\item
      140Otherwise, the \at at the front of the single queue is dequeued.
     141\end{itemize}
     142The result is a queue that has both good scalability and sufficient fairness.
      143The lack of ownership ensures that as long as one \gls{proc} is still able to repeatedly dequeue elements, it is unlikely any element is delayed much longer than any other.
     144This guarantee contrasts with work-stealing, where a \gls{proc} with a long subqueue results in unfairness for its \ats in comparison to a \gls{proc} with a short subqueue.
     145This unfairness persists until a \gls{proc} runs out of work and steals.
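
    The dequeue policy in the list above is compact enough to sketch in C; the queue layout, trylocks, and helper names here are illustrative assumptions, not the cited design's exact code:

        #include <pthread.h>
        #include <stdint.h>
        #include <stdlib.h>

        struct at   { uint64_t ts; struct at * next; };   /* ts set at enqueue    */
        struct subq { pthread_mutex_t lock; struct at * head; };

        extern struct subq queues[];                      /* pool of subqueues    */
        extern int nqueues;

        static struct at * pop_head(struct subq * q) {    /* caller holds q->lock */
            struct at * a = q->head;
            if (a) q->head = a->next;
            return a;
        }

        /* Probe two random subqueues with trylocks; dequeue the older front. */
        struct at * rq_pop(void) {
            struct subq * a = &queues[rand() % nqueues];  /* rand(): stand-in for */
            struct subq * b = &queues[rand() % nqueues];  /* a per-proc PRNG      */
            if (pthread_mutex_trylock(&a->lock) != 0) a = NULL;     /* busy: skip */
            if (b == a || pthread_mutex_trylock(&b->lock) != 0) b = NULL;

            struct subq * pick = a;                       /* prefer the older ts  */
            if (b && (!pick || !pick->head ||
                      (b->head && b->head->ts < pick->head->ts))) pick = b;
            struct at * ret = pick ? pop_head(pick) : NULL;
            if (a) pthread_mutex_unlock(&a->lock);
            if (b) pthread_mutex_unlock(&b->lock);
            return ret;               /* NULL: re-probe with fresh random picks   */
        }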
    106146
     107147An important aspect of this scheme's fairness approach is that the timestamps make it possible to evaluate how long elements have been on the queue.
    108 However, another major aspect is that \glspl{proc} will eagerly search for these older elements instead of focusing on specific queues.
    109 
    110 While the fairness, of this scheme is good, it does suffer in terms of performance.
    111 It requires very wide sharding, \eg at least 4 queues per \gls{hthrd}, and finding non-empty queues can be difficult if there are too few ready \ats.
     148However, \glspl{proc} eagerly search for these older elements instead of focusing on specific queues, which negatively affects locality.
     149
     150While this scheme has good fairness, its performance suffers.
     151It requires wide sharding, \eg at least 4 queues per \gls{hthrd}, and finding non-empty queues is difficult when there are few ready \ats.
    112152
    113153\section{Relaxed-FIFO++}
    114 Since it has inherent fairness quelities and decent performance in the presence of many \ats, the relaxed-FIFO queue appears as a good candidate to form the basis of a scheduler.
    115 The most obvious problems is for workloads where the number of \ats is barely greater than the number of \procs.
    116 In these situations, the wide sharding means most of the sub-queues from which the relaxed queue is formed will be empty.
    117 The consequence is that when a dequeue operations attempts to pick a sub-queue at random, it is likely that it picks an empty sub-queue and will have to pick again.
    118 This problem can repeat an unbounded number of times.
      154The inherent fairness and good performance with many \ats make the relaxed-FIFO queue a good candidate to form the basis of a new scheduler.
     155The problem case is workloads where the number of \ats is barely greater than the number of \procs.
     156In these situations, the wide sharding of the ready queue means most of its subqueues are empty.
     157Furthermore, the non-empty subqueues are unlikely to hold more than one item.
     158The consequence is that a random dequeue operation is likely to pick an empty subqueue, resulting in an unbounded number of selections.
     159This state is generally unstable: each subqueue is likely to frequently toggle between being empty and nonempty.
     160Indeed, when the number of \ats is \emph{equal} to the number of \procs, every pop operation is expected to empty a subqueue and every push is expected to add to an empty subqueue.
     161In the worst case, a check of the subqueues sees all are empty or full.
    119162
    120163As this is the most obvious challenge, it is worth addressing first.
    121 The obvious solution is to supplement each subqueue with some sharded data structure that keeps track of which subqueues are empty.
    122 This data structure can take many forms, for example simple bitmask or a binary tree that tracks which branch are empty.
    123 Following a binary tree on each pick has fairly good Big O complexity and many modern architectures have powerful bitmask manipulation instructions.
    124 However, precisely tracking which sub-queues are empty is actually fundamentally problematic.
    125 The reason is that each subqueues are already a form of sharding and the sharding width has presumably already chosen to avoid contention.
    126 However, tracking which ready queue is empty is only useful if the tracking mechanism uses denser sharding than the sub queues, then it will invariably create a new source of contention.
    127 But if the tracking mechanism is not denser than the sub-queues, then it will generally not provide useful because reading this new data structure risks being as costly as simply picking a sub-queue at random.
    128 Early experiments with this approach have shown that even with low success rates, randomly picking a sub-queue can be faster than a simple tree walk.
     164The obvious solution is to supplement each sharded subqueue with data that indicates if the queue is empty/nonempty to simplify finding nonempty queues, \ie ready \glspl{at}.
     165This sharded data can be organized in different forms, \eg a bitmask or a binary tree that tracks the nonempty subqueues.
      166Specifically, many modern architectures have powerful bitmask-manipulation instructions, and searching a binary tree has good Big-O complexity.
     167However, precisely tracking nonempty subqueues is problematic.
     168The reason is that the subqueues are initially sharded with a width presumably chosen to avoid contention.
     169However, tracking which ready queue is nonempty is only useful if the tracking data is dense, \ie denser than the sharded subqueues.
     170Otherwise, it does not provide useful information because reading this new data structure risks being as costly as simply picking a subqueue at random.
      171But if the tracking mechanism \emph{is} denser than the sharded subqueues, then constant updates invariably create a new source of contention.
     172Early experiments with this approach showed that randomly picking, even with low success rates, is often faster than bit manipulations or tree walks.
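
    For concreteness, the dense tracking that loses to random picking looks roughly like this C sketch (hypothetical layout, at most 64 subqueues): the search is one load plus a bit-scan, but every empty/nonempty transition writes the single shared word, recreating the contention described above.

        #include <stdatomic.h>
        #include <stdint.h>

        static _Atomic uint64_t nonempty;        /* bit i set: queues[i] has work */

        static inline void mark_nonempty(int i) {       /* on push to empty queue */
            atomic_fetch_or_explicit(&nonempty, 1ULL << i, memory_order_release);
        }
        static inline void mark_empty(int i) {          /* on pop of last element */
            atomic_fetch_and_explicit(&nonempty, ~(1ULL << i), memory_order_release);
        }

        /* One load plus count-trailing-zeros (a GCC/Clang builtin): the powerful
         * bitmask instruction alluded to above.  Returns -1 if all appear empty. */
        static inline int find_nonempty(void) {
            uint64_t m = atomic_load_explicit(&nonempty, memory_order_acquire);
            return m ? __builtin_ctzll(m) : -1;
        }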
    129173
    130174The exception to this rule is using local tracking.
    131 If each \proc keeps track locally of which sub-queue is empty, then this can be done with a very dense data structure without introducing a new source of contention.
    132 The consequence of local tracking however, is that the information is not complete.
    133 Each \proc is only aware of the last state it saw each subqueues but does not have any information about freshness.
    134 Even on systems with low \gls{hthrd} count, \eg 4 or 8, this can quickly lead to the local information being no better than the random pick.
    135 This is due in part to the cost of this maintaining this information and its poor quality.
    136 
    137 However, using a very low cost approach to local tracking may actually be beneficial.
    138 If the local tracking is no more costly than the random pick, than \emph{any} improvement to the succes rate, however low it is, would lead to a performance benefits.
    139 This leads to the following approach:
      175If each \proc locally keeps track of empty subqueues, then this can be done with a very dense data structure without introducing a new source of contention.
     176However, the consequence of local tracking is that the information is incomplete.
      177Each \proc is only aware of the last state it saw about each subqueue, so this information quickly becomes stale.
     178Even on systems with low \gls{hthrd} count, \eg 4 or 8, this approach can quickly lead to the local information being no better than the random pick.
     179This result is due in part to the cost of maintaining information and its poor quality.
     180
      181However, using a very low-cost but inaccurate approach for local tracking can actually be beneficial.
      182If the local tracking is no more costly than a random pick, then \emph{any} improvement to the success rate, however low it is, leads to a performance benefit.
      183This suggests the following approach:
    140184
    141185\subsection{Dynamic Entropy}\cit{https://xkcd.com/2318/}
    142 The Relaxed-FIFO approach can be made to handle the case of mostly empty sub-queues by tweaking the \glsxtrlong{prng}.
    143 The \glsxtrshort{prng} state can be seen as containing a list of all the future sub-queues that will be accessed.
    144 While this is not particularly useful on its own, the consequence is that if the \glsxtrshort{prng} algorithm can be run \emph{backwards}, then the state also contains a list of all the subqueues that were accessed.
    145 Luckily, bidirectional \glsxtrshort{prng} algorithms do exist, for example some Linear Congruential Generators\cit{https://en.wikipedia.org/wiki/Linear\_congruential\_generator} support running the algorithm backwards while offering good quality and performance.
     186The Relaxed-FIFO approach can be made to handle the case of mostly empty subqueues by tweaking the \glsxtrlong{prng}.
     187The \glsxtrshort{prng} state can be seen as containing a list of all the future subqueues that will be accessed.
     188While this concept is not particularly useful on its own, the consequence is that if the \glsxtrshort{prng} algorithm can be run \emph{backwards}, then the state also contains a list of all the subqueues that were accessed.
     189Luckily, bidirectional \glsxtrshort{prng} algorithms do exist, \eg some Linear Congruential Generators\cit{https://en.wikipedia.org/wiki/Linear\_congruential\_generator} support running the algorithm backwards while offering good quality and performance.
    146190This particular \glsxtrshort{prng} can be used as follows:
    147 
    148 Each \proc maintains two \glsxtrshort{prng} states, which whill be refered to as \texttt{F} and \texttt{B}.
    149 
    150 When a \proc attempts to dequeue a \at, it picks the subqueues by running the \texttt{B} backwards.
    151 When a \proc attempts to enqueue a \at, it runs \texttt{F} forward to pick to subqueue to enqueue to.
    152 If the enqueue is successful, the state \texttt{B} is overwritten with the content of \texttt{F}.
    153 
    154 The result is that each \proc will tend to dequeue \ats that it has itself enqueued.
    155 When most sub-queues are empty, this technique increases the odds of finding \ats at very low cost, while also offering an improvement on locality in many cases.
    156 
    157 However, while this approach does notably improve performance in many cases, this algorithm is still not competitive with work-stealing algorithms.
     191\begin{itemize}
     192\item
      193Each \proc maintains two \glsxtrshort{prng} states, referred to as $F$ and $B$.
     194\item
     195When a \proc attempts to dequeue a \at, it picks a subqueue by running $B$ backwards.
     196\item
      197When a \proc attempts to enqueue a \at, it runs $F$ forward, picking a subqueue to enqueue to.
     198If the enqueue is successful, the state $B$ is overwritten with the content of $F$.
     199\end{itemize}
     200The result is that each \proc tends to dequeue \ats that it has itself enqueued.
     201When most subqueues are empty, this technique increases the odds of finding \ats at very low cost, while also offering an improvement on locality in many cases.
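
As an illustration, the following C sketch shows one way such a bidirectional generator could be structured; the constants are Knuth's well-known MMIX LCG parameters and the function names are hypothetical. Because the modulus is $2^{64}$ and the multiplier is odd, the multiplier has a multiplicative inverse modulo $2^{64}$, so stepping backwards is exact.
\begin{cfa}
#include <stdint.h>

#define A 6364136223846793005ull  // Knuth MMIX multiplier (odd)
#define C 1442695040888963407ull  // Knuth MMIX increment

static uint64_t inv_A( void ) {  // multiplicative inverse of A modulo 2^64
	uint64_t x = A;  // for odd A, A is its own inverse modulo 8
	for ( int i = 0; i < 5; i += 1 ) x *= 2 - A * x;  // Newton: doubles correct bits
	return x;
}
uint64_t forward( uint64_t * s ) {   // step F:  s' = A s + C
	return *s = A * (*s) + C;
}
uint64_t backward( uint64_t * s ) {  // step B:  s = A^-1 (s' - C)
	return *s = inv_A() * (*s - C);
}
\end{cfa}
In this scheme, an enqueue steps $F$ with @forward@ and, on success, copies $F$ into $B$; a dequeue steps $B$ with @backward@, revisiting the subqueues most recently used for enqueues.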
     202
     203Tests showed this approach performs better than relaxed-FIFO in many cases.
     204However, it is still not competitive with work-stealing algorithms.
    158205The fundamental problem is that the constant randomness limits how much locality the scheduler offers.
    159 This becomes problematic both because the scheduler is likely to get cache misses on internal data-structures and because migration become very frequent.
    160 Therefore since the approach of modifying to relaxed-FIFO algorithm to behave more like work stealing does not seem to pan out, the alternative is to do it the other way around.
     206This becomes problematic both because the scheduler is likely to get cache misses on internal data-structures and because migrations become frequent.
     207Therefore, the attempt to modify the relaxed-FIFO algorithm to behave more like work stealing did not pan out.
     208The alternative is to do it the other way around.
    161209
    162210\section{Work Stealing++}
    163 To add stronger fairness guarantees to workstealing a few changes.
      211To add stronger fairness guarantees to work stealing, a few changes are needed.
    164212First, the relaxed-FIFO algorithm has fundamentally better fairness because each \proc always monitors all subqueues.
    165 Therefore the workstealing algorithm must be prepended with some monitoring.
    166 Before attempting to dequeue from a \proc's local queue, the \proc must make some effort to make sure remote queues are not being neglected.
    167 To make this possible, \procs must be able to determie which \at has been on the ready-queue the longest.
    168 Which is the second aspect that much be added.
    169 The relaxed-FIFO approach uses timestamps for each \at and this is also what is done here.
     213Therefore, the work-stealing algorithm must be prepended with some monitoring.
     214Before attempting to dequeue from a \proc's subqueue, the \proc must make some effort to ensure other subqueues are not being neglected.
     215To make this possible, \procs must be able to determine which \at has been on the ready queue the longest.
      216Second, as in the relaxed-FIFO approach, timestamps are added to each \at to make this determination possible.
    170217
    171218\begin{figure}
    172219        \centering
    173220        \input{base.pstex_t}
    174         \caption[Base \CFA design]{Base \CFA design \smallskip\newline A Pool of sub-ready queues offers the sharding, two per \glspl{proc}. Each \gls{proc} have local subqueues, however \glspl{proc} can access any of the sub-queues. Each \at is timestamped when enqueued.}
      221        \caption[Base \CFA design]{Base \CFA design \smallskip\newline A pool of subqueues offers the sharding, two per \gls{proc}.
     222        Each \gls{proc} can access all of the subqueues.
     223        Each \at is timestamped when enqueued.}
    175224        \label{fig:base}
    176225\end{figure}
    177 The algorithm is structure as shown in Figure~\ref{fig:base}.
    178 This is very similar to classic workstealing except the local queues are placed in an array so \procs can access eachother's queue in constant time.
    179 Sharding width can be adjusted based on need.
    180 When a \proc attempts to dequeue a \at, it first picks a random remote queue and compares its timestamp to the timestamps of the local queue(s), dequeue from the remote queue if needed.
    181 
    182 Implemented as as naively state above, this approach has some obvious performance problems.
     226
     227Figure~\ref{fig:base} shows the algorithm structure.
     228This structure is similar to classic work-stealing except the subqueues are placed in an array so \procs can access them in constant time.
     229Sharding width can be adjusted based on contention.
     230Note, as an optimization, the TS of a \at is stored in the \at in front of it, so the first TS is in the array and the last \at has no TS.
     231This organization keeps the highly accessed front TSs directly in the array.
     232When a \proc attempts to dequeue a \at, it first picks a random remote subqueue and compares its timestamp to the timestamps of its local subqueue(s).
     233The oldest waiting \at is dequeued to provide global fairness.
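
A minimal C sketch of this dequeue decision (structure, sizes and names are hypothetical; locking and list manipulation are elided):
\begin{cfa}
#include <stdint.h>
#include <stdlib.h>

typedef struct {
	uint64_t head_ts;  // enqueue timestamp of the head at; UINT64_MAX if empty
	// list of ats elided
} subqueue_t;

enum { NQUEUES = 64 };
static subqueue_t queues[NQUEUES];

unsigned pick_queue( unsigned local ) {  // index of subqueue to dequeue from
	unsigned remote = rand() % NQUEUES;  // monitor one random remote subqueue
	// help the remote subqueue only if its head has waited longer (smaller TS)
	return queues[remote].head_ts < queues[local].head_ts ? remote : local;
}
\end{cfa}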
     234
      235However, this na\"ive implementation has performance problems.
    183236First, it is necessary to have some damping effect on helping.
    184 Random effects like cache misses and preemption can add spurious but short bursts of latency for which helping is not helpful, pun intended.
    185 The effect of these bursts would be to cause more migrations than needed and make this workstealing approach slowdown to the match the relaxed-FIFO approach.
      237Random effects like cache misses and preemption can add spurious but short bursts of latency, negating the attempt to help.
      238These bursts can cause increased migrations and make this work-stealing approach slow down to the level of relaxed-FIFO.
    186239
    187240\begin{figure}
     
    192245\end{figure}
    193246
    194 A simple solution to this problem is to compare an exponential moving average\cit{https://en.wikipedia.org/wiki/Moving\_average\#Exponential\_moving\_average} instead if the raw timestamps, shown in Figure~\ref{fig:base-ma}.
    195 Note that this is slightly more complex than it sounds because since the \at at the head of a subqueue is still waiting, its wait time has not ended.
    196 Therefore the exponential moving average is actually an exponential moving average of how long each already dequeued \at have waited.
    197 To compare subqueues, the timestamp at the head must be compared to the current time, yielding the bestcase wait time for the \at at the head of the queue.
      247A simple solution to this problem is to use an exponential moving average\cit{https://en.wikipedia.org/wiki/Moving\_average\#Exponential\_moving\_average} (MA) instead of raw timestamps, shown in Figure~\ref{fig:base-ma}.
     248Note, this is more complex because the \at at the head of a subqueue is still waiting, so its wait time has not ended.
     249Therefore, the exponential moving average is actually an exponential moving average of how long each dequeued \at has waited.
     250To compare subqueues, the timestamp at the head must be compared to the current time, yielding the best-case wait-time for the \at at the head of the queue.
     198251This new wait time is averaged with the stored average.
    199 To limit even more the amount of unnecessary migration, a bias can be added to the local queue, where a remote queue is helped only if its moving average is more than \emph{X} times the local queue's average.
    200 None of the experimentation that I have run with these scheduler seem to indicate that the choice of the weight for the moving average or the choice of bis is particularly important.
    201 Weigths and biases of similar \emph{magnitudes} have similar effects.
    202 
    203 With these additions to workstealing, scheduling can be made as fair as the relaxed-FIFO approach, well avoiding the majority of unnecessary migrations.
    204 Unfortunately, the performance of this approach does suffer in the cases with no risks of starvation.
    205 The problem is that the constant polling of remote subqueues generally entail a cache miss.
    206 To make things worst, remote subqueues that are very active, \ie \ats are frequently enqueued and dequeued from them, the higher the chances are that polling will incurr a cache-miss.
    207 Conversly, the active subqueues do not benefit much from helping since starvation is already a non-issue.
    208 This puts this algorithm in an akward situation where it is paying for a cost, but the cost itself suggests the operation was unnecessary.
     252To further limit migration, a bias can be added to a local subqueue, where a remote subqueue is helped only if its moving average is more than $X$ times the local subqueue's average.
     253Tests for this approach indicate the choice of the weight for the moving average or the bias is not important, \ie weights and biases of similar \emph{magnitudes} have similar effects.
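
The following C sketch shows the bookkeeping; the weight and bias values are purely illustrative, and the names are hypothetical.
\begin{cfa}
#include <stdint.h>

static const double weight = 0.5;  // EMA weight; only its magnitude matters
static const double bias   = 2.0;  // helping bias X; likewise not sensitive

typedef struct {
	uint64_t head_ts;  // enqueue timestamp of the current head
	double avg;        // EMA of the completed waits of dequeued ats
} subqueue_t;

void update_avg( subqueue_t * q, uint64_t now ) {  // on dequeue
	double wait = (double)(now - q->head_ts);       // completed wait of the head
	q->avg = weight * wait + (1.0 - weight) * q->avg;
}
double current_avg( const subqueue_t * q, uint64_t now ) {
	double best_case = (double)(now - q->head_ts);  // head is still waiting
	return weight * best_case + (1.0 - weight) * q->avg;
}
int should_help( const subqueue_t * remote, const subqueue_t * local, uint64_t now ) {
	return current_avg( remote, now ) > bias * current_avg( local, now );
}
\end{cfa}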
     254
     255With these additions to work stealing, scheduling can be made as fair as the relaxed-FIFO approach, avoiding the majority of unnecessary migrations.
     256Unfortunately, the work to achieve fairness has a performance cost, especially when the workload is inherently fair, and hence, there is only short-term or no starvation.
      257The problem is that the constant polling, \ie reads, of remote subqueues generally entails a cache miss because the TSs are constantly being updated, \ie writes.
      258To make things worse, remote subqueues that are very active, \ie \ats are frequently enqueued and dequeued from them, lead to higher chances that polling will incur a cache-miss.
     259Conversely, the active subqueues do not benefit much from helping since starvation is already a non-issue.
     260This puts this algorithm in the awkward situation of paying for a cost that is largely unnecessary.
     209261The good news is that this problem can be mitigated.
    210262
    211263\subsection{Redundant Timestamps}
    212 The problem with polling remote queues is due to a tension between the consistency requirement on the subqueue.
    213 For the subqueues, correctness is critical. There must be a consensus among \procs on which subqueues hold which \ats.
    214 Since the timestamps are use for fairness, it is alco important to have consensus and which \at is the oldest.
    215 However, when deciding if a remote subqueue is worth polling, correctness is much less of a problem.
    216 Since the only need is that a subqueue will eventually be polled, some data staleness can be acceptable.
    217 This leads to a tension where stale timestamps are only problematic in some cases.
    218 Furthermore, stale timestamps can be somewhat desirable since lower freshness requirements means less tension on the cache coherence protocol.
    219 
    220 
    221 \begin{figure}
    222         \centering
    223         % \input{base_ts2.pstex_t}
    224         \caption[\CFA design with Redundant Timestamps]{\CFA design with Redundant Timestamps \smallskip\newline A array is added containing a copy of the timestamps. These timestamps are written to with relaxed atomics, without fencing, leading to fewer cache invalidations.}
    225         \label{fig:base-ts2}
    226 \end{figure}
    227 A solution to this is to create a second array containing a copy of the timestamps and average.
     264The problem with polling remote subqueues is that correctness is critical.
     265There must be a consensus among \procs on which subqueues hold which \ats, as the \ats are in constant motion.
      266Furthermore, since timestamps are used for fairness, it is critical to have consensus on which \at is the oldest.
     267However, when deciding if a remote subqueue is worth polling, correctness is less of a problem.
     268Since the only requirement is that a subqueue is eventually polled, some data staleness is acceptable.
     269This leads to a situation where stale timestamps are only problematic in some cases.
      270Furthermore, stale timestamps can be desirable since lower freshness requirements mean fewer cache invalidations.
     271
     272Figure~\ref{fig:base-ts2} shows a solution with a second array containing a copy of the timestamps and average.
    228273This copy is updated \emph{after} the subqueue's critical sections using relaxed atomics.
    229274\Glspl{proc} now check if polling is needed by comparing the copy of the remote timestamp instead of the actual timestamp.
    230 The result is that since there is no fencing, the writes can be buffered and cause fewer cache invalidations.
    231 
    232 The correctness argument here is somewhat subtle.
     275The result is that since there is no fencing, the writes can be buffered in the hardware and cause fewer cache invalidations.
     276
     277\begin{figure}
     278        \centering
     279        \input{base_ts2.pstex_t}
     280        \caption[\CFA design with Redundant Timestamps]{\CFA design with Redundant Timestamps \smallskip\newline An array is added containing a copy of the timestamps.
     281        These timestamps are written to with relaxed atomics, so there is no order among concurrent memory accesses, leading to fewer cache invalidations.}
     282        \label{fig:base-ts2}
     283\end{figure}
     284
     285The correctness argument is somewhat subtle.
    233286The data used for deciding whether or not to poll a queue can be stale as long as it does not cause starvation.
    234 Therefore, it is acceptable if stale data make queues appear older than they really are but not fresher.
    235 For the timestamps, this means that missing writes to the timestamp is acceptable since they will make the head \at look older.
    236 For the moving average, as long as the operation are RW-safe, the average is guaranteed to yield a value that is between the oldest and newest values written.
    237 Therefore this unprotected read of the timestamp and average satisfy the limited correctness that is required.
      287Therefore, it is acceptable if stale data makes queues appear older than they really are, but appearing fresher can be a problem.
      288For the timestamps, this means missing writes to the timestamp are acceptable since they make the head \at look older.
      289For the moving average, as long as the operations are just atomic reads/writes, the average is guaranteed to yield a value that is between the oldest and newest values written.
      290Therefore, this unprotected read of the timestamp and average satisfies the limited correctness that is required.
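
In C11 terms, the copy could be maintained as follows (a minimal sketch, assuming a 64-entry copy array and hypothetical names):
\begin{cfa}
#include <stdatomic.h>
#include <stdint.h>

static _Atomic uint64_t ts_copy[64];  // redundant copies, one per subqueue

void publish_ts( unsigned i, uint64_t ts ) {  // after the subqueue critical section
	// relaxed store: no fence, so the write can sit in the store buffer
	atomic_store_explicit( &ts_copy[i], ts, memory_order_relaxed );
}
uint64_t peek_ts( unsigned i ) {  // used only to decide whether to poll
	// relaxed load: possibly stale, which only makes the subqueue look older
	return atomic_load_explicit( &ts_copy[i], memory_order_relaxed );
}
\end{cfa}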
     291
     292With redundant timestamps, this scheduling algorithm achieves both the fairness and performance requirements on most machines.
     293The problem is that the cost of polling and helping is not necessarily consistent across each \gls{hthrd}.
      294For example, on machines with a CPU containing multiple hyperthreads and cores and multiple CPU sockets, cache misses can be satisfied from the caches on the same (local) CPU, or by a CPU on a different (remote) socket.
     295Cache misses satisfied by a remote CPU have significantly higher latency than from the local CPU.
     296However, these delays are not specific to systems with multiple CPUs.
     297Depending on the cache structure, cache misses can have different latency on the same CPU, \eg the AMD EPYC 7662 CPUs used in Chapter~\ref{microbench}.
    238298
    239299\begin{figure}
    240300        \centering
    241301        \input{cache-share.pstex_t}
    242         \caption[CPU design with wide L3 sharing]{CPU design with wide L3 sharing \smallskip\newline A very simple CPU with 4 \glspl{hthrd}. L1 and L2 are private to each \gls{hthrd} but the L3 is shared across to entire core.}
     302        \caption[CPU design with wide L3 sharing]{CPU design with wide L3 sharing \smallskip\newline A CPU with 4 cores, where caches L1 and L2 are private to each core, and the L3 cache is shared across all cores.}
    243303        \label{fig:cache-share}
    244 \end{figure}
    245 
    246 \begin{figure}
    247         \centering
     304
     305        \vspace{25pt}
     306
    248307        \input{cache-noshare.pstex_t}
    249         \caption[CPU design with a narrower L3 sharing]{CPU design with a narrower L3 sharing \smallskip\newline A different CPU design, still with 4 \glspl{hthrd}. L1 and L2 are still private to each \gls{hthrd} but the L3 is shared some of the CPU but there is still two distinct L3 instances.}
     308        \caption[CPU design with a narrower L3 sharing]{CPU design with a narrow L3 sharing \smallskip\newline A CPU with 4 cores, where caches L1 and L2 are private to each core, and the L3 cache is shared across a pair of cores.}
    250309        \label{fig:cache-noshare}
    251310\end{figure}
    252311
    253 With redundant tiemstamps this scheduling algorithm achieves both the fairness and performance requirements, on some machines.
    254 The problem is that the cost of polling and helping is not necessarily consistent across each \gls{hthrd}.
    255 For example, on machines where the motherboard holds multiple CPU, cache misses can be satisfied from a cache that belongs to the CPU that missed, the \emph{local} CPU, or by a different CPU, a \emph{remote} one.
    256 Cache misses that are satisfied by a remote CPU will have higher latency than if it is satisfied by the local CPU.
    257 However, this is not specific to systems with multiple CPUs.
    258 Depending on the cache structure, cache-misses can have different latency for the same CPU.
    259 The AMD EPYC 7662 CPUs that is described in Chapter~\ref{microbench} is an example of that.
    260 Figure~\ref{fig:cache-share} and Figure~\ref{fig:cache-noshare} show two different cache topologies with highlight this difference.
    261 In Figure~\ref{fig:cache-share}, all cache instances are either private to a \gls{hthrd} or shared to the entire system, this means latency due to cache-misses are likely fairly consistent.
    262 By comparison, in Figure~\ref{fig:cache-noshare} misses in the L2 cache can be satisfied by a hit in either instance of the L3.
    263 However, the memory access latency to the remote L3 instance will be notably higher than the memory access latency to the local L3.
    264 The impact of these different design on this algorithm is that scheduling will scale very well on architectures similar to Figure~\ref{fig:cache-share}, both will have notably worst scalling with many narrower L3 instances.
    265 This is simply because as the number of L3 instances grow, so two does the chances that the random helping will cause significant latency.
    266 The solution is to have the scheduler be aware of the cache topology.
     312Figures~\ref{fig:cache-share} and~\ref{fig:cache-noshare} show two different cache topologies that highlight this difference.
      313In Figure~\ref{fig:cache-share}, all cache instances are either private to a core or shared across all cores.
      314This means the latency due to cache misses is fairly consistent.
     315In contrast, in Figure~\ref{fig:cache-noshare} misses in the L2 cache can be satisfied by either instance of L3 cache.
     316However, the memory-access latency to the remote L3 is higher than the memory-access latency to the local L3.
     317The impact of these different designs on this algorithm is that scheduling only scales well on architectures with a wide L3 cache, similar to Figure~\ref{fig:cache-share}, and less well on architectures with many narrower L3 cache instances, similar to Figure~\ref{fig:cache-noshare}.
      318Hence, as the number of L3 instances grows, so too does the chance that the random helping causes significant cache latency.
      319The solution is for the scheduler to be aware of the cache topology.
    267320
    268321\subsection{Per CPU Sharding}
    269 Building a scheduler that is aware of cache topology poses two main challenges: discovering cache topology and matching \procs to cache instance.
    270 Sadly, there is no standard portable way to discover cache topology in C.
    271 Therefore, while this is a significant portability challenge, it is outside the scope of this thesis to design a cross-platform cache discovery mechanisms.
    272 The rest of this work assumes discovering the cache topology based on Linux's \texttt{/sys/devices/system/cpu} directory.
    273 This leaves the challenge of matching \procs to cache instance, or more precisely identifying which subqueues of the ready queue are local to which cache instance.
    274 Once this matching is available, the helping algorithm can be changed to add bias so that \procs more often help subqueues local to the same cache instance
    275 \footnote{Note that like other biases mentioned in this section, the actual bias value does not appear to need precise tuinng.}.
    276 
    277 The obvious approach to mapping cache instances to subqueues is to statically tie subqueues to CPUs.
    278 Instead of having each subqueue local to a specific \proc, the system is initialized with subqueues for each \glspl{hthrd} up front.
    279 Then \procs dequeue and enqueue by first asking which CPU id they are local to, in order to identify which subqueues are the local ones.
    280 \Glspl{proc} can get the CPU id from \texttt{sched\_getcpu} or \texttt{librseq}.
    281 
    282 This approach solves the performance problems on systems with topologies similar to Figure~\ref{fig:cache-noshare}.
    283 However, it actually causes some subtle fairness problems in some systems, specifically systems with few \procs and many \glspl{hthrd}.
    284 In these cases, the large number of subqueues and the bias agains subqueues tied to different cache instances make it so it is very unlikely any single subqueue is picked.
    285 To make things worst, the small number of \procs mean that few helping attempts will be made.
    286 This combination of few attempts and low chances make it so a \at stranded on a subqueue that is not actively dequeued from may wait very long before it gets randomly helped.
     322Building a scheduler that is cache aware poses two main challenges: discovering the cache topology and matching \procs to this cache structure.
     323Unfortunately, there is no portable way to discover cache topology, and it is outside the scope of this thesis to solve this problem.
     324This work uses the cache topology information from Linux's @/sys/devices/system/cpu@ directory.
     325This leaves the challenge of matching \procs to cache structure, or more precisely identifying which subqueues of the ready queue are local to which subcomponents of the cache structure.
     326Once a matching is generated, the helping algorithm is changed to add bias so that \procs more often help subqueues local to the same cache substructure.\footnote{
     327Note that like other biases mentioned in this section, the actual bias value does not appear to need precise tuning.}
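
For example, on Linux the set of CPUs sharing a given cache instance can be read from sysfs; a minimal C sketch, assuming @index3@ denotes the L3 as is typical:
\begin{cfa}
#include <stdio.h>

// read which CPUs share the L3 with the given CPU, e.g. "0-3,64-67"
int l3_siblings( unsigned cpu, char * buf, int len ) {
	char path[128];
	snprintf( path, sizeof(path),
		"/sys/devices/system/cpu/cpu%u/cache/index3/shared_cpu_list", cpu );
	FILE * f = fopen( path, "r" );
	if ( ! f ) return -1;
	int ok = fgets( buf, len, f ) != 0;
	fclose( f );
	return ok ? 0 : -1;
}
\end{cfa}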
     328
     329The simplest approach for mapping subqueues to cache structure is to statically tie subqueues to CPUs.
     330Instead of having each subqueue local to a specific \proc, the system is initialized with subqueues for each hardware hyperthread/core up front.
     331Then \procs dequeue and enqueue by first asking which CPU id they are executing on, in order to identify which subqueues are the local ones.
     332\Glspl{proc} can get the CPU id from @sched_getcpu@ or @librseq@.
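
A minimal C sketch of this static mapping using @sched_getcpu@ (the sizing and names are hypothetical):
\begin{cfa}
#define _GNU_SOURCE
#include <sched.h>

enum { PER_CPU = 1 };  // subqueues per hardware thread, allocated upfront

unsigned local_queue_index( void ) {
	int cpu = sched_getcpu();        // executing CPU; may change on migration
	return (unsigned)cpu * PER_CPU;  // first subqueue statically tied to this CPU
}
\end{cfa}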
     333
      334This approach solves the performance problems on systems with narrow L3 topologies, similar to Figure~\ref{fig:cache-noshare}.
     335However, it can still cause some subtle fairness problems in systems with few \procs and many \glspl{hthrd}.
      336In this case, the large number of subqueues and the bias against subqueues tied to different cache substructures make it unlikely that any given subqueue is picked.
      337To make things worse, the small number of \procs means that few helping attempts are made.
      338This combination of low selection and few helping attempts allows a \at to become stranded on a subqueue for a long time until it gets randomly helped.
    287339On a system with 2 \procs, 256 \glspl{hthrd} with narrow cache sharing, and a 100:1 bias, it can actually take multiple seconds for a \at to get dequeued from a remote queue.
    288340Therefore, a more dynamic matching of subqueues to cache instance is needed.
    289341
    290342\subsection{Topological Work Stealing}
    291 The approach that is used in the \CFA scheduler is to have per-\proc subqueue, but have an excplicit data-structure track which cache instance each subqueue is tied to.
    292 This is requires some finess because reading this data structure must lead to fewer cache misses than not having the data structure in the first place.
     343\label{s:TopologicalWorkStealing}
     344Therefore, the approach used in the \CFA scheduler is to have per-\proc subqueues, but have an explicit data-structure track which cache substructure each subqueue is tied to.
     345This tracking requires some finesse because reading this data structure must lead to fewer cache misses than not having the data structure in the first place.
     293346A key element, however, is that, like the timestamps for helping, reading the cache instance mapping only needs to give the correct result \emph{often enough}.
    294 Therefore the algorithm can be built as follows: Before enqueuing or dequeing a \at, each \proc queries the CPU id and the corresponding cache instance.
      347Therefore, the algorithm can be built as follows: before enqueuing or dequeuing a \at, each \proc queries the CPU id and the corresponding cache instance.
    295348Since subqueues are tied to \procs, each \proc can then update the cache instance mapped to the local subqueue(s).
    296349To avoid unnecessary cache line invalidation, the map is only written to if the mapping changes.
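
A C sketch of this lazy update (the array size and names are hypothetical):
\begin{cfa}
#include <stdatomic.h>

static _Atomic int cache_of_queue[64];  // cache instance each subqueue was last local to

void refresh_mapping( unsigned queue, int cache_instance ) {
	// read first and write only on change, avoiding needless cache-line invalidations
	if ( atomic_load_explicit( &cache_of_queue[queue], memory_order_relaxed ) != cache_instance )
		atomic_store_explicit( &cache_of_queue[queue], cache_instance, memory_order_relaxed );
}
\end{cfa}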
    297350
     351This scheduler is used in the remainder of the thesis for managing CPU execution, but additional scheduling is needed to handle long-term blocking and unblocking, such as I/O.
     352
  • doc/theses/thierry_delisle_PhD/thesis/text/eval_micro.tex

    r9e23b446 rffec1bf  
    11\chapter{Micro-Benchmarks}\label{microbench}
    22
    3 The first step of evaluation is always to test-out small controlled cases, to ensure that the basics are working properly.
    4 This sections presents five different experimental setup, evaluating some of the basic features of \CFA's scheduler.
     3The first step in evaluating this work is to test-out small controlled cases to ensure the basics work properly.
      3This chapter presents five different experimental setups, evaluating some of the basic features of \CFA's scheduler.
    55
    66\section{Benchmark Environment}
    7 All of these benchmarks are run on two distinct hardware environment, an AMD and an INTEL machine.
    8 
    9 For all benchmarks, \texttt{taskset} is used to limit the experiment to 1 NUMA Node with no hyper threading.
     7All benchmarks are run on two distinct hardware platforms.
     8\begin{description}
     9\item[AMD] is a server with two AMD EPYC 7662 CPUs and 256GB of DDR4 RAM.
     10The EPYC CPU has 64 cores with 2 \glspl{hthrd} per core, for 128 \glspl{hthrd} per socket with 2 sockets for a total of 256 \glspl{hthrd}.
     11Each CPU has 4 MB, 64 MB and 512 MB of L1, L2 and L3 caches, respectively.
      12Each L1 and L2 instance is only shared by the \glspl{hthrd} on a given core, but each L3 instance is shared by 4 cores, therefore 8 \glspl{hthrd}.
     13The server runs Ubuntu 20.04.2 LTS on top of Linux Kernel 5.8.0-55.
     14
     15\item[Intel] is a server with four Intel Xeon Platinum 8160 CPUs and 384GB of DDR4 RAM.
      16The Xeon CPU has 24 cores with 2 \glspl{hthrd} per core, for 48 \glspl{hthrd} per socket with 4 sockets for a total of 192 \glspl{hthrd}.
      17Each CPU has 3 MB, 96 MB and 132 MB of L1, L2 and L3 caches, respectively.
      18Each L1 and L2 instance is only shared by the \glspl{hthrd} on a given core, but each L3 instance is shared across the entire CPU, therefore 48 \glspl{hthrd}.
     19The server runs Ubuntu 20.04.2 LTS on top of Linux Kernel 5.8.0-55.
     20\end{description}
     21
      22For all benchmarks, @taskset@ is used to limit the experiment to 1 NUMA Node with no hyperthreading.
    1023If more \glspl{hthrd} are needed, then 1 NUMA Node with hyperthreading is used.
    11 If still more \glspl{hthrd} are needed then the experiment is limited to as few NUMA Nodes as needed.
    12 
    13 
    14 \paragraph{AMD} The AMD machine is a server with two AMD EPYC 7662 CPUs and 256GB of DDR4 RAM.
    15 The server runs Ubuntu 20.04.2 LTS on top of Linux Kernel 5.8.0-55.
    16 These EPYCs have 64 cores per CPUs and 2 \glspl{hthrd} per core, for a total of 256 \glspl{hthrd}.
    17 The cpus each have 4 MB, 64 MB and 512 MB of L1, L2 and L3 caches respectively.
    18 Each L1 and L2 instance are only shared by \glspl{hthrd} on a given core, but each L3 instance is shared by 4 cores, therefore 8 \glspl{hthrd}.
    19 
    20 \paragraph{Intel} The Intel machine is a server with four Intel Xeon Platinum 8160 CPUs and 384GB of DDR4 RAM.
    21 The server runs Ubuntu 20.04.2 LTS on top of Linux Kernel 5.8.0-55.
    22 These Xeon Platinums have 24 cores per CPUs and 2 \glspl{hthrd} per core, for a total of 192 \glspl{hthrd}.
    23 The cpus each have 3 MB, 96 MB and 132 MB of L1, L2 and L3 caches respectively.
    24 Each L1 and L2 instance are only shared by \glspl{hthrd} on a given core, but each L3 instance is shared across the entire CPU, therefore 48 \glspl{hthrd}.
    25 
    26 This limited sharing of the last level cache on the AMD machine is markedly different than the Intel machine. Indeed, while on both architectures L2 cache misses that are served by L3 caches on a different cpu incurr a significant latency, on AMD it is also the case that cache misses served by a different L3 instance on the same cpu still incur high latency.
     24If still more \glspl{hthrd} are needed, then the experiment is limited to as few NUMA Nodes as needed.
     25
      26The limited sharing of the last-level cache on the AMD machine is markedly different from that of the Intel machine.
     27Indeed, while on both architectures L2 cache misses that are served by L3 caches on a different CPU incur a significant latency, on the AMD it is also the case that cache misses served by a different L3 instance on the same CPU still incur high latency.
    2728
    2829
     
    3435        \label{fig:cycle}
    3536\end{figure}
    36 The most basic evaluation of any ready queue is to evaluate the latency needed to push and pop one element from the ready-queue.
    37 Since these two operation also describe a \texttt{yield} operation, many systems use this as the most basic benchmark.
    38 However, yielding can be treated as a special case, since it also carries the information that the number of the ready \glspl{at} will not change.
    39 Not all systems use this information, but those which do may appear to have better performance than they would for disconnected push/pop pairs.
    40 For this reason, I chose a different first benchmark, which I call the Cycle Benchmark.
    41 This benchmark arranges many \glspl{at} into multiple rings of \glspl{at}.
    42 Each ring is effectively a circular singly-linked list.
     37The most basic evaluation of any ready queue is to evaluate the latency needed to push and pop one element from the ready queue.
      38Since these two operations also describe a @yield@ operation, many systems use this operation as the most basic benchmark.
     39However, yielding can be treated as a special case by optimizing it away (dead code) since the number of ready \glspl{at} does not change.
     40Not all systems perform this optimization, but those that do have an artificial performance benefit because the yield becomes a \emph{nop}.
     41For this reason, I chose a different first benchmark, called \newterm{Cycle Benchmark}.
     42This benchmark arranges a number of \glspl{at} into a ring, as seen in Figure~\ref{fig:cycle}, where the ring is a circular singly-linked list.
    4343At runtime, each \gls{at} unparks the next \gls{at} before parking itself.
    44 This corresponds to the desired pair of ready queue operations.
    45 Unparking the next \gls{at} requires pushing that \gls{at} onto the ready queue and the ensuing park will cause the runtime to pop a \gls{at} from the ready-queue.
    46 Figure~\ref{fig:cycle} shows a visual representation of this arrangement.
    47 
    48 The goal of this ring is that the underlying runtime cannot rely on the guarantee that the number of ready \glspl{at} will stay constant over the duration of the experiment.
     44Unparking the next \gls{at} pushes that \gls{at} onto the ready queue as does the ensuing park.
     45
     46Hence, the underlying runtime cannot rely on the number of ready \glspl{at} staying constant over the duration of the experiment.
    4947In fact, the total number of \glspl{at} waiting on the ready queue is expected to vary because of the race between the next \gls{at} unparking and the current \gls{at} parking.
    50 The size of the cycle is also decided based on this race: cycles that are too small may see the chain of unparks go full circle before the first \gls{at} can park.
    51 While this would not be a correctness problem, every runtime system must handle that race, it could lead to pushes and pops being optimized away.
    52 Since silently omitting ready-queue operations would throw off the measuring of these operations, the ring of \glspl{at} must be big enough so the \glspl{at} have the time to fully park before they are unparked.
    53 Note that this problem is only present on SMP machines and is significantly mitigated by the fact that there are multiple rings in the system.
    54 
    55 To avoid this benchmark from being dominated by the idle sleep handling, the number of rings is kept at least as high as the number of \glspl{proc} available.
    56 Beyond this point, adding more rings serves to mitigate even more the idle sleep handling.
    57 This is to avoid the case where one of the \glspl{proc} runs out of work because of the variation on the number of ready \glspl{at} mentionned above.
    58 
    59 The actual benchmark is more complicated to handle termination, but that simply requires using a binary semphore or a channel instead of raw \texttt{park}/\texttt{unpark} and carefully picking the order of the \texttt{P} and \texttt{V} with respect to the loop condition.
    60 Figure~\ref{fig:cycle:code} shows pseudo code for this benchmark.
    61 
    62 \begin{figure}
    63         \begin{lstlisting}
    64                 Thread.main() {
    65                         count := 0
    66                         for {
    67                                 wait()
    68                                 this.next.wake()
    69                                 count ++
    70                                 if must_stop() { break }
    71                         }
    72                         global.count += count
    73                 }
    74         \end{lstlisting}
    75         \caption[Cycle Benchmark : Pseudo Code]{Cycle Benchmark : Pseudo Code}
    76         \label{fig:cycle:code}
    77 \end{figure}
    78 
    79 
     48That is, the runtime cannot anticipate that the current task will immediately park.
     49As well, the size of the cycle is also decided based on this race, \eg a small cycle may see the chain of unparks go full circle before the first \gls{at} parks because of time-slicing or multiple \procs.
      50Every runtime system must handle this race and cannot optimize away the ready-queue pushes and pops.
      51To prevent any attempt at silently omitting ready-queue operations, the ring of \glspl{at} is made big enough so the \glspl{at} have time to fully park before being unparked again.
     52(Note, an unpark is like a V on a semaphore, so the subsequent park (P) may not block.)
     53Finally, to further mitigate any underlying push/pop optimizations, especially on SMP machines, multiple rings are created in the experiment.
     54
     55To avoid this benchmark being affected by idle-sleep handling, the number of rings is multiple times greater than the number of \glspl{proc}.
      56This design avoids the case where one of the \glspl{proc} runs out of work because of the variation in the number of ready \glspl{at} mentioned above.
     57
     58Figure~\ref{fig:cycle:code} shows the pseudo code for this benchmark.
     59There is additional complexity to handle termination (not shown), which requires a binary semaphore or a channel instead of raw @park@/@unpark@ and carefully picking the order of the @P@ and @V@ with respect to the loop condition.
     60
     61\begin{figure}
     62\begin{cfa}
     63Thread.main() {
     64        count := 0
     65        for {
     66                @wait()@
     67                @this.next.wake()@
     68                count ++
     69                if must_stop() { break }
     70        }
     71        global.count += count
     72}
     73\end{cfa}
     74\caption[Cycle Benchmark : Pseudo Code]{Cycle Benchmark : Pseudo Code}
     75\label{fig:cycle:code}
     76\end{figure}
    8077
    8178\subsection{Results}
     79Figure~\ref{fig:cycle:jax} shows the throughput as a function of \proc count, where each run uses 100 cycles per \proc and 5 \ats per cycle.
     80
    8281\begin{figure}
    8382        \subfloat[][Throughput, 100 \ats per \proc]{
     
    106105                \label{fig:cycle:jax:low:ns}
    107106        }
    108         \caption[Cycle Benchmark on Intel]{Cycle Benchmark on Intel\smallskip\newline Throughput as a function of \proc count, using 100 cycles per \proc, 5 \ats per cycle.}
     107        \caption[Cycle Benchmark on Intel]{Cycle Benchmark on Intel\smallskip\newline Throughput as a function of \proc count with 100 cycles per \proc and 5 \ats per cycle.}
    109108        \label{fig:cycle:jax}
    110109\end{figure}
    111 Figure~\ref{fig:cycle:jax} shows the throughput as a function of \proc count, with the following constants:
    112 Each run uses 100 cycles per \proc, 5 \ats per cycle.
    113110
    114111\todo{results discussion}
    115112
    116113\section{Yield}
    117 For completion, I also include the yield benchmark.
    118 This benchmark is much simpler than the cycle tests, it simply creates many \glspl{at} that call \texttt{yield}.
    119 As mentionned in the previous section, this benchmark may be less representative of usages that only make limited use of \texttt{yield}, due to potential shortcuts in the routine.
    120 Its only interesting variable is the number of \glspl{at} per \glspl{proc}, where ratios close to 1 means the ready queue(s) could be empty.
    121 This sometimes puts more strain on the idle sleep handling, compared to scenarios where there is clearly plenty of work to be done.
    122 Figure~\ref{fig:yield:code} shows pseudo code for this benchmark, the ``wait/wake-next'' is simply replaced by a yield.
    123 
    124 \begin{figure}
    125         \begin{lstlisting}
    126                 Thread.main() {
    127                         count := 0
    128                         for {
    129                                 yield()
    130                                 count ++
    131                                 if must_stop() { break }
    132                         }
    133                         global.count += count
    134                 }
    135         \end{lstlisting}
    136         \caption[Yield Benchmark : Pseudo Code]{Yield Benchmark : Pseudo Code}
    137         \label{fig:yield:code}
      114For completeness, the classic yield benchmark is included.
     115This benchmark is simpler than the cycle test: it creates many \glspl{at} that call @yield@.
     116As mentioned, this benchmark may not be representative because of optimization shortcuts in @yield@.
      117The only interesting variable in this benchmark is the number of \glspl{at} per \glspl{proc}, where ratios close to 1 mean the ready queue(s) can be empty.
     118This scenario can put a strain on the idle-sleep handling compared to scenarios where there is plenty of work.
     119Figure~\ref{fig:yield:code} shows pseudo code for this benchmark, where the @wait/next.wake@ is replaced by @yield@.
     120
     121\begin{figure}
     122\begin{cfa}
     123Thread.main() {
     124        count := 0
     125        for {
     126                @yield()@
     127                count ++
     128                if must_stop() { break }
     129        }
     130        global.count += count
     131}
     132\end{cfa}
     133\caption[Yield Benchmark : Pseudo Code]{Yield Benchmark : Pseudo Code}
     134\label{fig:yield:code}
    138135\end{figure}
    139136
    140137\subsection{Results}
     138
     139Figure~\ref{fig:yield:jax} shows the throughput as a function of \proc count, where each run uses 100 \ats per \proc.
     140
    141141\begin{figure}
    142142        \subfloat[][Throughput, 100 \ats per \proc]{
     
    168168        \label{fig:yield:jax}
    169169\end{figure}
    170 Figure~\ref{fig:yield:ops:jax} shows the throughput as a function of \proc count, with the following constants:
    171 Each run uses 100 \ats per \proc.
    172170
    173171\todo{results discussion}
    174172
    175 
    176173\section{Churn}
    177 The Cycle and Yield benchmark represents an ``easy'' scenario for a scheduler, \eg, an embarrassingly parallel application.
    178 In these benchmarks, \glspl{at} can be easily partitioned over the different \glspl{proc} up-front and none of the \glspl{at} communicate with each other.
    179 
    180 The Churn benchmark represents more chaotic usages, where there is no relation between the last \gls{proc} on which a \gls{at} ran and the \gls{proc} that unblocked it.
    181 When a \gls{at} is unblocked from a different \gls{proc} than the one on which it last ran, the unblocking \gls{proc} must either ``steal'' the \gls{at} or place it on a remote queue.
    182 This results can result in either contention on the remote queue or \glspl{rmr} on \gls{at} data structure.
    183 In either case, this benchmark aims to highlight how each scheduler handles these cases, since both cases can lead to performance degradation if they are not handled correctly.
    184 
    185 To achieve this the benchmark uses a fixed size array of semaphores.
    186 Each \gls{at} picks a random semaphore, \texttt{V}s it to unblock a \at waiting and then \texttt{P}s on the semaphore.
      174The Cycle and Yield benchmarks represent an \emph{easy} scenario for a scheduler, \eg an embarrassingly parallel application.
     175In these benchmarks, \glspl{at} can be easily partitioned over the different \glspl{proc} upfront and none of the \glspl{at} communicate with each other.
     176
      177The Churn benchmark represents more chaotic execution, where there is no relation between the last \gls{proc} on which a \gls{at} ran and blocked, and the \gls{proc} that subsequently unblocks it.
      178With processor-specific ready-queues, when a \gls{at} is unblocked by a different \gls{proc}, the unblocking \gls{proc} must either ``steal'' the \gls{at} from another processor or find it on a global queue.
      179This dequeuing results in contention on the remote queue and/or \glspl{rmr} on the \gls{at} data structure.
     180In either case, this benchmark aims to highlight how each scheduler handles these cases, since both cases can lead to performance degradation if not handled correctly.
     181
     182This benchmark uses a fixed-size array of counting semaphores.
     183Each \gls{at} picks a random semaphore, @V@s it to unblock any \at waiting, and then @P@s on the semaphore.
    187184This creates a flow where \glspl{at} push each other out of the semaphores before being pushed out themselves.
    188 For this benchmark to work however, the number of \glspl{at} must be equal or greater to the number of semaphores plus the number of \glspl{proc}.
    189 Note that the nature of these semaphores mean the counter can go beyond 1, which could lead to calls to \texttt{P} not blocking.
      185For this benchmark to work, the number of \glspl{at} must be equal to or greater than the number of semaphores plus the number of \glspl{proc}.
      186Note, the nature of these semaphores means the counter can go beyond 1, which can lead to nonblocking calls to @P@.
     187Figure~\ref{fig:churn:code} shows pseudo code for this benchmark, where the @yield@ is replaced by @V@ and @P@.
     188
     189\begin{figure}
     190\begin{cfa}
     191Thread.main() {
     192        count := 0
     193        for {
     194                r := random() % len(spots)
     195                @spots[r].V()@
     196                @spots[r].P()@
     197                count ++
     198                if must_stop() { break }
     199        }
     200        global.count += count
     201}
     202\end{cfa}
     203\caption[Churn Benchmark : Pseudo Code]{Churn Benchmark : Pseudo Code}
     204\label{fig:churn:code}
     205\end{figure}
     206
     207\subsection{Results}
      208Figure~\ref{fig:churn:jax} shows the throughput as a function of \proc count, using 100 and 1 \ats per \proc.
     209
     210\begin{figure}
     211        \subfloat[][Throughput, 100 \ats per \proc]{
     212                \resizebox{0.5\linewidth}{!}{
     213                        \input{result.churn.jax.ops.pstex_t}
     214                }
     215                \label{fig:churn:jax:ops}
     216        }
      217        \subfloat[][Throughput, 1 \at per \proc]{
     218                \resizebox{0.5\linewidth}{!}{
     219                        \input{result.churn.low.jax.ops.pstex_t}
     220                }
     221                \label{fig:churn:jax:low:ops}
     222        }
     223
     224        \subfloat[][Latency, 100 \ats per \proc]{
     225                \resizebox{0.5\linewidth}{!}{
     226                        \input{result.churn.jax.ns.pstex_t}
     227                }
     228
     229        }
      230        \subfloat[][Latency, 1 \at per \proc]{
     231                \resizebox{0.5\linewidth}{!}{
     232                        \input{result.churn.low.jax.ns.pstex_t}
     233                }
     234                \label{fig:churn:jax:low:ns}
     235        }
      236        \caption[Churn Benchmark on Intel]{\centering Churn Benchmark on Intel\smallskip\newline Throughput and latency of the Churn benchmark on the Intel machine.
      237        Throughput is the total operations per second across all cores. Latency is the duration of each operation.}
     238        \label{fig:churn:jax}
     239\end{figure}
     240
     241\todo{results discussion}
     242
     243\section{Locality}
    190244
    191245\todo{code, setup, results}
    192 \begin{lstlisting}
    193         Thread.main() {
    194                 count := 0
    195                 for {
    196                         r := random() % len(spots)
    197                         spots[r].V()
    198                         spots[r].P()
    199                         count ++
    200                         if must_stop() { break }
    201                 }
    202                 global.count += count
    203         }
    204 \end{lstlisting}
    205 
    206 \begin{figure}
    207         \subfloat[][Throughput, 100 \ats per \proc]{
    208                 \resizebox{0.5\linewidth}{!}{
    209                         \input{result.churn.jax.ops.pstex_t}
    210                 }
    211                 \label{fig:churn:jax:ops}
    212         }
    213         \subfloat[][Throughput, 1 \ats per \proc]{
    214                 \resizebox{0.5\linewidth}{!}{
    215                         \input{result.churn.low.jax.ops.pstex_t}
    216                 }
    217                 \label{fig:churn:jax:low:ops}
    218         }
    219 
    220         \subfloat[][Latency, 100 \ats per \proc]{
    221                 \resizebox{0.5\linewidth}{!}{
    222                         \input{result.churn.jax.ns.pstex_t}
    223                 }
    224 
    225         }
    226         \subfloat[][Latency, 1 \ats per \proc]{
    227                 \resizebox{0.5\linewidth}{!}{
    228                         \input{result.churn.low.jax.ns.pstex_t}
    229                 }
    230                 \label{fig:churn:jax:low:ns}
    231         }
    232         \caption[Churn Benchmark on Intel]{\centering Churn Benchmark on Intel\smallskip\newline Throughput and latency of the Churn on the benchmark on the Intel machine. Throughput is the total operation per second across all cores. Latency is the duration of each opeartion.}
    233         \label{fig:churn:jax}
    234 \end{figure}
    235 
    236 \section{Locality}
    237 
    238 \todo{code, setup, results}
    239246
    240247\section{Transfer}
    241 The last benchmark is more exactly characterize as an experiment than a benchmark.
    242 It tests the behavior of the schedulers for a particularly misbehaved workload.
     248The last benchmark is more of an experiment than a benchmark.
     249It tests the behaviour of the schedulers for a misbehaved workload.
     243250In this workload, one of the \glspl{at} is selected at random to be the leader.
    244251The leader then spins in a tight loop until it has observed that all other \glspl{at} have acknowledged its leadership.
    245252The leader \gls{at} then picks a new \gls{at} to be the ``spinner'' and the cycle repeats.
    246 
    247 The benchmark comes in two flavours for the behavior of the non-leader \glspl{at}:
    248 once they acknowledged the leader, they either block on a semaphore or yield repeatadly.
    249 
    250 This experiment is designed to evaluate the short term load balancing of the scheduler.
    251 Indeed, schedulers where the runnable \glspl{at} are partitioned on the \glspl{proc} may need to balance the \glspl{at} for this experient to terminate.
    252 This is because the spinning \gls{at} is effectively preventing the \gls{proc} from runnning any other \glspl{thrd}.
    253 In the semaphore flavour, the number of runnable \glspl{at} will eventually dwindle down to only the leader.
    254 This is a simpler case to handle for schedulers since \glspl{proc} eventually run out of work.
     253The benchmark comes in two flavours for the non-leader \glspl{at}:
      254once they acknowledge the leader, they either block on a semaphore or spin yielding.
     255
     256The experiment is designed to evaluate the short-term load-balancing of a scheduler.
     257Indeed, schedulers where the runnable \glspl{at} are partitioned on the \glspl{proc} may need to balance the \glspl{at} for this experiment to terminate.
     258This problem occurs because the spinning \gls{at} is effectively preventing the \gls{proc} from running any other \glspl{thrd}.
     259In the semaphore flavour, the number of runnable \glspl{at} eventually dwindles down to only the leader.
     260This scenario is a simpler case to handle for schedulers since \glspl{proc} eventually run out of work.
    255261In the yielding flavour, the number of runnable \glspl{at} stays constant.
    256 This is a harder case to handle because corrective measures must be taken even if work is still available.
    257 Note that languages that have mandatory preemption do circumvent this problem by forcing the spinner to yield.
     262This scenario is a harder case to handle because corrective measures must be taken even when work is available.
     263Note, runtime systems with preemption circumvent this problem by forcing the spinner to yield.
    258264
    259265\todo{code, setup, results}
    260 \begin{lstlisting}
    261         Thread.lead() {
    262                 this.idx_seen = ++lead_idx
    263                 if lead_idx > stop_idx {
    264                         done := true
    265                         return
    266                 }
    267 
    268                 // Wait for everyone to acknowledge my leadership
    269                 start: = timeNow()
     266
     267\begin{figure}
     268\begin{cfa}
     269Thread.lead() {
     270        this.idx_seen = ++lead_idx
     271        if lead_idx > stop_idx {
     272                done := true
     273                return
     274        }
     275
     276        // Wait for everyone to acknowledge my leadership
      277        start := timeNow()
     278        for t in threads {
     279                while t.idx_seen != lead_idx {
     280                        asm pause
     281                        if (timeNow() - start) > 5 seconds { error() }
     282                }
     283        }
     284
     285        // pick next leader
     286        leader := threads[ prng() % len(threads) ]
     287
      288        // wake everyone
     289        if ! exhaust {
    270290                for t in threads {
    271                         while t.idx_seen != lead_idx {
    272                                 asm pause
    273                                 if (timeNow() - start) > 5 seconds { error() }
    274                         }
    275                 }
    276 
    277                 // pick next leader
    278                 leader := threads[ prng() % len(threads) ]
    279 
    280                 // wake every one
    281                 if !exhaust {
    282                         for t in threads {
    283                                 if t != me { t.wake() }
    284                         }
    285                 }
    286         }
    287 
    288         Thread.wait() {
    289                 this.idx_seen := lead_idx
    290                 if exhaust { wait() }
    291                 else { yield() }
    292         }
    293 
    294         Thread.main() {
    295                 while !done  {
    296                         if leader == me { this.lead() }
    297                         else { this.wait() }
    298                 }
    299         }
    300 \end{lstlisting}
     291                        if t != me { t.wake() }
     292                }
     293        }
     294}
     295
     296Thread.wait() {
     297        this.idx_seen := lead_idx
     298        if exhaust { wait() }
     299        else { yield() }
     300}
     301
     302Thread.main() {
     303        while !done  {
     304                if leader == me { this.lead() }
     305                else { this.wait() }
     306        }
     307}
     308\end{cfa}
      309\caption[Transfer Benchmark: Pseudo Code]{Transfer Benchmark: Pseudo Code}
     310\label{fig:transfer:code}
     311\end{figure}
     312
     313\subsection{Results}
     314Figure~\ref{fig:transfer:jax} shows the throughput as a function of \proc count, where each run uses 100 cycles per \proc and 5 \ats per cycle.
     315
     316\todo{results discussion}
  • doc/theses/thierry_delisle_PhD/thesis/text/existing.tex

    r9e23b446 rffec1bf  
    11\chapter{Previous Work}\label{existing}
    2 Scheduling is the process of assigning resources to incomming requests.
    3 A very common form of this is assigning available workers to work-requests.
    4 The need for scheduling is very common in Computer Science, \eg Operating Systems and Hypervisors schedule available CPUs, NICs schedule available bamdwith, but scheduling is also common in other fields.
    5 For example, in assmebly lines assigning parts in need of assembly to line workers is a form of scheduling.
    6 
    7 In all these cases, the choice of a scheduling algorithm generally depends first and formost on how much information is available to the scheduler.
    8 Workloads that are well-kown, consistent and homegenous can benefit from a scheduler that is optimized to use this information while ill-defined inconsistent heterogenous workloads will require general algorithms.
    9 A secondary aspect to that is how much information can be gathered versus how much information must be given as part of the input.
    10 There is therefore a spectrum of scheduling algorithms, going from static schedulers that are well informed from the start, to schedulers that gather most of the information needed, to schedulers that can only rely on very limitted information.
    11 Note that this description includes both infomation about each requests, \eg time to complete or resources needed, and information about the relationships between request, \eg whether or not some request must be completed before another request starts.
    12 
    13 Scheduling physical resources, for example in assembly lines, is generally amenable to using very well informed scheduling since information can be gathered much faster than the physical resources can be assigned and workloads are likely to stay stable for long periods of time.
     2As stated, scheduling is the process of assigning resources to incoming requests, where the common example is assigning available workers to work requests or vice versa.
      3Common scheduling examples in Computer Science are: operating systems and hypervisors schedule available CPUs, NICs schedule available bandwidth, virtual memory and memory allocators schedule available storage, \etc.
     4Scheduling is also common in most other fields, \eg in assembly lines, assigning parts to line workers is a form of scheduling.
     5
     6In general, \emph{selecting} a scheduling algorithm depends on how much information is available to the scheduler.
     7Workloads that are well-known, consistent, and homogeneous can benefit from a scheduler that is optimized to use this information, while ill-defined, inconsistent, heterogeneous workloads require general non-optimal algorithms.
     8A secondary aspect is how much information can be gathered versus how much information must be given as part of the scheduler input.
     9This information adds to the spectrum of scheduling algorithms, going from static schedulers that are well informed from the start, to schedulers that gather most of the information needed, to schedulers that can only rely on very limited information.
      10Note, this description includes both information about each request, \eg time to complete or resources needed, and information about the relationships among requests, \eg whether or not some request must be completed before another request starts.
     11
     12Scheduling physical resources, \eg in an assembly line, is generally amenable to using well-informed scheduling, since information can be gathered much faster than the physical resources can be assigned and workloads are likely to stay stable for long periods of time.
    1413When a faster pace is needed and changes are much more frequent, gathering information on workloads, up-front or live, can become much more limiting, and more general schedulers are needed.
    1514
    1615\section{Naming Convention}
    17 Scheduling has been studied by various different communities concentrating on different incarnation of the same problems. As a result, their is no real naming convention for scheduling that is respected across these communities. For this document, I will use the term \newterm{\Gls{at}} to refer to the abstract objects being scheduled and the term \newterm{\Gls{proc}} to refer to the objects which will execute these \glspl{at}.
      16Scheduling has been studied by various communities concentrating on different incarnations of the same problems.
      17As a result, there is no standard naming convention for scheduling that is respected across these communities.
     18This document uses the term \newterm{\Gls{at}} to refer to the abstract objects being scheduled and the term \newterm{\Gls{proc}} to refer to the concrete objects executing these \ats.
    1819
    1920\section{Static Scheduling}
    20 Static schedulers require that \glspl{at} have their dependencies and costs explicitly and exhaustively specified prior schedule.
    21 The scheduler then processes this input ahead of time and producess a \newterm{schedule} to which the system can later adhere.
    22 This approach is generally popular in real-time systems since the need for strong guarantees justifies the cost of supplying this information.
    23 In general, static schedulers are less relavant to this project since they require input from the programmers that \CFA does not have as part of its concurrency semantic.
    24 Specifying this information explicitly can add a significant burden on the programmers and reduces flexibility, for this reason the \CFA scheduler does not require this information.
    25 
     21\newterm{Static schedulers} require \ats dependencies and costs be explicitly and exhaustively specified prior to scheduling.
     22The scheduler then processes this input ahead of time and produces a \newterm{schedule} the system follows during execution.
     23This approach is popular in real-time systems since the need for strong guarantees justifies the cost of determining and supplying this information.
      24In general, static schedulers are less relevant to this project because they require input from the programmers that the programming language does not have as part of its concurrency semantics.
     25Specifying this information explicitly adds a significant burden to the programmer and reduces flexibility.
     26For this reason, the \CFA scheduler does not require this information.
    2627
    2728\section{Dynamic Scheduling}
    28 It may be difficult to fulfill the requirements of static scheduler if dependencies are conditionnal. In this case, it may be preferable to detect dependencies at runtime. This detection effectively takes the form of adding one or more new \gls{at}(s) to the system as their dependencies are resolved. As well as potentially halting or suspending a \gls{at} that dynamically detect unfulfilled dependencies. Each \gls{at} has the responsability of adding the dependent \glspl{at} back in the system once completed. As a consequence, the scheduler may have an incomplete view of the system, seeing only \glspl{at} we no pending dependencies. Schedulers that support this detection at runtime are referred to as \newterm{Dynamic Schedulers}.
     29\newterm{Dynamic schedulers} determine \ats dependencies and costs during scheduling, if at all.
     30Hence, unlike static scheduling, \ats dependencies are conditional and detected at runtime.
      31This detection takes the form of observing new \ats in the system and determining dependencies from their behaviour, including suspending or halting a \ats that dynamically detects unfulfilled dependencies.
     32Furthermore, each \ats has the responsibility of adding dependent \ats back into the system once dependencies are fulfilled.
     33As a consequence, the scheduler often has an incomplete view of the system, seeing only \ats with no pending dependencies.
    2934
    3035\subsection{Explicitly Informed Dynamic Schedulers}
    31 While dynamic schedulers do not have access to an exhaustive list of dependencies for a \gls{at}, they may require to provide more or less information about each \gls{at}, including for example: expected duration, required ressources, relative importance, etc. The scheduler can then use this information to direct the scheduling decisions. \cit{Examples of schedulers with more information} Precisely providing this information can be difficult for programmers, especially \emph{predicted} behaviour, and the scheduler may need to support some amount of imprecision in the provided information. For example, specifying that a \glspl{at} takes approximately 5 seconds to complete, rather than exactly 5 seconds. User provided information can also become a significant burden depending how the effort to provide the information scales with the number of \glspl{at} and there complexity. For example, providing an exhaustive list of files read by 5 \glspl{at} is an easier requirement the providing an exhaustive list of memory addresses accessed by 10'000 distinct \glspl{at}.
    32 
    33 Since the goal of this thesis is to provide a scheduler as a replacement for \CFA's existing \emph{uninformed} scheduler, Explicitly Informed schedulers are less relevant to this project. Nevertheless, some strategies are worth mentionnding.
    34 
    35 \subsubsection{Prority Scheduling}
    36 A commonly used information that schedulers used to direct the algorithm is priorities. Each Task is given a priority and higher-priority \glspl{at} are preferred to lower-priority ones. The simplest priority scheduling algorithm is to simply require that every \gls{at} have a distinct pre-established priority and always run the available \gls{at} with the highest priority. Asking programmers to provide an exhaustive set of unique priorities can be prohibitive when the system has a large number of \glspl{at}. It can therefore be diserable for schedulers to support \glspl{at} with identical priorities and/or automatically setting and adjusting priorites for \glspl{at}. The most common operating some variation on priorities with overlaps and dynamic priority adjustments. For example, Microsoft Windows uses a pair of priorities
     36While dynamic schedulers may not have an exhaustive list of dependencies for a \ats, some information may be available about each \ats, \eg expected duration, required resources, relative importance, \etc.
     37When available, a scheduler can then use this information to direct the scheduling decisions. \cit{Examples of schedulers with more information}
     38However, most programmers do not determine or even \emph{predict} this information;
     39at best, the scheduler has only some imprecise information provided by the programmer, \eg, indicating a \ats takes approximately 3--7 seconds to complete, rather than exactly 5 seconds.
      40Providing this kind of information is a significant programmer burden, especially if the effort to provide the information scales with the number of \ats and their complexity.
      41For example, providing an exhaustive list of files read by 5 \ats is an easier requirement than providing an exhaustive list of memory addresses accessed by 10,000 independent \ats.
     42
     43Since the goal of this thesis is to provide a scheduler as a replacement for \CFA's existing \emph{uninformed} scheduler, explicitly informed schedulers are less relevant to this project. Nevertheless, some strategies are worth mentioning.
     44
     45\subsubsection{Priority Scheduling}
      46A common form of information used by schedulers to direct their algorithm is priorities.
     47Each \ats is given a priority and higher-priority \ats are preferred to lower-priority ones.
     48The simplest priority scheduling algorithm is to require that every \ats have a distinct pre-established priority and always run the available \ats with the highest priority.
     49Asking programmers to provide an exhaustive set of unique priorities can be prohibitive when the system has a large number of \ats.
     50It can therefore be desirable for schedulers to support \ats with identical priorities and/or automatically setting and adjusting priorities for \ats.
     51Most common operating systems use some variant on priorities with overlaps and dynamic priority adjustments.
     52For example, Microsoft Windows uses a pair of priorities
    3753\cit{https://docs.microsoft.com/en-us/windows/win32/procthread/scheduling-priorities,https://docs.microsoft.com/en-us/windows/win32/taskschd/taskschedulerschema-priority-settingstype-element}, one specified by users out of ten possible options and one adjusted by the system.
    3854
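For concreteness, fixed-priority selection reduces to scanning an array of per-priority queues; the following C sketch is illustrative only, and the names (@ready@, @pick_next@) are invented rather than taken from any scheduler cited above.
\begin{cfa}
#define NPRIO 32
struct task { struct task * next; };
static struct task * ready[NPRIO];      // one FIFO list per priority, 0 = highest

// Always run the available task with the highest priority (lowest index).
struct task * pick_next( void ) {
	for ( int p = 0; p < NPRIO; p += 1 ) {
		if ( ready[p] != NULL ) {       // non-empty priority level?
			struct task * t = ready[p];
			ready[p] = t->next;         // dequeue the head
			return t;
		}
	}
	return NULL;                        // no ready task => idle
}
\end{cfa}
Production schedulers avoid the linear scan, \eg with a bitmap of non-empty levels, but the selection rule is the same.
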
    3955\subsection{Uninformed and Self-Informed Dynamic Schedulers}
    40 Several scheduling algorithms do not require programmers to provide additionnal information on each \gls{at}, and instead make scheduling decisions based solely on internal state and/or information implicitly gathered by the scheduler.
     56Several scheduling algorithms do not require programmers to provide additional information on each \ats, and instead make scheduling decisions based solely on internal state and/or information implicitly gathered by the scheduler.
    4157
    4258
    4359\subsubsection{Feedback Scheduling}
    44 As mentionned, Schedulers may also gather information about each \glspl{at} to direct their decisions. This design effectively moves the scheduler to some extent into the realm of \newterm{Control Theory}\cite{wiki:controltheory}. This gathering does not generally involve programmers and as such does not increase programmer burden the same way explicitly provided information may. However, some feedback schedulers do offer the option to programmers to offer additionnal information on certain \glspl{at}, in order to direct scheduling decision. The important distinction being whether or not the scheduler can function without this additionnal information.
     60As mentioned, schedulers may also gather information about each \ats to direct their decisions.
     61This design effectively moves the scheduler into the realm of \newterm{Control Theory}~\cite{wiki:controltheory}.
     62This information gathering does not generally involve programmers, and as such, does not increase programmer burden the same way explicitly provided information may.
     63However, some feedback schedulers do allow programmers to offer additional information on certain \ats, in order to direct scheduling decisions.
      64The important distinction is whether or not the scheduler can function without this additional information.
    4565
    4666
    4767\section{Work Stealing}\label{existing:workstealing}
    48 One of the most popular scheduling algorithm in practice (see~\ref{existing:prod}) is work-stealing. This idea, introduce by \cite{DBLP:conf/fpca/BurtonS81}, effectively has each worker work on its local \glspl{at} first, but allows the possibility for other workers to steal local \glspl{at} if they run out of \glspl{at}. \cite{DBLP:conf/focs/Blumofe94} introduced the more familiar incarnation of this, where each workers has queue of \glspl{at} to accomplish and workers without \glspl{at} steal \glspl{at} from random workers. (The Burton and Sleep algorithm had trees of \glspl{at} and stole only among neighbours). Blumofe and Leiserson also prove worst case space and time requirements for well-structured computations.
    49 
    50 Many variations of this algorithm have been proposed over the years\cite{DBLP:journals/ijpp/YangH18}, both optmizations of existing implementations and approaches that account for new metrics.
    51 
    52 \paragraph{Granularity} A significant portion of early Work Stealing research was concentrating on \newterm{Implicit Parellelism}\cite{wiki:implicitpar}. Since the system was responsible to split the work, granularity is a challenge that cannot be left to the programmers (as opposed to \newterm{Explicit Parellelism}\cite{wiki:explicitpar} where the burden can be left to programmers). In general, fine granularity is better for load balancing and coarse granularity reduces communication overhead. The best performance generally means finding a middle ground between the two. Several methods can be employed, but I believe these are less relevant for threads, which are generally explicit and more coarse grained.
    53 
    54 \paragraph{Task Placement} Since modern computers rely heavily on cache hierarchies\cit{Do I need a citation for this}, migrating \glspl{at} from one core to another can be .  \cite{DBLP:journals/tpds/SquillanteL93}
      68One of the most popular scheduling algorithms in practice (see~\ref{existing:prod}) is work stealing.
      69This idea, introduced by \cite{DBLP:conf/fpca/BurtonS81}, effectively has each worker process its local \ats first, but allows the possibility for other workers to steal local \ats if they run out of \ats.
      70\cite{DBLP:conf/focs/Blumofe94} introduced the more familiar incarnation of this, where each worker has a queue of \ats and workers without \ats steal \ats from random workers\footnote{The Burton and Sleep algorithm had trees of \ats and stole only among neighbours.}.
      71Blumofe and Leiserson also prove worst-case space and time requirements for well-structured computations.
     72
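The worker loop underlying this family of algorithms can be sketched as follows; the deque type and the @pop_bottom@/@steal_top@ helpers are assumed, in the style of Blumofe and Leiserson, where the owner and thieves operate on opposite ends of each deque.
\begin{cfa}
#include <stdlib.h>

struct task;
struct deque;                           // per-worker double-ended queue (assumed)
extern struct task * pop_bottom( struct deque * );  // owner end
extern struct task * steal_top( struct deque * );   // thief end
extern void run( struct task * );
extern struct deque deques[];

void worker( int me, int nworkers ) {
	for ( ;; ) {
		struct task * t = pop_bottom( &deques[me] );   // local work first
		if ( t == NULL && nworkers > 1 ) {             // out of local work => steal
			int victim = rand() % nworkers;            // random victim
			if ( victim != me ) t = steal_top( &deques[victim] );
		}
		if ( t != NULL ) run( t );  // execute, possibly pushing new local work
	}
}
\end{cfa}
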
     73Many variations of this algorithm have been proposed over the years~\cite{DBLP:journals/ijpp/YangH18}, both optimizations of existing implementations and approaches that account for new metrics.
     74
     75\paragraph{Granularity} A significant portion of early work-stealing research concentrated on \newterm{Implicit Parallelism}~\cite{wiki:implicitpar}.
      76Since the system is responsible for splitting the work, granularity is a challenge that cannot be left to programmers, as opposed to \newterm{Explicit Parallelism}~\cite{wiki:explicitpar} where the burden can be left to programmers.
     77In general, fine granularity is better for load balancing and coarse granularity reduces communication overhead.
     78The best performance generally means finding a middle ground between the two.
     79Several methods can be employed, but I believe these are less relevant for threads, which are generally explicit and more coarse grained.
     80
      81\paragraph{Task Placement} Since modern computers rely heavily on cache hierarchies\cit{Do I need a citation for this}, migrating \ats from one core to another can be costly, because the migrated \ats loses its cached working set.  \cite{DBLP:journals/tpds/SquillanteL93}
    5582
    5683\todo{The survey is not great on this subject}
    5784
    58 \paragraph{Complex Machine Architecture} Another aspect that has been looked at is how well Work Stealing is applicable to different machine architectures.
     85\paragraph{Complex Machine Architecture} Another aspect that has been examined is how well work stealing is applicable to different machine architectures.
    5986
    6087\subsection{Theoretical Results}
    61 There is also a large body of research on the theoretical aspects of work stealing. These evaluate, for example, the cost of migration\cite{DBLP:conf/sigmetrics/SquillanteN91,DBLP:journals/pe/EagerLZ86}, how affinity affects performance\cite{DBLP:journals/tpds/SquillanteL93,DBLP:journals/mst/AcarBB02,DBLP:journals/ipl/SuksompongLS16} and theoretical models for heterogenous systems\cite{DBLP:journals/jpdc/MirchandaneyTS90,DBLP:journals/mst/BenderR02,DBLP:conf/sigmetrics/GastG10}. \cite{DBLP:journals/jacm/BlellochGM99} examine the space bounds of Work Stealing and \cite{DBLP:journals/siamcomp/BerenbrinkFG03} show that for underloaded systems, the scheduler will complete computations in finite time, \ie is \newterm{stable}. Others show that Work-Stealing is applicable to various scheduling contexts\cite{DBLP:journals/mst/AroraBP01,DBLP:journals/anor/TchiboukdjianGT13,DBLP:conf/isaac/TchiboukdjianGTRB10,DBLP:conf/ppopp/AgrawalLS10,DBLP:conf/spaa/AgrawalFLSSU14}. \cite{DBLP:conf/ipps/ColeR13} also studied how Randomized Work Stealing affects false sharing among \glspl{at}.
    62 
    63 However, as \cite{DBLP:journals/ijpp/YangH18} highlights, it is worth mentionning that this theoretical research has mainly focused on ``fully-strict'' computations, \ie workloads that can be fully represented with a Direct Acyclic Graph. It is unclear how well these distributions represent workloads in real world scenarios.
     88There is also a large body of research on the theoretical aspects of work stealing. These evaluate, for example, the cost of migration~\cite{DBLP:conf/sigmetrics/SquillanteN91,DBLP:journals/pe/EagerLZ86}, how affinity affects performance~\cite{DBLP:journals/tpds/SquillanteL93,DBLP:journals/mst/AcarBB02,DBLP:journals/ipl/SuksompongLS16} and theoretical models for heterogeneous systems~\cite{DBLP:journals/jpdc/MirchandaneyTS90,DBLP:journals/mst/BenderR02,DBLP:conf/sigmetrics/GastG10}.
     89\cite{DBLP:journals/jacm/BlellochGM99} examines the space bounds of work stealing and \cite{DBLP:journals/siamcomp/BerenbrinkFG03} shows that for under-loaded systems, the scheduler completes its computations in finite time, \ie is \newterm{stable}.
     90Others show that work stealing is applicable to various scheduling contexts~\cite{DBLP:journals/mst/AroraBP01,DBLP:journals/anor/TchiboukdjianGT13,DBLP:conf/isaac/TchiboukdjianGTRB10,DBLP:conf/ppopp/AgrawalLS10,DBLP:conf/spaa/AgrawalFLSSU14}.
     91\cite{DBLP:conf/ipps/ColeR13} also studied how randomized work-stealing affects false sharing among \ats.
     92
      93However, as \cite{DBLP:journals/ijpp/YangH18} highlights, it is worth mentioning that this theoretical research has mainly focused on ``fully-strict'' computations, \ie workloads that can be fully represented with a directed acyclic graph.
      94It is unclear how well these distributions represent workloads in real-world scenarios.
    6495
    6596\section{Preemption}
    66 One last aspect of scheduling worth mentionning is preemption since many schedulers rely on it for some of their guarantees. Preemption is the idea of interrupting \glspl{at} that have been running for too long, effectively injecting suspend points in the applications. There are multiple techniques to achieve this but they all aim to have the effect of guaranteeing that suspend points in a \gls{at} are never further apart than some fixed duration. While this helps schedulers guarantee that no \glspl{at} will unfairly monopolize a worker, preemption can effectively added to any scheduler. Therefore, the only interesting aspect of preemption for the design of scheduling is whether or not to require it.
    67 
    68 \section{Schedulers in Production}\label{existing:prod}
    69 This section will show a quick overview of several schedulers which are generally available a the time of writing. While these schedulers don't necessarily represent to most recent advances in scheduling, they are what is generally accessible to programmers. As such, I believe that these schedulers are at least as relevant as those presented in published work. I chose both schedulers that operating in kernel space and in user space, as both can offer relevant insight for this project. However, I did not list any schedulers aimed for real-time applications, as these have constraints that are much stricter than what is needed for this project.
     97One last aspect of scheduling is preemption since many schedulers rely on it for some of their guarantees.
     98Preemption is the idea of interrupting \ats that have been running too long, effectively injecting suspend points into the application.
     99There are multiple techniques to achieve this effect but they all aim to guarantee that the suspend points in a \ats are never further apart than some fixed duration.
     100While this helps schedulers guarantee that no \ats unfairly monopolizes a worker, preemption can effectively be added to any scheduler.
     101Therefore, the only interesting aspect of preemption for the design of scheduling is whether or not to require it.
     102
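For illustration only, one common user-space technique on POSIX systems arms an interval timer whose signal handler raises a flag that the runtime checks at its next opportunity; this hedged sketch is not the mechanism of any particular runtime discussed below.
\begin{cfa}
#include <signal.h>
#include <sys/time.h>

static volatile sig_atomic_t preempt_pending = 0;

static void on_alarm( int sig ) {
	preempt_pending = 1;    // ask the running task to yield at its next check
}

void start_preemption( long period_usec ) {
	struct sigaction sa = { .sa_handler = on_alarm };
	sigaction( SIGALRM, &sa, NULL );
	// it_interval = recurring period, it_value = first expiry
	struct itimerval itv = { { 0, period_usec }, { 0, period_usec } };
	setitimer( ITIMER_REAL, &itv, NULL );
}
\end{cfa}
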
     103\section{Production Schedulers}\label{existing:prod}
     104This section presents a quick overview of several current schedulers.
     105While these schedulers do not necessarily represent the most recent advances in scheduling, they are what is generally accessible to programmers.
     106As such, I believe these schedulers are at least as relevant as those presented in published work.
     107Schedulers that operate in kernel space and in user space are considered, as both can offer relevant insight for this project.
     108However, real-time schedulers are not considered, as these have constraints that are much stricter than what is needed for this project.
    70109
    71110\subsection{Operating System Schedulers}
    72 Operating System Schedulers tend to be fairly complex schedulers, they generally support some amount of real-time, aim to balance interactive and non-interactive \glspl{at} and support for multiple users sharing hardware without requiring these users to cooperate. Here are more details on a few schedulers used in the common operating systems: Linux, FreeBsd, Microsoft Windows and Apple's OS X. The information is less complete for operating systems behind closed source.
      111Operating System Schedulers tend to be fairly complex as they generally support some amount of real-time, aim to balance interactive and non-interactive \ats, and support multiple users sharing hardware without requiring these users to cooperate.
     112Here are more details on a few schedulers used in the common operating systems: Linux, FreeBSD, Microsoft Windows and Apple's OS X.
     113The information is less complete for operating systems with closed source.
    73114
    74115\paragraph{Linux's CFS}
    75 The default scheduler used by Linux (the Completely Fair Scheduler)\cite{MAN:linux/cfs,MAN:linux/cfs2} is a feedback scheduler based on CPU time. For each processor, it constructs a Red-Black tree of \glspl{at} waiting to run, ordering them by amount of CPU time spent. The scheduler schedules the \gls{at} that has spent the least CPU time. It also supports the concept of \newterm{Nice values}, which are effectively multiplicative factors on the CPU time spent. The ordering of \glspl{at} is also impacted by a group based notion of fairness, where \glspl{at} belonging to groups having spent less CPU time are preferred to \glspl{at} beloning to groups having spent more CPU time. Linux achieves load-balancing by regularly monitoring the system state\cite{MAN:linux/cfs/balancing} and using some heuristic on the load (currently CPU time spent in the last millisecond plus decayed version of the previous time slots\cite{MAN:linux/cfs/pelt}.).
    76 
    77 \cite{DBLP:conf/eurosys/LoziLFGQF16} shows that Linux's CFS also does work-stealing to balance the workload of each processors, but the paper argues this aspect can be improved significantly. The issues highlighted sem to stem from Linux's need to support fairness across \glspl{at} \emph{and} across users\footnote{Enforcing fairness across users means, for example, that given two users: one with a single \gls{at} and the other with one thousand \glspl{at}, the user with a single \gls{at} does not receive one one thousandth of the CPU time.}, increasing the complexity.
    78 
    79 Linux also offers a FIFO scheduler, a real-time schedulerwhich runs the highest-priority \gls{at}, and a round-robin scheduler, which is an extension of the fifo-scheduler that adds fixed time slices. \cite{MAN:linux/sched}
     116The default scheduler used by Linux, the Completely Fair Scheduler~\cite{MAN:linux/cfs,MAN:linux/cfs2}, is a feedback scheduler based on CPU time.
     117For each processor, it constructs a Red-Black tree of \ats waiting to run, ordering them by the amount of CPU time used.
     118The \ats that has used the least CPU time is scheduled.
     119It also supports the concept of \newterm{Nice values}, which are effectively multiplicative factors on the CPU time used.
      120The ordering of \ats is also affected by a group-based notion of fairness, where \ats belonging to groups having used less CPU time are preferred to \ats belonging to groups having used more CPU time.
     121Linux achieves load-balancing by regularly monitoring the system state~\cite{MAN:linux/cfs/balancing} and using some heuristic on the load, currently CPU time used in the last millisecond plus a decayed version of the previous time slots~\cite{MAN:linux/cfs/pelt}.
     122
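The core bookkeeping can be sketched as follows; the names and the scaling constant are hypothetical stand-ins for Linux's internals, where this weighted CPU time is called \emph{vruntime} and the tree is ordered on it.
\begin{cfa}
#include <stdint.h>
#define NICE_0_WEIGHT 1024              // illustrative scaling constant

struct entity { uint64_t vruntime; /* red-black tree linkage elided */ };
struct rbtree;                          // tree of entities, ordered by vruntime (assumed)
extern struct entity * rb_leftmost( struct rbtree * );

// Charge the running task for its CPU time, scaled by its nice weight:
// a larger weight makes vruntime grow more slowly, so the task runs more.
void account( struct entity * e, uint64_t delta_ns, uint64_t weight ) {
	e->vruntime += delta_ns * NICE_0_WEIGHT / weight;
}

// Schedule the task that has used the least weighted CPU time,
// i.e., the leftmost node of the red-black tree.
struct entity * pick_next( struct rbtree * rq ) {
	return rb_leftmost( rq );
}
\end{cfa}
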
      123\cite{DBLP:conf/eurosys/LoziLFGQF16} shows that Linux's CFS also does work stealing to balance the workload of each processor, but the paper argues this aspect can be improved significantly.
     124The issues highlighted stem from Linux's need to support fairness across \ats \emph{and} across users\footnote{Enforcing fairness across users means that given two users, one with a single \ats and the other with one thousand \ats, the user with a single \ats does not receive one thousandth of the CPU time.}, increasing the complexity.
     125
      126Linux also offers a FIFO scheduler, a real-time scheduler, which runs the highest-priority \ats, and a round-robin scheduler, which is an extension of the FIFO scheduler that adds fixed time slices~\cite{MAN:linux/sched}.
    80127
    81128\paragraph{FreeBSD}
    82 The ULE scheduler used in FreeBSD\cite{DBLP:conf/bsdcon/Roberson03} is a feedback scheduler similar to Linux's CFS. It uses different data structures and heuristics but also schedules according to some combination of CPU time spent and niceness values. It also periodically balances the load of the system(according to a different heuristic), but uses a simpler Work Stealing approach.
      129The ULE scheduler used in FreeBSD~\cite{DBLP:conf/bsdcon/Roberson03} is a feedback scheduler similar to Linux's CFS.
     130It uses different data structures and heuristics but also schedules according to some combination of CPU time used and niceness values.
     131It also periodically balances the load of the system (according to a different heuristic), but uses a simpler work stealing approach.
    83132
    84133\paragraph{Windows (OS)}
    85 Microsoft's Operating System's Scheduler\cite{MAN:windows/scheduler} is a feedback scheduler with priorities. It supports 32 levels of priorities, some of which are reserved for real-time and prviliged applications. It schedules \glspl{at} based on the highest priorities (lowest number) and how much cpu time each \glspl{at} have used. The scheduler may also temporarily adjust priorities after certain effects like the completion of I/O requests.
      134The Microsoft Windows scheduler~\cite{MAN:windows/scheduler} is a feedback scheduler with priorities.
     135It supports 32 levels of priorities, some of which are reserved for real-time and privileged applications.
     136It schedules \ats based on the highest priorities (lowest number) and how much CPU time each \ats has used.
     137The scheduler may also temporarily adjust priorities after certain effects like the completion of I/O requests.
    86138
    87139\todo{load balancing}
     
    100152
    101153\subsection{User-Level Schedulers}
    102 By comparison, user level schedulers tend to be simpler, gathering fewer metrics and avoid complex notions of fairness. Part of the simplicity is due to the fact that all \glspl{at} have the same user, and therefore cooperation is both feasible and probable.
    103 \paragraph{Go}
    104 Go's scheduler uses a Randomized Work Stealing algorithm that has a global runqueue(\emph{GRQ}) and each processor(\emph{P}) has both a fixed-size runqueue(\emph{LRQ}) and a high-priority next ``chair'' holding a single element.\cite{GITHUB:go,YTUBE:go} Preemption is present, but only at function call boundaries.
      154By comparison, user-level schedulers tend to be simpler, gathering fewer metrics and avoiding complex notions of fairness. Part of the simplicity is due to the fact that all \ats have the same user, and therefore cooperation is both feasible and probable.
     155
     156\paragraph{Go}\label{GoSafePoint}
     157Go's scheduler uses a randomized work-stealing algorithm that has a global run-queue (\emph{GRQ}) and each processor (\emph{P}) has both a fixed-size run-queue (\emph{LRQ}) and a high-priority next ``chair'' holding a single element~\cite{GITHUB:go,YTUBE:go}.
      158Preemption is present, but only at safe-points~\cit{https://go.dev/src/runtime/preempt.go}, which are detection code inserted at various frequent access boundaries.
    105159
    106160The algorithm is as follows (a sketch follows the list):
    107161\begin{enumerate}
    108         \item Once out of 61 times, directly pick 1 element from the \emph{GRQ}.
      162        \item One time out of 61, pick 1 element from the \emph{GRQ}.
    109163        \item If there is an item in the ``chair'' pick it.
    110164        \item Else pick an item from the \emph{LRQ}.
    111         \item If it was empty steal (len(\emph{GRQ}) / \#of\emph{P}) + 1 items (max 256) from the \emph{GRQ}.
    112         \item If it was empty steal \emph{half} the \emph{LRQ} of another \emph{P} chosen randomly.
     165        \begin{itemize}
      166        \item If it is empty, steal (len(\emph{GRQ}) / \#of\emph{P}) + 1 items (max 256) from the \emph{GRQ}.
      167        \item If it is still empty, steal \emph{half} the \emph{LRQ} of another \emph{P} chosen randomly.
     168        \end{itemize}
    113169\end{enumerate}
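
Rendered as a hedged C-like sketch (all helper names are invented; the authoritative logic lives in the Go runtime), the selection order above is roughly:
\begin{cfa}
struct g;                                   // a goroutine (task)
struct p { int tick; struct g * next; };    // a processor with its "chair"
extern struct g * grq_pop( void ), * lrq_pop( struct p * );
extern struct g * grq_grab_batch( struct p * ), * steal_half( struct p * );
extern struct p * random_other_p( struct p * );

struct g * find_runnable( struct p * me ) {
	struct g * g;
	if ( ++me->tick % 61 == 0 && ( g = grq_pop() ) ) return g; // 1: occasionally favour GRQ
	if ( me->next ) { g = me->next; me->next = NULL; return g; } // 2: high-priority "chair"
	if ( ( g = lrq_pop( me ) ) ) return g;                     // 3: local run-queue
	if ( ( g = grq_grab_batch( me ) ) ) return g;              // 4: batch-refill from GRQ
	return steal_half( random_other_p( me ) );                 // 5: steal half a victim's LRQ
}
\end{cfa}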
    114170
    115171\paragraph{Erlang}
    116 Erlang is a functionnal language that supports concurrency in the form of processes, threads that share no data. It seems to be some kind of Round-Robin Scheduler. It currently uses some mix of Work Sharing and Work Stealing to achieve load balancing\cite{:erlang}, where underloaded workers steal from other workers, but overloaded workers also push work to other workers. This migration logic seems to be directed by monitoring logic that evaluates the load a few times per seconds.
     172Erlang is a functional language that supports concurrency in the form of processes: threads that share no data.
     173It uses a kind of round-robin scheduler, with a mix of work sharing and stealing to achieve load balancing~\cite{:erlang}, where under-loaded workers steal from other workers, but overloaded workers also push work to other workers.
      174This migration logic is directed by monitoring logic that evaluates the load a few times per second.
    117175
    118176\paragraph{Intel\textregistered ~Threading Building Blocks}
    119 \newterm{Thread Building Blocks}(TBB) is Intel's task parellelism\cite{wiki:taskparallel} framework. It runs \newterm{jobs}, uninterruptable \glspl{at}, schedulable objects that must always run to completion, on a pool of worker threads. TBB's scheduler is a variation of Randomized Work Stealing that also supports higher-priority graph-like dependencies\cite{MAN:tbb/scheduler}. It schedules \glspl{at} as follows (where \textit{t} is the last \gls{at} completed):
      177\newterm{Thread Building Blocks} (TBB) is Intel's task parallelism~\cite{wiki:taskparallel} framework.
     178It runs \newterm{jobs}, which are uninterruptable \ats that must always run to completion, on a pool of worker threads.
     179TBB's scheduler is a variation of randomized work-stealing that also supports higher-priority graph-like dependencies~\cite{MAN:tbb/scheduler}.
     180It schedules \ats as follows (where \textit{t} is the last \ats completed):
    120181\begin{displayquote}
    121182        \begin{enumerate}
    122                 \item The task returned by \textit{t}\texttt{.execute()}
     183                \item The task returned by \textit{t}@.execute()@
    123184                \item The successor of \textit{t} if \textit{t} was its last completed predecessor.
    124                 \item A task popped from the end of the threads own deque.
     185                \item A task popped from the end of the thread's own deque.
    125186                \item A task with affinity for the thread.
    126187                \item A task popped from approximately the beginning of the shared queue.
    127                 \item A task popped from the beginning of another randomly chosen threads deque.
     188                \item A task popped from the beginning of another randomly chosen thread's deque.
    128189        \end{enumerate}
    129190
     
    134195
    135196\paragraph{Quasar/Project Loom}
    136 Java has two projects that are attempting to introduce lightweight threading into java in the form of Fibers, Quasar\cite{MAN:quasar} and Project Loom\cite{MAN:project-loom}\footnote{It is unclear to me if these are distinct projects or not}. Both projects seem to be based on the \texttt{ForkJoinPool} in Java which appears to be a simple incarnation of Randomized Work Stealing\cite{MAN:java/fork-join}.
     197Java has two projects, Quasar~\cite{MAN:quasar} and Project Loom~\cite{MAN:project-loom}\footnote{It is unclear if these are distinct projects.}, that are attempting to introduce lightweight thread\-ing in the form of Fibers.
     198Both projects seem to be based on the @ForkJoinPool@ in Java, which appears to be a simple incarnation of randomized work-stealing~\cite{MAN:java/fork-join}.
    137199
    138200\paragraph{Grand Central Dispatch}
    139 This is an API produce by Apple\cit{Official GCD source} that offers task parellelism\cite{wiki:taskparallel}. Its distinctive aspect is that it uses multiple ``Dispatch Queues'', some of which are created by programmers. These queues each have their own local ordering guarantees, \eg \glspl{at} on queue $A$ are executed in \emph{FIFO} order.
     201An Apple\cit{Official GCD source} API that offers task parallelism~\cite{wiki:taskparallel}.
     202Its distinctive aspect is multiple ``Dispatch Queues'', some of which are created by programmers.
     203Each queue has its own local ordering guarantees, \eg \ats on queue $A$ are executed in \emph{FIFO} order.
    140204
    141205\todo{load balancing and scheduling}
     
    143207% http://web.archive.org/web/20090920043909/http://images.apple.com/macosx/technology/docs/GrandCentral_TB_brief_20090903.pdf
    144208
    145 In terms of semantics, the Dispatch Queues seem to be very similar in semantics to Intel\textregistered ~TBB \texttt{execute()} and predecessor semantics. Where it would be possible to convert from one to the other.
     209In terms of semantics, the Dispatch Queues seem to be very similar to Intel\textregistered ~TBB @execute()@ and predecessor semantics.
    146210
    147211\paragraph{LibFibre}
    148 LibFibre\cite{DBLP:journals/pomacs/KarstenB20} is a light-weight user-level threading framework developt at the University of Waterloo. Similarly to Go, it uses a variation of Work Stealing with a global queue that is higher priority than stealing. Unlock Go it does not have the high-priority next ``chair'' and does not use Randomized Work Stealing.
    149 
     212LibFibre~\cite{DBLP:journals/pomacs/KarstenB20} is a light-weight user-level threading framework developed at the University of Waterloo.
     213Similarly to Go, it uses a variation of work stealing with a global queue that is higher priority than stealing.
     214Unlike Go, it does not have the high-priority next ``chair'' and does not use randomized work-stealing.
  • doc/theses/thierry_delisle_PhD/thesis/text/intro.tex

    r9e23b446 rffec1bf  
    1 \chapter*{Introduction}\label{intro}
    2 \todo{A proper intro}
     1\chapter{Introduction}\label{intro}
     2\section{\CFA programming language}
    33
    4 The C programming language~\cite{C11}
     4The \CFA programming language~\cite{cfa:frontpage,cfa:typesystem} extends the C programming language by adding modern safety and productivity features, while maintaining backwards compatibility.
     5Among its productivity features, \CFA supports user-level threading~\cite{Delisle21} allowing programmers to write modern concurrent and parallel programs.
      6My previous master's thesis on concurrency in \CFA focused on features and interfaces.
     7This Ph.D.\ thesis focuses on performance, introducing \glsxtrshort{api} changes only when required by performance considerations.
     8Specifically, this work concentrates on scheduling and \glsxtrshort{io}.
      9Prior to this work, the \CFA runtime used a strict \glsxtrshort{fifo} \gls{rQ} and no \glsxtrshort{io} capabilities at the user-thread level\footnote{C supports \glsxtrshort{io} capabilities at the kernel level, which means blocking operations block kernel threads, where blocking user-level threads would be more appropriate for \CFA.}.
    510
    6 The \CFA programming language~\cite{cfa:frontpage,cfa:typesystem} extends the C programming language by adding modern safety and productivity features, while maintaining backwards compatibility. Among its productivity features, \CFA supports user-level threading~\cite{Delisle21} allowing programmers to write modern concurrent and parallel programs.
    7 My previous master's thesis on concurrent in \CFA focused on features and interfaces.
    8 This Ph.D.\ thesis focuses on performance, introducing \glsxtrshort{api} changes only when required by performance considerations. Specifically, this work concentrates on scheduling and \glsxtrshort{io}. Prior to this work, the \CFA runtime used a strict \glsxtrshort{fifo} \gls{rQ} and  no non-blocking I/O capabilities at the user-thread level.
     11As a research project, this work builds exclusively on newer versions of the Linux operating-system and gcc/clang compilers.
     12While \CFA is released, supporting older versions of Linux ($<$~Ubuntu 16.04) and gcc/clang compilers ($<$~gcc 6.0) is not a goal of this work.
    913
    10 As a research project, this work builds exclusively on newer versions of the Linux operating-system and gcc/clang compilers. While \CFA is released, supporting older versions of Linux ($<$~Ubuntu 16.04) and gcc/clang compilers ($<$~gcc 6.0) is not a goal of this work.
     14\section{Scheduling}
      15Computer systems share multiple resources across many threads of execution, even on single-user computers like laptops or smartphones.
     16On a computer system with multiple processors and work units, there exists the problem of mapping work onto processors in an efficient manner, called \newterm{scheduling}.
     17These systems are normally \newterm{open}, meaning new work arrives from an external source or is spawned from an existing work unit.
     18On a computer system, the scheduler takes a sequence of work requests in the form of threads and attempts to complete the work, subject to performance objectives, such as resource utilization.
     19A general-purpose dynamic-scheduler for an open system cannot anticipate future work requests, so its performance is rarely optimal.
      20With complete knowledge of arrival order and work, creating an optimal solution still effectively requires solving the bin-packing problem~\cite{wiki:binpak}.
     21However, optimal solutions are often not required.
      22Schedulers do produce excellent solutions, without needing optimality, by taking advantage of regularities in work patterns.
     23
      24Scheduling occurs at discrete points when there are transitions in a system.
     25For example, a thread cycles through the following transitions during its execution.
     26\begin{center}
     27\input{executionStates.pstex_t}
     28\end{center}
     29These \newterm{state transition}s are initiated in response to events (\Index{interrupt}s):
     30\begin{itemize}
     31\item
     32entering the system (new $\rightarrow$ ready)
     33\item
     34timer alarm for preemption (running $\rightarrow$ ready)
     35\item
     36long term delay versus spinning (running $\rightarrow$ blocked)
     37\item
     38blocking ends, \ie network or I/O completion (blocked $\rightarrow$ ready)
     39\item
     40normal completion or error, \ie segment fault (running $\rightarrow$ halted)
     41\item
     42scheduler assigns a thread to a resource (ready $\rightarrow$ running)
     43\end{itemize}
      44Key to scheduling is that a thread cannot bypass the ``ready'' state during a transition, so the scheduler maintains complete control of the system.
     45
     46When the workload exceeds the capacity of the processors, \ie work cannot be executed immediately, it is placed on a queue for subsequent service, called a \newterm{ready queue}.
     47Ready queues organize threads for scheduling, which indirectly organizes the work to be performed.
      48The structure of ready queues can take many different forms.
      49Simple examples include the single-queue multi-server (SQMS) and the multi-queue multi-server (MQMS).
     50\begin{center}
     51\begin{tabular}{l|l}
     52\multicolumn{1}{c|}{\textbf{SQMS}} & \multicolumn{1}{c}{\textbf{MQMS}} \\
     53\hline
     54\raisebox{0.5\totalheight}{\input{SQMS.pstex_t}} & \input{MQMSG.pstex_t}
     55\end{tabular}
     56\end{center}
      57Beyond these two schedulers are a host of options, \eg adding an optional global, shared queue to MQMS; a sketch of the two basic designs follows.
     58
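As a hedged sketch (the types and names are invented for illustration), the two designs differ mainly in where the queue, and hence the lock protecting it, lives:
\begin{cfa}
typedef struct { int locked; } lock_t;            // stand-in for a real lock
typedef struct { void * head, * tail; } queue_t;  // stand-in for a FIFO of ready threads
#define NCPU 8

// SQMS: one shared queue and lock => perfect load-balancing, high contention.
struct sqms { lock_t lock; queue_t ready; };

// MQMS: a queue and lock per processor => no contention, but no load-balancing.
struct mqms { struct { lock_t lock; queue_t ready; } cpu[NCPU]; };
\end{cfa}
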
     59The three major optimization criteria for a scheduler are:
     60\begin{enumerate}[leftmargin=*]
     61\item
     62\newterm{load balancing}: available work is distributed so no processor is idle when work is available.
     63
     64\noindent
     65Eventual progress for each work unit is often an important consideration, \ie no starvation.
     66\item
     67\newterm{affinity}: processors access state through a complex memory hierarchy, so it is advantageous to keep a work unit's state on a single or closely bound set of processors.
     68
     69\noindent
     70Essentially, all multi-processor computers have non-uniform memory access (NUMA), with one or more quantized steps to access data at different levels in the memory hierarchy.
     71When a system has a large number of independently executing threads, affinity becomes difficult because of \newterm{thread churn}.
      72That is, threads must be scheduled on multiple processors to obtain high processor utilization because the number of threads $\ggg$ processors.
     73
     74\item
      75\newterm{contention}: safe access of shared objects by multiple processors requires mutual exclusion in some form, generally locking.\footnote{
      76Lock-free data-structures do not involve locking but incur similar costs to achieve mutual exclusion.}
     77
     78\noindent
      79Mutual exclusion cost and latency increase significantly with the number of processors accessing a shared object.
     80\end{enumerate}
     81
     82Nevertheless, schedulers are a series of compromises, occasionally with some static or dynamic tuning parameters to enhance specific patterns.
      83Scheduling is a zero-sum game as computer processors normally have a fixed, maximum number of cycles per unit time\footnote{Frequency scaling and turbo boost add a degree of complexity that can be ignored in this discussion without loss of generality.}.
     84SQMS has perfect load-balancing but poor affinity and high contention by the processors, because of the single queue.
     85MQMS has poor load-balancing but perfect affinity and no contention, because each processor has its own queue.
     86
     87Significant research effort has also looked at load sharing/stealing among queues, when a ready queue is too long or short, respectively.
     88These approaches attempt to perform better load-balancing at the cost of affinity and contention.
      89Load sharing/stealing schedulers attempt to push/pull work units to/from other ready queues.
     90
      91Note, however, that while any change comes at a cost, hence the zero-sum game, not all compromises are necessarily equivalent.
      92Some schedulers can perform very well only in very specific workload scenarios, while others might offer acceptable performance but be applicable to a wider range of workloads.
     93Since \CFA attempts to improve the safety and productivity of C, the scheduler presented in this thesis attempts to achieve the same goals.
     94More specifically, safety and productivity for scheduling means supporting a wide range of workloads so that programmers can rely on progress guarantees (safety) and more easily achieve acceptable performance (productivity).
     95
     96
     97\section{Contributions}\label{s:Contributions}
     98This work provides the following contributions in the area of user-level scheduling in an advanced programming-language runtime-system:
     99\begin{enumerate}[leftmargin=*]
     100\item
     101A scalable scheduling algorithm that offers progress guarantees.
     102\item
     103An algorithm for load-balancing and idle sleep of processors, including NUMA awareness.
     104\item
     105Support for user-level \glsxtrshort{io} capabilities based on Linux's @io_uring@.
     106\end{enumerate}
  • doc/theses/thierry_delisle_PhD/thesis/text/io.tex

    r9e23b446 rffec1bf  
    11\chapter{User Level \io}
    2 As mentioned in Section~\ref{prev:io}, User-Level \io requires multiplexing the \io operations of many \glspl{thrd} onto fewer \glspl{proc} using asynchronous \io operations.
     2As mentioned in Section~\ref{prev:io}, user-level \io requires multiplexing the \io operations of many \glspl{thrd} onto fewer \glspl{proc} using asynchronous \io operations.
    33Different operating systems offer various forms of asynchronous operations and, as mentioned in Chapter~\ref{intro}, this work is exclusively focused on the Linux operating-system.
    44
    55\section{Kernel Interface}
    6 Since this work fundamentally depends on operating-system support, the first step of any design is to discuss the available interfaces and pick one (or more) as the foundations of the non-blocking \io subsystem.
     6Since this work fundamentally depends on operating-system support, the first step of this design is to discuss the available interfaces and pick one (or more) as the foundation for the non-blocking \io subsystem in this work.
    77
    88\subsection{\lstinline{O_NONBLOCK}}
     
    1010In this mode, ``Neither the @open()@ nor any subsequent \io operations on the [opened file descriptor] will cause the calling process to wait''~\cite{MAN:open}.
    1111This feature can be used as the foundation for the non-blocking \io subsystem.
    12 However, for the subsystem to know when an \io operation completes, @O_NONBLOCK@ must be use in conjunction with a system call that monitors when a file descriptor becomes ready, \ie, the next \io operation on it does not cause the process to wait
    13 \footnote{In this context, ready means \emph{some} operation can be performed without blocking.
     12However, for the subsystem to know when an \io operation completes, @O_NONBLOCK@ must be used in conjunction with a system call that monitors when a file descriptor becomes ready, \ie, the next \io operation on it does not cause the process to wait.\footnote{
     13In this context, ready means \emph{some} operation can be performed without blocking.
    1414It does not mean an operation returning \lstinline{EAGAIN} succeeds on the next try.
    15 For example, a ready read may only return a subset of bytes and the read must be issues again for the remaining bytes, at which point it may return \lstinline{EAGAIN}.}.
      15For example, a ready read may only return a subset of requested bytes and the read must be issued again for the remaining bytes, at which point it may return \lstinline{EAGAIN}.}
    1616This mechanism is also crucial in determining when all \glspl{thrd} are blocked and the application \glspl{kthrd} can now block.
    1717
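A minimal sketch of this mode follows: the descriptor is switched to non-blocking with @fcntl@, and an operation that would otherwise wait instead fails with @EAGAIN@/@EWOULDBLOCK@, signalling that the caller must wait for readiness; the wrapper names are invented for illustration.
\begin{cfa}
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

// put an existing file descriptor into non-blocking mode
int set_nonblock( int fd ) {
	int flags = fcntl( fd, F_GETFL, 0 );
	return flags < 0 ? -1 : fcntl( fd, F_SETFL, flags | O_NONBLOCK );
}

// returns bytes read, 0 at end-of-file, -1 on error, or -2 if the
// operation would block and must be retried once the fd is ready
ssize_t nb_read( int fd, void * buf, size_t len ) {
	ssize_t n = read( fd, buf, len );
	if ( n < 0 && ( errno == EAGAIN || errno == EWOULDBLOCK ) ) return -2;
	return n;
}
\end{cfa}
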
    18 There are three options to monitor file descriptors in Linux
    19 \footnote{For simplicity, this section omits \lstinline{pselect} and \lstinline{ppoll}.
     18There are three options to monitor file descriptors in Linux:\footnote{
     19For simplicity, this section omits \lstinline{pselect} and \lstinline{ppoll}.
    2020The difference between these system calls and \lstinline{select} and \lstinline{poll}, respectively, is not relevant for this discussion.},
    2121@select@~\cite{MAN:select}, @poll@~\cite{MAN:poll} and @epoll@~\cite{MAN:epoll}.
    2222All three of these options offer a system call that blocks a \gls{kthrd} until at least one of many file descriptors becomes ready.
    23 The group of file descriptors being waited is called the \newterm{interest set}.
    24 
    25 \paragraph{\lstinline{select}} is the oldest of these options, it takes as an input a contiguous array of bits, where each bits represent a file descriptor of interest.
    26 On return, it modifies the set in place to identify which of the file descriptors changed status.
    27 This destructive change means that calling select in a loop requires re-initializing the array each time and the number of file descriptors supported has a hard limit.
    28 Another limit of @select@ is that once the call is started, the interest set can no longer be modified.
    29 Monitoring a new file descriptor generally requires aborting any in progress call to @select@
    30 \footnote{Starting a new call to \lstinline{select} is possible but requires a distinct kernel thread, and as a result is not an acceptable multiplexing solution when the interest set is large and highly dynamic unless the number of parallel calls to \lstinline{select} can be strictly bounded.}.
    31 
    32 \paragraph{\lstinline{poll}} is an improvement over select, which removes the hard limit on the number of file descriptors and the need to re-initialize the input on every call.
    33 It works using an array of structures as an input rather than an array of bits, thus allowing a more compact input for small interest sets.
    34 Like @select@, @poll@ suffers from the limitation that the interest set cannot be changed while the call is blocked.
    35 
    36 \paragraph{\lstinline{epoll}} further improves these two functions by allowing the interest set to be dynamically added to and removed from while a \gls{kthrd} is blocked on an @epoll@ call.
     23The group of file descriptors being waited on is called the \newterm{interest set}.
     24
     25\paragraph{\lstinline{select}} is the oldest of these options, and takes as input a contiguous array of bits, where each bit represents a file descriptor of interest.
      26Hence, the array length must be at least as large as the largest FD currently of interest.
      27On return, it overwrites the set in place to identify which of the file descriptors changed state.
     28This destructive change means selecting in a loop requires re-initializing the array for each iteration.
     29Another limit of @select@ is that calls from different \glspl{kthrd} sharing FDs are independent.
     30Hence, if one \gls{kthrd} is managing the select calls, other threads can only add/remove to/from the manager's interest set through synchronized calls to update the interest set.
     31However, these changes are only reflected when the manager makes its next call to @select@.
     32Note, it is possible for the manager thread to never unblock if its current interest set never changes, \eg the sockets/pipes/ttys it is waiting on never get data again.
     33Often the I/O manager has a timeout, polls, or is sent a signal on changes to mitigate this problem.
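As an illustration, a minimal @select@ iteration follows, showing the destructive bit array that must be rebuilt on every call; @fd@ is an assumed nonblocking file descriptor:
\begin{cfa}
// Sketch: one select iteration; the interest set is overwritten and must be rebuilt.
fd_set rfds;
FD_ZERO( &rfds );                               // re-initialize the bit array
FD_SET( fd, &rfds );                            // add one FD of interest
if ( select( fd + 1, &rfds, NULL, NULL, NULL ) > 0   // first argument: largest FD + 1
		&& FD_ISSET( fd, &rfds ) ) {
	/* fd is ready: issue the nonblocking operation */
}
\end{cfa}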
     34
     35\begin{comment}
     36From: Tim Brecht <brecht@uwaterloo.ca>
     37Subject: Re: FD sets
     38Date: Wed, 6 Jul 2022 00:29:41 +0000
     39
     40Large number of open files
     41--------------------------
     42
     43In order to be able to use more than the default number of open file
     44descriptors you may need to:
     45
     46o increase the limit on the total number of open files /proc/sys/fs/file-max
     47  (on Linux systems)
     48
     49o increase the size of FD_SETSIZE
     50  - the way I often do this is to figure out which include file __FD_SETSIZE
     51    is defined in, copy that file into an appropriate directory in ./include,
     52    and then modify it so that if you use -DBIGGER_FD_SETSIZE the larger size
     53    gets used
     54
     55  For example on a RH 9.0 distribution I've copied
     56  /usr/include/bits/typesizes.h into ./include/i386-linux/bits/typesizes.h
     57
     58  Then I modify typesizes.h to look something like:
     59
     60  #ifdef BIGGER_FD_SETSIZE
     61  #define __FD_SETSIZE            32767
     62  #else
     63  #define __FD_SETSIZE            1024
     64  #endif
     65
     66  Note that the since I'm moving and testing the userver on may different
     67  machines the Makefiles are set up to use -I ./include/$(HOSTTYPE)
     68
     69  This way if you redefine the FD_SETSIZE it will get used instead of the
     70  default original file.
     71\end{comment}
     72
     73\paragraph{\lstinline{poll}} is the next oldest option, and takes as input an array of structures containing the FD numbers rather than their position in an array of bits, allowing a more compact input for interest sets that contain widely spaced FDs.
     74For small interest sets with densely packed FDs, the @select@ bit mask can take less storage, and hence, copy less information into the kernel.
      75Furthermore, @poll@ is non-destructive, so the array of structures does not have to be re-initialized on every call.
      76Like @select@, @poll@ suffers from the limitation that the interest set cannot be changed by other \glspl{kthrd} while a manager thread is blocked in @poll@.
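The corresponding @poll@ sketch follows; the two file descriptors are illustrative:
\begin{cfa}
// Sketch: poll's input array is non-destructive, so it is built once.
struct pollfd pfds[2] = { { .fd = fd0, .events = POLLIN }, { .fd = fd1, .events = POLLIN } };
if ( poll( pfds, 2, -1 ) > 0 ) {                // block until at least one FD is ready
	for ( int i = 0; i < 2; i += 1 )
		if ( pfds[i].revents & POLLIN ) { /* issue nonblocking I/O on pfds[i].fd */ }
}
\end{cfa}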
     77
     78\paragraph{\lstinline{epoll}} follows after @poll@, and places the interest set in the kernel rather than the application, where it is managed by an internal \gls{kthrd}.
     79There are two separate functions: one to add to the interest set and another to check for FDs with state changes.
    3780This dynamic capability is accomplished by creating an \emph{epoll instance} with a persistent interest set, which is used across multiple calls.
    38 This capability significantly reduces synchronization overhead on the part of the caller (in this case the \io subsystem), since the interest set can be modified when adding or removing file descriptors without having to synchronize with other \glspl{kthrd} potentially calling @epoll@.
    39 
    40 However, all three of these system calls have limitations.
     81As the interest set is augmented, the changes become implicitly part of the interest set for a blocked manager \gls{kthrd}.
     82This capability significantly reduces synchronization between \glspl{kthrd} and the manager calling @epoll@.
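A minimal sketch of the same pattern with @epoll@ follows, showing the persistent kernel-side interest set:
\begin{cfa}
// Sketch: the epoll interest set lives in the kernel and persists across calls.
int ep = epoll_create1( 0 );                    // create the epoll instance
struct epoll_event ev = { .events = EPOLLIN, .data = { .fd = fd } };
epoll_ctl( ep, EPOLL_CTL_ADD, fd, &ev );        // may run while another thread blocks in epoll_wait
struct epoll_event ready[16];
int n = epoll_wait( ep, ready, 16, -1 );        // block until at least one FD is ready
for ( int i = 0; i < n; i += 1 ) { /* issue nonblocking I/O on ready[i].data.fd */ }
\end{cfa}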
     83
     84However, all three of these I/O systems have limitations.
    4185The @man@ page for @O_NONBLOCK@ mentions that ``[@O_NONBLOCK@] has no effect for regular files and block devices'', which means none of these three system calls are viable multiplexing strategies for these types of \io operations.
    4286Furthermore, @epoll@ has been shown to have problems with pipes and ttys~\cit{Peter's examples in some fashion}.
     
    5397It also supports batching multiple operations in a single system call.
    5498
    55 AIO offers two different approach to polling: @aio_error@ can be used as a spinning form of polling, returning @EINPROGRESS@ until the operation is completed, and @aio_suspend@ can be used similarly to @select@, @poll@ or @epoll@, to wait until one or more requests have completed.
     99AIO offers two different approaches to polling: @aio_error@ can be used as a spinning form of polling, returning @EINPROGRESS@ until the operation is completed, and @aio_suspend@ can be used similarly to @select@, @poll@ or @epoll@, to wait until one or more requests have completed.
    56100For the purpose of \io multiplexing, @aio_suspend@ is the best interface.
    57101However, even if AIO requests can be submitted concurrently, @aio_suspend@ suffers from the same limitation as @select@ and @poll@, \ie, the interest set cannot be dynamically changed while a call to @aio_suspend@ is in progress.
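For illustration, a minimal AIO sketch follows, showing both polling styles on a single request; @fd@, @buf@ and @count@ are assumed:
\begin{cfa}
// Sketch: one asynchronous read, waited on with aio_suspend.
struct aiocb cb = { .aio_fildes = fd, .aio_buf = buf, .aio_nbytes = count };
aio_read( &cb );                                // submit the asynchronous read
const struct aiocb * list[1] = { &cb };
while ( aio_error( &cb ) == EINPROGRESS )       // aio_error alone is the spinning form
	aio_suspend( list, 1, NULL );           // block until some request completes
ssize_t ret = aio_return( &cb );                // collect the result exactly once
\end{cfa}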
     
    70114
    71115        \begin{flushright}
    72                 -- Linus Torvalds\cit{https://lwn.net/Articles/671657/}
     116                -- Linus Torvalds~\cite{AIORant}
    73117        \end{flushright}
    74118\end{displayquote}
     
    85129A very recent addition to Linux, @io_uring@~\cite{MAN:io_uring}, is a framework that aims to solve many of the problems listed in the above interfaces.
    86130Like AIO, it represents \io operations as entries added to a queue.
    87 But like @epoll@, new requests can be submitted while a blocking call waiting for requests to complete is already in progress.
      131But like @epoll@, new requests can be submitted while a blocking call waiting for requests to complete is already in progress.
    88132The @io_uring@ interface uses two ring buffers (referred to simply as rings) at its core: a submit ring to which programmers push \io requests and a completion ring from which programmers poll for completion.
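To make the two rings concrete, the following sketch uses the liburing helper library, which wraps the raw rings; the helper library is an assumption here, as the runtime described below drives the rings directly:
\begin{cfa}
// Sketch (liburing): push one request onto the submit ring, poll one completion.
struct io_uring ring;
io_uring_queue_init( 64, &ring, 0 );            // create instance: rings + SQE array
struct io_uring_sqe * sqe = io_uring_get_sqe( &ring );  // allocate an SQE
io_uring_prep_read( sqe, fd, buf, count, 0 );   // fill it for a read
io_uring_sqe_set_data( sqe, req );              // user_data matches submission to completion
io_uring_submit( &ring );                       // append to submit ring and notify kernel
struct io_uring_cqe * cqe;
io_uring_wait_cqe( &ring, &cqe );               // wait on the completion ring
/* cqe->res holds the result; io_uring_cqe_get_data( cqe ) recovers req */
io_uring_cqe_seen( &ring, cqe );                // consume the CQE
\end{cfa}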
    89133
     
    97141In the worst case, where all \glspl{thrd} are consistently blocking on \io, it devolves into 1-to-1 threading.
    98142However, regardless of the frequency of \io operations, it achieves the fundamental goal of not blocking \glspl{proc} when \glspl{thrd} are ready to run.
    99 This approach is used by languages like Go\cit{Go} and frameworks like libuv\cit{libuv}, since it has the advantage that it can easily be used across multiple operating systems.
     143This approach is used by languages like Go\cit{Go}, frameworks like libuv\cit{libuv}, and web servers like Apache~\cite{apache} and Nginx~\cite{nginx}, since it has the advantage that it can easily be used across multiple operating systems.
    100144This advantage is especially relevant for languages like Go, which offer a homogeneous \glsxtrshort{api} across all platforms.
     101145In contrast, C has a very limited standard \glsxtrshort{api} for \io, \eg, the C standard library has no networking.
     
    111155\section{Event-Engine}
    112156An event engine's responsibility is to use the kernel interface to multiplex many \io operations onto few \glspl{kthrd}.
    113 In concrete terms, this means \glspl{thrd} enter the engine through an interface, the event engines then starts the operation and parks the calling \glspl{thrd}, returning control to the \gls{proc}.
      157In concrete terms, this means \glspl{thrd} enter the engine through an interface; the event engine then starts an operation and parks the calling \glspl{thrd}, returning control to the \gls{proc}.
    114158The parked \glspl{thrd} are then rescheduled by the event engine once the desired operation has completed.
    115159
     
    134178\begin{enumerate}
    135179\item
    136 An SQE is allocated from the pre-allocated array (denoted \emph{S} in Figure~\ref{fig:iouring}).
     180An SQE is allocated from the pre-allocated array \emph{S}.
    137181This array is created at the same time as the @io_uring@ instance, is in kernel-locked memory visible by both the kernel and the application, and has a fixed size determined at creation.
    138 How these entries are allocated is not important for the functioning of @io_uring@, the only requirement is that no entry is reused before the kernel has consumed it.
     182How these entries are allocated is not important for the functioning of @io_uring@;
     183the only requirement is that no entry is reused before the kernel has consumed it.
    139184\item
    140185The SQE is filled according to the desired operation.
    141 This step is straight forward, the only detail worth mentioning is that SQEs have a @user_data@ field that must be filled in order to match submission and completion entries.
      186This step is straightforward.
     187The only detail worth mentioning is that SQEs have a @user_data@ field that must be filled in order to match submission and completion entries.
    142188\item
    143189The SQE is submitted to the submission ring by appending the index of the SQE to the ring following regular ring buffer steps: \lstinline{buffer[head] = item; head++}.
    144190Since the head is visible to the kernel, some memory barriers may be required to prevent the compiler from reordering these operations.
    145191Since the submission ring is a regular ring buffer, more than one SQE can be added at once and the head is updated only after all entries are updated.
      192Note, SQEs can be filled and submitted in any order, \eg in Figure~\ref{fig:iouring} the submission order is S0, S3, S2 and S1 has not been submitted.
    146193\item
    147194The kernel is notified of the change to the ring using the system call @io_uring_enter@.
     
    161208The @io_uring_enter@ system call is protected by a lock inside the kernel.
     162209This protection means that concurrent calls to @io_uring_enter@ using the same instance are possible, but there is no performance gained from parallel calls to @io_uring_enter@.
    163 It is possible to do the first three submission steps in parallel, however, doing so requires careful synchronization.
     210It is possible to do the first three submission steps in parallel;
     211however, doing so requires careful synchronization.
    164212
    165213@io_uring@ also introduces constraints on the number of simultaneous operations that can be ``in flight''.
    166 Obviously, SQEs are allocated from a fixed-size array, meaning that there is a hard limit to how many SQEs can be submitted at once.
    167 In addition, the @io_uring_enter@ system call can fail because ``The  kernel [...] ran out of resources to handle [a request]'' or ``The application is attempting to overcommit the number of requests it can  have pending.''.
     214First, SQEs are allocated from a fixed-size array, meaning that there is a hard limit to how many SQEs can be submitted at once.
      215Second, the @io_uring_enter@ system call can fail because ``The kernel [...] ran out of resources to handle [a request]'' or ``The application is attempting to overcommit the number of requests it can have pending.''
    168216This restriction means \io request bursts may have to be subdivided and submitted in chunks at a later time.
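A sketch of how a submission might react to these failures follows; glibc offers no wrapper for @io_uring_enter@, so the raw system call is used, and the deferral policy shown is illustrative only:
\begin{cfa}
// Sketch: raw io_uring_enter with partial-consumption and overcommit handling.
int ret = syscall( __NR_io_uring_enter, ring_fd, to_submit, 0, 0, NULL, 0 );
if ( ret >= 0 && ret < to_submit ) {
	/* kernel consumed only part of the batch: resubmit the remainder later */
} else if ( ret < 0 && errno == EBUSY ) {
	/* overcommit: defer this chunk until some completions drain */
}
\end{cfa}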
    169217
    170218\subsection{Multiplexing \io: Submission}
     219
    171220The submission side is the most complicated aspect of @io_uring@ and the completion side effectively follows from the design decisions made in the submission side.
    172 While it is possible to do the first steps of submission in parallel, the duration of the system call scales with number of entries submitted.
     221While there is freedom in designing the submission side, there are some realities of @io_uring@ that must be taken into account.
     222It is possible to do the first steps of submission in parallel;
     223however, the duration of the system call scales with the number of entries submitted.
    173224The consequence is that the amount of parallelism used to prepare submissions for the next system call is limited.
    174225Beyond this limit, the length of the system call is the throughput limiting factor.
    175 I concluded from early experiments that preparing submissions seems to take at most as long as the system call itself, which means that with a single @io_uring@ instance, there is no benefit in terms of \io throughput to having more than two \glspl{hthrd}.
    176 Therefore the design of the submission engine must manage multiple instances of @io_uring@ running in parallel, effectively sharding @io_uring@ instances.
    177 Similarly to scheduling, this sharding can be done privately, \ie, one instance per \glspl{proc}, in decoupled pools, \ie, a pool of \glspl{proc} use a pool of @io_uring@ instances without one-to-one coupling between any given instance and any given \gls{proc}, or some mix of the two.
    178 Since completions are sent to the instance where requests were submitted, all instances with pending operations must be polled continously
    179 \footnote{As will be described in Chapter~\ref{practice}, this does not translate into constant cpu usage.}.
     226I concluded from early experiments that preparing submissions seems to take almost as long as the system call itself, which means that with a single @io_uring@ instance, there is no benefit in terms of \io throughput to having more than two \glspl{hthrd}.
     227Therefore, the design of the submission engine must manage multiple instances of @io_uring@ running in parallel, effectively sharding @io_uring@ instances.
     228Since completions are sent to the instance where requests were submitted, all instances with pending operations must be polled continuously\footnote{
     229As described in Chapter~\ref{practice}, this does not translate into constant CPU usage.}.
    180230Note that once an operation completes, there is nothing that ties it to the @io_uring@ instance that handled it.
    181 There is nothing preventing a new operation with, for example, the same file descriptors to a different @io_uring@ instance.
      231There is nothing preventing a new operation, \eg with the same file descriptors, from being submitted to a different @io_uring@ instance.
    182232
    183233A complicating aspect of submission is @io_uring@'s support for chains of operations, where the completion of an operation triggers the submission of the next operation on the link.
    184234SQEs forming a chain must be allocated from the same instance and must be contiguous in the Submission Ring (see Figure~\ref{fig:iouring}).
    185 The consequence of this feature is that filling SQEs can be arbitrarly complex and therefore users may need to run arbitrary code between allocation and submission.
    186 Supporting chains is a requirement of the \io subsystem, but it is still valuable.
    187 Support for this feature can be fulfilled simply to supporting arbitrary user code between allocation and submission.
    188 
    189 \subsubsection{Public Instances}
    190 One approach is to have multiple shared instances.
    191 \Glspl{thrd} attempting \io operations pick one of the available instances and submit operations to that instance.
    192 Since there is no coupling between \glspl{proc} and @io_uring@ instances in this approach, \glspl{thrd} running on more than one \gls{proc} can attempt to submit to the same instance concurrently.
    193 Since @io_uring@ effectively sets the amount of sharding needed to avoid contention on its internal locks, performance in this approach is based on two aspects: the synchronization needed to submit does not induce more contention than @io_uring@ already does and the scheme to route \io requests to specific @io_uring@ instances does not introduce contention.
    194 This second aspect has an oversized importance because it comes into play before the sharding of instances, and as such, all \glspl{hthrd} can contend on the routing algorithm.
    195 
    196 Allocation in this scheme can be handled fairly easily.
    197 Free SQEs, \ie, SQEs that aren't currently being used to represent a request, can be written to safely and have a field called @user_data@ which the kernel only reads to copy to @cqe@s.
    198 Allocation also requires no ordering guarantee as all free SQEs are interchangeable.
    199 This requires a simple concurrent bag.
    200 The only added complexity is that the number of SQEs is fixed, which means allocation can fail.
    201 
    202 Allocation failures need to be pushed up to a routing algorithm: \glspl{thrd} attempting \io operations must not be directed to @io_uring@ instances without sufficient SQEs available.
    203 Furthermore, the routing algorithm should block operations up-front if none of the instances have available SQEs.
    204 
    205 Once an SQE is allocated, \glspl{thrd} can fill them normally, they simply need to keep track of the SQE index and which instance it belongs to.
    206 
    207 Once an SQE is filled in, what needs to happen is that the SQE must be added to the submission ring buffer, an operation that is not thread-safe on itself, and the kernel must be notified using the @io_uring_enter@ system call.
    208 The submission ring buffer is the same size as the pre-allocated SQE buffer, therefore pushing to the ring buffer cannot fail
    209 \footnote{This is because it is invalid to have the same \lstinline{sqe} multiple times in the ring buffer.}.
    210 However, as mentioned, the system call itself can fail with the expectation that it will be retried once some of the already submitted operations complete.
    211 Since multiple SQEs can be submitted to the kernel at once, it is important to strike a balance between batching and latency.
    212 Operations that are ready to be submitted should be batched together in few system calls, but at the same time, operations should not be left pending for long period of times before being submitted.
    213 This can be handled by either designating one of the submitting \glspl{thrd} as the being responsible for the system call for the current batch of SQEs or by having some other party regularly submitting all ready SQEs, \eg, the poller \gls{thrd} mentioned later in this section.
    214 
    215 In the case of designating a \gls{thrd}, ideally, when multiple \glspl{thrd} attempt to submit operations to the same @io_uring@ instance, all requests would be batched together and one of the \glspl{thrd} would do the system call on behalf of the others, referred to as the \newterm{submitter}.
    216 In practice however, it is important that the \io requests are not left pending indefinitely and as such, it may be required to have a ``next submitter'' that guarentees everything that is missed by the current submitter is seen by the next one.
    217 Indeed, as long as there is a ``next'' submitter, \glspl{thrd} submitting new \io requests can move on, knowing that some future system call will include their request.
    218 Once the system call is done, the submitter must also free SQEs so that the allocator can reused them.
    219 
    220 Finally, the completion side is much simpler since the @io_uring@ system call enforces a natural synchronization point.
    221 Polling simply needs to regularly do the system call, go through the produced CQEs and communicate the result back to the originating \glspl{thrd}.
    222 Since CQEs only own a signed 32 bit result, in addition to the copy of the @user_data@ field, all that is needed to communicate the result is a simple future~\cite{wiki:future}.
    223 If the submission side does not designate submitters, polling can also submit all SQEs as it is polling events.
    224 A simple approach to polling is to allocate a \gls{thrd} per @io_uring@ instance and simply let the poller \glspl{thrd} poll their respective instances when scheduled.
    225 
    226 With this pool of instances approach, the big advantage is that it is fairly flexible.
    227 It does not impose restrictions on what \glspl{thrd} submitting \io operations can and cannot do between allocations and submissions.
    228 It also can gracefully handle running out of ressources, SQEs or the kernel returning @EBUSY@.
    229 The down side to this is that many of the steps used for submitting need complex synchronization to work properly.
    230 The routing and allocation algorithm needs to keep track of which ring instances have available SQEs, block incoming requests if no instance is available, prevent barging if \glspl{thrd} are already queued up waiting for SQEs and handle SQEs being freed.
    231 The submission side needs to safely append SQEs to the ring buffer, correctly handle chains, make sure no SQE is dropped or left pending forever, notify the allocation side when SQEs can be reused and handle the kernel returning @EBUSY@.
    232 All this synchronization may have a significant cost and, compared to the next approach presented, this synchronization is entirely overhead.
     235The consequence of this feature is that filling SQEs can be arbitrarily complex, and therefore, users may need to run arbitrary code between allocation and submission.
     236Supporting chains is not a requirement of the \io subsystem, but it is still valuable.
     237Support for this feature can be fulfilled simply by supporting arbitrary user code between allocation and submission.
     238
     239Similar to scheduling, sharding @io_uring@ instances can be done privately, \ie, one instance per \glspl{proc}, in decoupled pools, \ie, a pool of \glspl{proc} use a pool of @io_uring@ instances without one-to-one coupling between any given instance and any given \gls{proc}, or some mix of the two.
     240These three sharding approaches are analyzed.
    233241
    234242\subsubsection{Private Instances}
    235 Another approach is to simply create one ring instance per \gls{proc}.
    236 This alleviates the need for synchronization on the submissions, requiring only that \glspl{thrd} are not interrupted in between two submission steps.
    237 This is effectively the same requirement as using @thread_local@ variables.
    238 Since SQEs that are allocated must be submitted to the same ring, on the same \gls{proc}, this effectively forces the application to submit SQEs in allocation order
    239 \footnote{The actual requirement is that \glspl{thrd} cannot context switch between allocation and submission.
    240 This requirement means that from the subsystem's point of view, the allocation and submission are sequential.
    241 To remove this requirement, a \gls{thrd} would need the ability to ``yield to a specific \gls{proc}'', \ie, park with the promise that it will be run next on a specific \gls{proc}, the \gls{proc} attached to the correct ring.}
    242 , greatly simplifying both allocation and submission.
    243 In this design, allocation and submission form a partitionned ring buffer as shown in Figure~\ref{fig:pring}.
    244 Once added to the ring buffer, the attached \gls{proc} has a significant amount of flexibility with regards to when to do the system call.
    245 Possible options are: when the \gls{proc} runs out of \glspl{thrd} to run, after running a given number of \glspl{thrd}, etc.
     243The private approach creates one ring instance per \gls{proc}, \ie one-to-one coupling.
     244This alleviates the need for synchronization on the submissions, requiring only that \glspl{thrd} are not time-sliced during submission steps.
     245This requirement is the same as accessing @thread_local@ variables, where a \gls{thrd} is accessing kernel-thread data, is time-sliced, and continues execution on another kernel thread but is now accessing the wrong data.
     246This failure is the serially reusable problem~\cite{SeriallyReusable}.
     247Hence, allocated SQEs must be submitted to the same ring on the same \gls{proc}, which effectively forces the application to submit SQEs in allocation order.\footnote{
     248To remove this requirement, a \gls{thrd} needs the ability to ``yield to a specific \gls{proc}'', \ie, park with the guarantee it unparks on a specific \gls{proc}, \ie the \gls{proc} attached to the correct ring.}
     249From the subsystem's point of view, the allocation and submission are sequential, greatly simplifying both.
     250In this design, allocation and submission form a partitioned ring buffer as shown in Figure~\ref{fig:pring}.
     251Once added to the ring buffer, the attached \gls{proc} has a significant amount of flexibility with regards to when to perform the system call.
     252Possible options are: when the \gls{proc} runs out of \glspl{thrd} to run, after running a given number of \glspl{thrd}, \etc.
    246253
    247254\begin{figure}
     
    254261\end{figure}
    255262
    256 This approach has the advantage that it does not require much of the synchronization needed in the shared approach.
    257 This comes at the cost that \glspl{thrd} submitting \io operations have less flexibility, they cannot park or yield, and several exceptional cases are handled poorly.
    258 Instances running out of SQEs cannot run \glspl{thrd} wanting to do \io operations, in such a case the \gls{thrd} needs to be moved to a different \gls{proc}, the only current way of achieving this would be to @yield()@ hoping to be scheduled on a different \gls{proc}, which is not guaranteed.
    259 
    260 A more involved version of this approach can seem to solve most of these problems, using a pattern called \newterm{helping}.
    261 \Glspl{thrd} that wish to submit \io operations but cannot do so
    262 \footnote{either because of an allocation failure or because they were migrate to a different \gls{proc} between allocation and submission}
    263 create an object representing what they wish to achieve and add it to a list somewhere.
    264 For this particular problem, one solution would be to have a list of pending submissions per \gls{proc} and a list of pending allocations, probably per cluster.
    265 The problem with these ``solutions'' is that they are still bound by the strong coupling between \glspl{proc} and @io_uring@ instances.
    266 These data structures would allow moving \glspl{thrd} to a specific \gls{proc} when the current \gls{proc} cannot fulfill the \io request.
    267 
    268 Imagine a simple case with two \glspl{thrd} on two \glspl{proc}, one \gls{thrd} submits an \io operation and then sets a flag, the other \gls{thrd} spins until the flag is set.
    269 If the first \gls{thrd} is preempted between allocation and submission and moves to the other \gls{proc}, the original \gls{proc} could start running the spinning \gls{thrd}.
    270 If this happens, the helping ``solution'' is for the \io \gls{thrd}to added append an item to the submission list of the \gls{proc} where the allocation was made.
     263This approach has the advantage that it does not require much of the synchronization needed in a shared approach.
     264However, this benefit means \glspl{thrd} submitting \io operations have less flexibility: they cannot park or yield, and several exceptional cases are handled poorly.
     265Instances running out of SQEs cannot run \glspl{thrd} wanting to do \io operations.
     266In this case, the \io \gls{thrd} needs to be moved to a different \gls{proc}, and the only current way of achieving this is to @yield()@ hoping to be scheduled on a different \gls{proc} with free SQEs, which is not guaranteed.
     267
     268A more involved version of this approach tries to solve these problems using a pattern called \newterm{helping}.
     269\Glspl{thrd} that cannot submit \io operations, either because of an allocation failure or migration to a different \gls{proc} between allocation and submission, create an \io object and add it to a list of pending submissions per \gls{proc} and a list of pending allocations, probably per cluster.
      270While there is still a strong coupling between \glspl{proc} and @io_uring@ instances, these data structures allow moving \glspl{thrd} to a specific \gls{proc} when the current \gls{proc} cannot fulfill the \io request.
     271
     272Imagine a simple scenario with two \glspl{thrd} on two \glspl{proc}, where one \gls{thrd} submits an \io operation and then sets a flag, while the other \gls{thrd} spins until the flag is set.
     273Assume both \glspl{thrd} are running on the same \gls{proc}, and the \io \gls{thrd} is preempted between allocation and submission, moved to the second \gls{proc}, and the original \gls{proc} starts running the spinning \gls{thrd}.
     274In this case, the helping solution has the \io \gls{thrd} append an \io object to the submission list of the first \gls{proc}, where the allocation was made.
    271275No other \gls{proc} can help the \gls{thrd} since @io_uring@ instances are strongly coupled to \glspl{proc}.
    272 However, in this case, the \gls{proc} is unable to help because it is executing the spinning \gls{thrd} mentioned when first expression this case
    273 \footnote{This particular example is completely artificial, but in the presence of many more \glspl{thrd}, it is not impossible that this problem would arise ``in the wild''.
    274 Furthermore, this pattern is difficult to reliably detect and avoid.}
    275 resulting in a deadlock.
    276 Once in this situation, the only escape is to interrupted the execution of the \gls{thrd}, either directly or due to regular preemption, only then can the \gls{proc} take the time to handle the pending request to help.
    277 Interrupting \glspl{thrd} for this purpose is far from desireable, the cost is significant and the situation may be hard to detect.
    278 However, a more subtle reason why interrupting the \gls{thrd} is not a satisfying solution is that the \gls{proc} is not actually using the instance it is tied to.
    279 If it were to use it, then helping could be done as part of the usage.
      276However, the \io \gls{proc} is unable to help because it is executing the spinning \gls{thrd}, resulting in a deadlock.
     277While this example is artificial, in the presence of many \glspl{thrd}, it is possible for this problem to arise ``in the wild''.
     278Furthermore, this pattern is difficult to reliably detect and avoid.
      279Once in this situation, the only escape is to interrupt the spinning \gls{thrd}, either directly or via some regular preemption, \eg time slicing.
     280Having to interrupt \glspl{thrd} for this purpose is costly, the latency can be large between interrupts, and the situation may be hard to detect.
    280281Interrupts are needed here entirely because the \gls{proc} is tied to an instance it is not using.
    281 Therefore a more satisfying solution would be for the \gls{thrd} submitting the operation to simply notice that the instance is unused and simply go ahead and use it.
    282 This is the approach presented next.
     282Therefore, a more satisfying solution is for the \gls{thrd} submitting the operation to notice that the instance is unused and simply go ahead and use it.
     283This approach is presented shortly.
     284
     285\subsubsection{Public Instances}
     286The public approach creates decoupled pools of @io_uring@ instances and processors, \ie without one-to-one coupling.
     287\Glspl{thrd} attempting an \io operation pick one of the available instances and submit the operation to that instance.
     288Since there is no coupling between @io_uring@ instances and \glspl{proc} in this approach, \glspl{thrd} running on more than one \gls{proc} can attempt to submit to the same instance concurrently.
     289Because @io_uring@ effectively sets the amount of sharding needed to avoid contention on its internal locks, performance in this approach is based on two aspects:
     290\begin{itemize}
     291\item
     292The synchronization needed to submit does not induce more contention than @io_uring@ already does.
     293\item
     294The scheme to route \io requests to specific @io_uring@ instances does not introduce contention.
     295This aspect has an oversized importance because it comes into play before the sharding of instances, and as such, all \glspl{hthrd} can contend on the routing algorithm.
     296\end{itemize}
     297
     298Allocation in this scheme is fairly easy.
      299Free SQEs, \ie, SQEs that are not currently being used to represent a request, can be written to safely and have a field called @user_data@ that the kernel only reads to copy to CQEs.
     300Allocation also requires no ordering guarantee as all free SQEs are interchangeable.
     301The only added complexity is that the number of SQEs is fixed, which means allocation can fail.
     302
     303Allocation failures need to be pushed to a routing algorithm: \glspl{thrd} attempting \io operations must not be directed to @io_uring@ instances without sufficient SQEs available.
      304Furthermore, the routing algorithm should block operations up-front if none of the instances have available SQEs.
     305
     306Once an SQE is allocated, \glspl{thrd} insert the \io request information, and keep track of the SQE index and the instance it belongs to.
     307
     308Once an SQE is filled in, it is added to the submission ring buffer, an operation that is not thread-safe, and then the kernel must be notified using the @io_uring_enter@ system call.
      309The submission ring buffer is the same size as the pre-allocated SQE buffer, therefore pushing to the ring buffer cannot fail, because failure would mean the same \lstinline{sqe} appears multiple times in the ring buffer, which is undefined behaviour.
     310However, as mentioned, the system call itself can fail with the expectation that it can be retried once some submitted operations complete.
     311
     312Since multiple SQEs can be submitted to the kernel at once, it is important to strike a balance between batching and latency.
      313Operations that are ready to be submitted should be batched together in few system calls, but at the same time, operations should not be left pending for long periods of time before being submitted.
      314Balancing submission can be handled by either designating one of the submitting \glspl{thrd} as being responsible for the system call for the current batch of SQEs or by having some other party regularly submit all ready SQEs, \eg, the poller \gls{thrd} mentioned later in this section.
     315
     316Ideally, when multiple \glspl{thrd} attempt to submit operations to the same @io_uring@ instance, all requests should be batched together and one of the \glspl{thrd} is designated to do the system call on behalf of the others, called the \newterm{submitter}.
      317However, in practice, \io requests must be handled promptly, so there is a need to guarantee everything missed by the current submitter is seen by the next one.
     318Indeed, as long as there is a ``next'' submitter, \glspl{thrd} submitting new \io requests can move on, knowing that some future system call includes their request.
      319Once the system call is done, the submitter must also free SQEs so that the allocator can reuse them.
     320
      321Finally, the completion side is much simpler since the @io_uring@ system call enforces a natural synchronization point.
     322Polling simply needs to regularly do the system call, go through the produced CQEs and communicate the result back to the originating \glspl{thrd}.
      323Since a CQE only contains a signed 32-bit result, in addition to a copy of the @user_data@ field, all that is needed to communicate the result is a simple future~\cite{wiki:future}.
     324If the submission side does not designate submitters, polling can also submit all SQEs as it is polling events.
     325A simple approach to polling is to allocate a \gls{thrd} per @io_uring@ instance and simply let the poller \glspl{thrd} poll their respective instances when scheduled.
     326
      327With the pool of @io_uring@ instances approach, the big advantage is that it is fairly flexible.
     328It does not impose restrictions on what \glspl{thrd} submitting \io operations can and cannot do between allocations and submissions.
     329It also can gracefully handle running out of resources, SQEs or the kernel returning @EBUSY@.
      330The downside to this approach is that many of the steps used for submitting need complex synchronization to work properly.
     331The routing and allocation algorithm needs to keep track of which ring instances have available SQEs, block incoming requests if no instance is available, prevent barging if \glspl{thrd} are already queued up waiting for SQEs and handle SQEs being freed.
     332The submission side needs to safely append SQEs to the ring buffer, correctly handle chains, make sure no SQE is dropped or left pending forever, notify the allocation side when SQEs can be reused, and handle the kernel returning @EBUSY@.
     333All this synchronization has a significant cost, and compared to the private-instance approach, this synchronization is entirely overhead.
    283334
     284335\subsubsection{Instance Borrowing}
    285 Both of the approaches presented above have undesirable aspects that stem from too loose or too tight coupling between @io_uring@ and \glspl{proc}.
    286 In the first approach, loose coupling meant that all operations have synchronization overhead that a tighter coupling can avoid.
    287 The second approach on the other hand suffers from tight coupling causing problems when the \gls{proc} do not benefit from the coupling.
    288 While \glspl{proc} are continously issuing \io operations tight coupling is valuable since it avoids synchronization costs.
    289 However, in unlikely failure cases or when \glspl{proc} are not making use of their instance, tight coupling is no longer advantageous.
    290 A compromise between these approaches would be to allow tight coupling but have the option to revoke this coupling dynamically when failure cases arise.
    291 I call this approach ``instance borrowing''\footnote{While it looks similar to work-sharing and work-stealing, I think it is different enough from either to warrant a different verb to avoid confusion.}.
    292 
    293 In this approach, each cluster owns a pool of @io_uring@ instances managed by an arbiter.
     336Both of the prior approaches have undesirable aspects that stem from tight or loose coupling between @io_uring@ and \glspl{proc}.
     337The first approach suffers from tight coupling causing problems when a \gls{proc} does not benefit from the coupling.
     338The second approach suffers from loose coupling causing operations to have synchronization overhead, which tighter coupling avoids.
     339When \glspl{proc} are continuously issuing \io operations, tight coupling is valuable since it avoids synchronization costs.
     340However, in unlikely failure cases or when \glspl{proc} are not using their instances, tight coupling is no longer advantageous.
     341A compromise between these approaches is to allow tight coupling but have the option to revoke the coupling dynamically when failure cases arise.
     342I call this approach \newterm{instance borrowing}.\footnote{
     343While instance borrowing looks similar to work sharing and stealing, I think it is different enough to warrant a different verb to avoid confusion.}
     344
     345In this approach, each cluster, see Figure~\ref{fig:system}, owns a pool of @io_uring@ instances managed by an \newterm{arbiter}.
     294346When a \gls{thrd} attempts to issue an \io operation, it asks for an instance from the arbiter and issues requests to that instance.
    295 However, in doing so it ties to the instance to the \gls{proc} it is currently running on.
    296 This coupling is kept until the arbiter decides to revoke it, taking back the instance and reverting the \gls{proc} to its initial state with respect to \io.
    297 This tight coupling means that synchronization can be minimal since only one \gls{proc} can use the instance at any given time, akin to the private instances approach.
    298 However, where it differs is that revocation from the arbiter means this approach does not suffer from the deadlock scenario described above.
     347This instance is now bound to the \gls{proc} the \gls{thrd} is running on.
     348This binding is kept until the arbiter decides to revoke it, taking back the instance and reverting the \gls{proc} to its initial state with respect to \io.
     349This tight coupling means that synchronization can be minimal since only one \gls{proc} can use the instance at a time, akin to the private instances approach.
     350However, it differs in that revocation by the arbiter means this approach does not suffer from the deadlock scenario described above.
    299351
    300352Arbitration is needed in the following cases:
    301353\begin{enumerate}
    302         \item The current \gls{proc} does not currently hold an instance.
     354        \item The current \gls{proc} does not hold an instance.
    303355        \item The current instance does not have sufficient SQEs to satisfy the request.
    304         \item The current \gls{proc} has the wrong instance, this happens if the submitting \gls{thrd} context-switched between allocation and submission.
    305         I will refer to these as \newterm{External Submissions}.
      356	\item The current \gls{proc} has the wrong instance; this happens when the submitting \gls{thrd} context-switched between allocation and submission, called \newterm{external submissions}.
    306357\end{enumerate}
    307 However, even when the arbiter is not directly needed, \glspl{proc} need to make sure that their ownership of the instance is not being revoked.
    308 This can be accomplished by a lock-less handshake\footnote{Note that the handshake is not Lock-\emph{Free} since it lacks the proper progress guarantee.}.
     358However, even when the arbiter is not directly needed, \glspl{proc} need to make sure that their instance ownership is not being revoked, which is accomplished by a lock-\emph{less} handshake.\footnote{
     359Note the handshake is not lock \emph{free} since it lacks the proper progress guarantee.}
    309360A \gls{proc} raises a local flag before using its borrowed instance and checks if the instance is marked as revoked or if the arbiter has raised its flag.
    310 If not it proceeds, otherwise it delegates the operation to the arbiter.
     361If not, it proceeds, otherwise it delegates the operation to the arbiter.
    311362Once the operation is completed, the \gls{proc} lowers its local flag.
    312363
    313 Correspondingly, before revoking an instance the arbiter marks the instance and then waits for the \gls{proc} using it to lower its local flag.
     364Correspondingly, before revoking an instance, the arbiter marks the instance and then waits for the \gls{proc} using it to lower its local flag.
     314365Only then does it reclaim the instance and potentially assign it to another \gls{proc}.
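A sketch of the handshake follows; the flag and field names, and the helper for delegating to the arbiter, are illustrative:
\begin{cfa}
// Sketch: processor side of the lock-less handshake.
__atomic_store_n( &proc->io_flag, true, __ATOMIC_SEQ_CST );   // raise local flag
if ( __atomic_load_n( &inst->revoked, __ATOMIC_SEQ_CST ) ) {
	delegate_to_arbiter( proc );                          // illustrative helper
} else {
	/* proceed: use the borrowed instance */
}
__atomic_store_n( &proc->io_flag, false, __ATOMIC_RELEASE );  // lower local flag

// Sketch: arbiter side, revoking the same instance.
__atomic_store_n( &inst->revoked, true, __ATOMIC_SEQ_CST );   // mark instance revoked
while ( __atomic_load_n( &proc->io_flag, __ATOMIC_ACQUIRE ) ) {}  // wait for flag to lower
/* reclaim the instance and potentially reassign it */
\end{cfa}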
    315366
     
    323374
    324375\paragraph{External Submissions} are handled by the arbiter by revoking the appropriate instance and adding the submission to the submission ring.
    325 There is no need to immediately revoke the instance however.
     376However, there is no need to immediately revoke the instance.
    326377External submissions must simply be added to the ring before the next system call, \ie, when the submission ring is flushed.
    327 This means that whoever is responsible for the system call first checks if the instance has any external submissions.
    328 If it is the case, it asks the arbiter to revoke the instance and add the external submissions to the ring.
    329 
    330 \paragraph{Pending Allocations} can be more complicated to handle.
    331 If the arbiter has available instances, the arbiter can attempt to directly hand over the instance and satisfy the request.
    332 Otherwise it must hold onto the list of threads until SQEs are made available again.
    333 This handling becomes that much more complex if pending allocation require more than one SQE, since the arbiter must make a decision between statisfying requests in FIFO ordering or satisfy requests for fewer SQEs first.
    334 
    335 While this arbiter has the potential to solve many of the problems mentionned in above, it also introduces a significant amount of complexity.
      378This means whoever is responsible for the system call first checks whether the instance has any external submissions.
     379If so, it asks the arbiter to revoke the instance and add the external submissions to the ring.
     380
     381\paragraph{Pending Allocations} are handled by the arbiter when it has available instances and can directly hand over the instance and satisfy the request.
     382Otherwise, it must hold onto the list of threads until SQEs are made available again.
     383This handling is more complex when an allocation requires multiple SQEs, since the arbiter must make a decision between satisfying requests in FIFO ordering or for fewer SQEs.
     384
     385While an arbiter has the potential to solve many of the problems mentioned above, it also introduces a significant amount of complexity.
    336386Tracking which processors are borrowing which instances and which instances have SQEs available ends-up adding a significant synchronization prelude to any I/O operation.
    337387Any submission must start with a handshake that pins the currently borrowed instance, if available.
    338388An attempt to allocate is then made, but the arbiter can concurrently be attempting to allocate from the same instance from a different \gls{hthrd}.
    339 Once the allocation is completed, the submission must still check that the instance is still burrowed before attempt to flush.
    340 These extra synchronization steps end-up having a similar cost to the multiple shared instances approach.
      389Once the allocation is completed, the submission must check that the instance is still borrowed before attempting to flush.
     390These synchronization steps turn out to have a similar cost to the multiple shared-instances approach.
     341391Furthermore, if the number of instances does not match the number of processors actively submitting I/O, the system can fall into a state where instances are constantly being revoked and end up cycling the processors, which leads to significant cache deterioration.
    342 Because of these reasons, this approach, which sounds promising on paper, does not improve on the private instance approach in practice.
     392For these reasons, this approach, which sounds promising on paper, does not improve on the private instance approach in practice.
    343393
    344394\subsubsection{Private Instances V2}
    345395
    346 
    347 
    348396% Verbs of this design
    349397
    350398% Allocation: obtaining an sqe from which to fill in the io request, enforces the io instance to use since it must be the one which provided the sqe. Must interact with the arbiter if the instance does not have enough sqe for the allocation. (Typical allocation will ask for only one sqe, but chained sqe must be allocated from the same context so chains of sqe must be allocated in bulks)
    351399
    352 % Submition: simply adds the sqe(s) to some data structure to communicate that they are ready to go. This operation can't fail because there are as many spots in the submit buffer than there are sqes. Must interact with the arbiter only if the thread was moved between the allocation and the submission.
     400% Submission: simply adds the sqe(s) to some data structure to communicate that they are ready to go. This operation can't fail because there are as many spots in the submit buffer than there are sqes. Must interact with the arbiter only if the thread was moved between the allocation and the submission.
    353401
    354402% Flushing: Taking all the sqes that were submitted and making them visible to the kernel, also counting them in order to figure out what to_submit should be. Must be thread-safe with submission. Has to interact with the Arbiter if there are external submissions. Can't simply use a protected queue because adding to the array is not safe if the ring is still available for submitters. Flushing must therefore: check if there are external pending requests if so, ask the arbiter to flush otherwise use the fast flush operation.
     
    357405
    358406% Handle: process all the produced cqe. No need to interact with any of the submission operations or the arbiter.
    359 
    360 
    361407
    362408
     
    404450
    405451\section{Interface}
    406 Finally, the last important part of the \io subsystem is it's interface. There are multiple approaches that can be offered to programmers, each with advantages and disadvantages. The new \io subsystem can replace the C runtime's API or extend it. And in the later case the interface can go from very similar to vastly different. The following sections discuss some useful options using @read@ as an example. The standard Linux interface for C is :
    407 
    408 @ssize_t read(int fd, void *buf, size_t count);@
     452The last important part of the \io subsystem is its interface.
     453There are multiple approaches that can be offered to programmers, each with advantages and disadvantages.
      454The new \io subsystem can replace the C runtime API or extend it, and in the latter case, the interface can go from very similar to vastly different.
     455The following sections discuss some useful options using @read@ as an example.
      456The standard Linux interface for C is:
     457\begin{cfa}
     458ssize_t read(int fd, void *buf, size_t count);
     459\end{cfa}
    409460
    410461\subsection{Replacement}
    411462Replacing the C \glsxtrshort{api} is the more intrusive and draconian approach.
    412463The goal is to convince the compiler and linker to replace any calls to @read@ to direct them to the \CFA implementation instead of glibc's.
    413 This has the advantage of potentially working transparently and supporting existing binaries without needing recompilation.
     464This rerouting has the advantage of working transparently and supporting existing binaries without needing recompilation.
     414465It also offers a presumably well-known and familiar API that C programmers can simply continue to work with.
    415 However, this approach also entails a plethora of subtle technical challenges which generally boils down to making a perfect replacement.
     466However, this approach also entails a plethora of subtle technical challenges, which generally boils down to making a perfect replacement.
    416467If the \CFA interface replaces only \emph{some} of the calls to glibc, then this can easily lead to esoteric concurrency bugs.
    417 Since the gcc ecosystems does not offer a scheme for such perfect replacement, this approach was rejected as being laudable but infeasible.
      468Since the gcc ecosystem does not offer a scheme for perfect replacement, this approach was rejected as being laudable but infeasible.
    418469
    419470\subsection{Synchronous Extension}
    420 An other interface option is to simply offer an interface that is different in name only. For example:
    421 
    422 @ssize_t cfa_read(int fd, void *buf, size_t count);@
    423 
    424 \noindent This is much more feasible but still familiar to C programmers.
    425 It comes with the caveat that any code attempting to use it must be recompiled, which can be a big problem considering the amount of existing legacy C binaries.
     471Another interface option is to offer an interface different in name only.
     472For example:
     473\begin{cfa}
     474ssize_t cfa_read(int fd, void *buf, size_t count);
     475\end{cfa}
     476This approach is feasible and still familiar to C programmers.
     477It comes with the caveat that any code attempting to use it must be recompiled, which is a problem considering the amount of existing legacy C binaries.
    426478However, it has the advantage of implementation simplicity.
      479Finally, there is a certain irony to using a blocking synchronous interface for a feature often referred to as ``non-blocking'' \io.
    427480
    428481\subsection{Asynchronous Extension}
    429 It is important to mention that there is a certain irony to using only synchronous, therefore blocking, interfaces for a feature often referred to as ``non-blocking'' \io.
    430 A fairly traditional way of doing this is using futures\cit{wikipedia futures}.
    431 As simple way of doing so is as follows:
    432 
    433 @future(ssize_t) read(int fd, void *buf, size_t count);@
    434 
    435 \noindent Note that this approach is not necessarily the most idiomatic usage of futures.
    436 The definition of read above ``returns'' the read content through an output parameter which cannot be synchronized on.
    437 A more classical asynchronous API could look more like:
    438 
    439 @future([ssize_t, void *]) read(int fd, size_t count);@
    440 
    441 \noindent However, this interface immediately introduces memory lifetime challenges since the call must effectively allocate a buffer to be returned.
    442 Because of the performance implications of this, the first approach is considered preferable as it is more familiar to C programmers.
    443 
    444 \subsection{Interface directly to \lstinline{io_uring}}
    445 Finally, an other interface that can be relevant is to simply expose directly the underlying \texttt{io\_uring} interface. For example:
    446 
    447 @array(SQE, want) cfa_io_allocate(int want);@
    448 
    449 @void cfa_io_submit( const array(SQE, have) & );@
    450 
    451 \noindent This offers more flexibility to users wanting to fully use all of the \texttt{io\_uring} features.
     482A fairly traditional way of providing asynchronous interactions is using a future mechanism~\cite{multilisp}, \eg:
     483\begin{cfa}
     484future(ssize_t) read(int fd, void *buf, size_t count);
     485\end{cfa}
     486where the generic @future@ is fulfilled when the read completes and it contains the number of bytes read, which may be less than the number of bytes requested.
     487The data read is placed in @buf@.
      488The problem is that the future covers only the bytes read, while both the bytes read and the data require synchronization.
     489Hence, the buffer cannot be reused until the operation completes but the synchronization does not cover the buffer.
     490A classical asynchronous API is:
     491\begin{cfa}
     492future([ssize_t, void *]) read(int fd, size_t count);
     493\end{cfa}
     494where the future tuple covers the components that require synchronization.
     495However, this interface immediately introduces memory lifetime challenges since the call must effectively allocate a buffer to be returned.
     496Because of the performance implications of this API, the first approach is considered preferable as it is more familiar to C programmers.
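A hypothetical use of the first form, assuming a @get@ operation that blocks until the future is fulfilled, is:
\begin{cfa}
char buf[1024];
future(ssize_t) f = read( fd, buf, sizeof(buf) );  // returns immediately
... // overlap other work; note, buf must not be touched yet
ssize_t ret = get( f );                            // park until the read completes
\end{cfa}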
     497
     498\subsection{Direct \lstinline{io_uring} Interface}
     499The last interface directly exposes the underlying @io_uring@ interface, \eg:
     500\begin{cfa}
     501array(SQE, want) cfa_io_allocate(int want);
     502void cfa_io_submit( const array(SQE, have) & );
     503\end{cfa}
     504where the generic @array@ contains an array of SQEs with a size that may be less than the request.
     505This offers more flexibility to users wanting to fully utilize all of the @io_uring@ features.
    452506However, it is not the most user-friendly option.
      507It obviously imposes a strong dependency between user code and @io_uring@, while at the same time restricting users to usages that are compatible with how \CFA internally uses @io_uring@.
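For example, a hypothetical chained read-then-write might look as follows, where @fill_read@ and @fill_write@ are illustrative helpers for filling SQEs:
\begin{cfa}
array(SQE, 2) sqes = cfa_io_allocate( 2 );      // chained SQEs allocated together
fill_read( sqes[0], fd, buf, count );           // illustrative helper
sqes[0].flags |= IOSQE_IO_LINK;                 // completion triggers the next SQE
fill_write( sqes[1], fd2, buf, count );         // illustrative helper
cfa_io_submit( sqes );                          // append to ring and notify kernel
\end{cfa}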
    454 
    455 
      507It obviously imposes a strong dependency between user code and @io_uring@, but at the same time restricts users to usages that are compatible with how \CFA internally uses @io_uring@.
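
As an illustration only, a manual @read@ through this interface might look as follows, where the SQE field names mirror the Linux @io_uring@ layout and are not part of any confirmed \CFA API:
\begin{cfa}
array(SQE, 1) sqes = cfa_io_allocate( 1 ); // reserve one submission slot
SQE & sqe = sqes[0];
sqe.opcode = IORING_OP_READ; // fill the entry exactly as for raw io_uring
sqe.fd = fd;
sqe.addr = (uint64_t)buf;
sqe.len = count;
cfa_io_submit( sqes ); // hand the filled entries back for submission
\end{cfa}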
  • doc/theses/thierry_delisle_PhD/thesis/text/practice.tex

    r9e23b446 rffec1bf  
    11\chapter{Scheduling in practice}\label{practice}
    2 The scheduling algorithm discribed in Chapter~\ref{core} addresses scheduling in a stable state.
    3 However, it does not address problems that occur when the system changes state.
     2The scheduling algorithm described in Chapter~\ref{core} addresses scheduling in a stable state.
     3This chapter addresses problems that occur when the system state changes.
    44Indeed, the \CFA runtime supports expanding and shrinking the number of \procs, both manually and, to some extent, automatically.
    5 This entails that the scheduling algorithm must support these transitions.
    6 
    7 More precise \CFA supports adding \procs using the RAII object @processor@.
    8 These objects can be created at any time and can be destroyed at any time.
    9 They are normally created as automatic stack variables, but this is not a requirement.
    10 
    11 The consequence is that the scheduler and \io subsystems must support \procs comming in and out of existence.
     5These changes affect the scheduling algorithm, which must dynamically alter its behaviour.
     6
     7In detail, \CFA supports adding \procs using the type @processor@, in both RAII and heap coding scenarios.
     8\begin{cfa}
     9{
     10        processor p[4]; // 4 new kernel threads
     11        ... // execute on 4 processors
     12        processor * dp = new( processor, 6 ); // 6 new kernel threads
     13        ... // execute on 10 processors
     14        delete( dp );   // delete 6 kernel threads
     15        ... // execute on 4 processors
     16} // delete 4 kernel threads
     17\end{cfa}
      18Dynamically allocated processors can be deleted at any time, \ie their lifetime may exceed the block in which they are created.
     19The consequence is that the scheduler and \io subsystems must know when these \procs come in and out of existence and roll them into the appropriate scheduling algorithms.
    1220
    1321\section{Manual Resizing}
    1422Manual resizing is expected to be a rare operation.
    15 Programmers are mostly expected to resize clusters on startup or teardown.
    16 Therefore dynamically changing the number of \procs is an appropriate moment to allocate or free resources to match the new state.
    17 As such all internal arrays that are sized based on the number of \procs need to be \texttt{realloc}ed.
    18 This also means that any references into these arrays, pointers or indexes, may need to be fixed when shrinking\footnote{Indexes may still need fixing when shrinkingbecause some indexes are expected to refer to dense contiguous resources and there is no guarantee the resource being removed has the highest index.}.
      23Programmers normally create/delete processors on a cluster at startup/teardown.
     24Therefore, dynamically changing the number of \procs is an appropriate moment to allocate or free resources to match the new state.
     25As such, all internal scheduling arrays that are sized based on the number of \procs need to be @realloc@ed.
     26This requirement also means any references into these arrays, \eg pointers or indexes, may need to be updated if elements are moved for compaction or for any other reason.
    1927
    2028There are no performance requirements, within reason, for resizing since it is expected to be rare.
    21 However, this operation has strict correctness requirements since shrinking and idle sleep can easily lead to deadlocks.
      29However, this operation has strict correctness requirements since the interaction between resizing and idle sleep can easily lead to deadlocks.
    2230It should also avoid, as much as possible, any effect on performance when the number of \procs remains constant.
    2331This latter requirement prohibits naive solutions, like simply adding a global lock to the ready-queue arrays.
    2432
    2533\subsection{Read-Copy-Update}
    26 One solution is to use the Read-Copy-Update\cite{wiki:rcu} pattern.
    27 In this pattern, resizing is done by creating a copy of the internal data strucures, updating the copy with the desired changes, and then attempt an Idiana Jones Switch to replace the original witht the copy.
    28 This approach potentially has the advantage that it may not need any synchronization to do the switch.
    29 However, there is a race where \procs could still use the previous, original, data structure after the copy was switched in.
    30 This race not only requires some added memory reclamation scheme, it also requires that operations made on the stale original version be eventually moved to the copy.
    31 
    32 For linked-lists, enqueing is only somewhat problematic, \ats enqueued to the original queues need to be transferred to the new, which might not preserve ordering.
    33 Dequeing is more challenging.
    34 Dequeing from the original will not necessarily update the copy which could lead to multiple \procs dequeing the same \at.
    35 Fixing this requires more synchronization or more indirection on the queues.
    36 
    37 Another challenge is that the original must be kept until all \procs have witnessed the change.
    38 This is a straight forward memory reclamation challenge but it does mean that every operation will need \emph{some} form of synchronization.
    39 If each of these operation does need synchronization then it is possible a simpler solution achieves the same performance.
    40 Because in addition to the classic challenge of memory reclamation, transferring the original data to the copy before reclaiming it poses additional challenges.
     34One solution is to use the Read-Copy-Update pattern~\cite{wiki:rcu}.
      35In this pattern, resizing is done by creating a copy of the internal data structures, \eg see Figure~\ref{fig:base-ts2}, updating the copy with the desired changes, and then attempting an Indiana Jones Switch to replace the original with the copy.
     36This approach has the advantage that it may not need any synchronization to do the switch.
     37However, there is a race where \procs still use the original data structure after the copy is switched.
     38This race not only requires adding a memory-reclamation scheme, it also requires that operations made on the stale original version are eventually moved to the copy.
     39
     40Specifically, the original data structure must be kept until all \procs have witnessed the change.
     41This requirement is the \newterm{memory reclamation challenge} and means every operation needs \emph{some} form of synchronization.
     42If all operations need synchronization, then the overall cost of this technique is likely to be similar to an uncontended lock approach.
     43In addition to the classic challenge of memory reclamation, transferring the original data to the copy before reclaiming it poses additional challenges.
    4144In particular, merging subqueues while having a minimal impact on fairness and locality is challenging.
    4245
    43 \subsection{Read-Writer Lock}
    44 A simpler approach would be to use a \newterm{Readers-Writer Lock}\cite{wiki:rwlock} where the resizing requires acquiring the lock as a writer while simply enqueing/dequeing \ats requires acquiring the lock as a reader.
      46For example, given a linked list, having a node enqueued onto both the original and the new list is not necessarily a problem, depending on the chosen list structure.
     47If the list supports arbitrary insertions, then inconsistencies in the tail pointer do not break the list;
     48however, ordering may not be preserved.
     49Furthermore, nodes enqueued to the original queues eventually need to be uniquely transferred to the new queues, which may further perturb ordering.
     50Dequeuing is more challenging when nodes appear on both lists because of pending reclamation: dequeuing a node from one list does not remove it from the other nor is that node in the same place on the other list.
     51This situation can lead to multiple \procs dequeuing the same \at.
     52Fixing these challenges requires more synchronization or more indirection to the queues, plus coordinated searching to ensure unique elements.
     53
     54\subsection{Readers-Writer Lock}
     55A simpler approach is to use a \newterm{Readers-Writer Lock}~\cite{wiki:rwlock}, where the resizing requires acquiring the lock as a writer while simply enqueueing/dequeuing \ats requires acquiring the lock as a reader.
    4556Using a Readers-Writer lock solves the problem of dynamically resizing and leaves the challenge of finding or building a lock with sufficiently good read-side performance.
    46 Since this is not a very complex challenge and an ad-hoc solution is perfectly acceptable, building a Readers-Writer lock was the path taken.
    47 
    48 To maximize reader scalability, the readers should not contend with eachother when attempting to acquire and release the critical sections.
    49 This effectively requires that each reader have its own piece of memory to mark as locked and unlocked.
    50 Reades then acquire the lock wait for writers to finish the critical section and then acquire their local spinlocks.
    51 Writers acquire the global lock, so writers have mutual exclusion among themselves, and then acquires each of the local reader locks.
    52 Acquiring all the local locks guarantees mutual exclusion between the readers and the writer, while the wait on the read side prevents readers from continously starving the writer.
    53 \todo{reference listings}
    54 
    55 \begin{lstlisting}
      57Since building such a lock is not a very complex challenge and an ad-hoc solution is perfectly acceptable, writing a custom Readers-Writer lock was the path taken.
     58
     59To maximize reader scalability, readers should not contend with each other when attempting to acquire and release a critical section.
      60Achieving this goal requires each reader to have its own memory to mark as locked and unlocked.
      61A read acquire possibly waits for a writer to finish the critical section and then acquires the reader's local spinlock.
      62A write acquire first takes the global lock, guaranteeing mutual exclusion among writers, and then acquires each of the local reader locks.
     63Acquiring all the local read locks guarantees mutual exclusion among the readers and the writer, while the wait on the read side prevents readers from continuously starving the writer.
     64
     65Figure~\ref{f:SpecializedReadersWriterLock} shows the outline for this specialized readers-writer lock.
      66The lock is non-blocking, so both readers and writers spin while the lock is held.
     67\todo{finish explanation}
     68
     69\begin{figure}
     70\begin{cfa}
    5671void read_lock() {
    5772        // Step 1 : make sure no writers in
    5873        while write_lock { Pause(); }
    59 
    60         // May need fence here
    61 
    6274        // Step 2 : acquire our local lock
    63         while atomic_xchg( tls.lock ) {
    64                 Pause();
    65         }
    66 }
    67 
     75        while atomic_xchg( tls.lock ) { Pause(); }
     76}
    6877void read_unlock() {
    6978        tls.lock = false;
    7079}
    71 \end{lstlisting}
    72 
    73 \begin{lstlisting}
    7480void write_lock()  {
    7581        // Step 1 : lock global lock
    76         while atomic_xchg( write_lock ) {
    77                 Pause();
    78         }
    79 
     82        while atomic_xchg( write_lock ) { Pause(); }
    8083        // Step 2 : lock per-proc locks
    8184        for t in all_tls {
    82                 while atomic_xchg( t.lock ) {
    83                         Pause();
    84                 }
     85                while atomic_xchg( t.lock ) { Pause(); }
    8586        }
    8687}
    87 
    8888void write_unlock() {
    8989        // Step 1 : release local locks
    90         for t in all_tls {
    91                 t.lock = false;
    92         }
    93 
     90        for t in all_tls { t.lock = false; }
    9491        // Step 2 : release global lock
    9592        write_lock = false;
    9693}
    97 \end{lstlisting}
    98 
    99 \section{Idle-Sleep}
    100 In addition to users manually changing the number of \procs, it is desireable to support ``removing'' \procs when there is not enough \ats for all the \procs to be useful.
    101 While manual resizing is expected to be rare, the number of \ats is expected to vary much more which means \procs may need to be ``removed'' for only short periods of time.
    102 Furthermore, race conditions that spuriously lead to the impression that no \ats are ready are actually common in practice.
    103 Therefore resources associated with \procs should not be freed but \procs simply put into an idle state where the \gls{kthrd} is blocked until more \ats become ready.
    104 This state is referred to as \newterm{Idle-Sleep}.
     94\end{cfa}
     95\caption{Specialized Readers-Writer Lock}
     96\label{f:SpecializedReadersWriterLock}
     97\end{figure}
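
As a usage sketch (the operation names are illustrative), scheduling operations bracket ready-queue accesses with the read side, while resizing uses the write side:
\begin{cfa}
void ready_push( cluster & cl, thread$ * thrd ) {
	read_lock();
	... // enqueue thrd onto one of cl's ready queues
	read_unlock();
}
void grow( cluster & cl, unsigned count ) {
	write_lock();
	... // realloc the per-proc arrays and register the new procs
	write_unlock();
}
\end{cfa}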
     98
     99\section{Idle-Sleep}\label{idlesleep}
     100While manual resizing of \procs is expected to be rare, the number of \ats can vary significantly over an application's lifetime, which means there are times when there are too few or too many \procs.
      101For this work, it is the programmer's responsibility to manually create \procs, so if there are too few \procs, the application must address this issue.
     102This leaves too many \procs when there are not enough \ats for all the \procs to be useful.
     103These idle \procs cannot be removed because their lifetime is controlled by the application, and only the application knows when the number of \ats may increase or decrease.
     104While idle \procs can spin until work appears, this approach wastes energy, unnecessarily produces heat and prevents other applications from using the processor.
     105Therefore, idle \procs are put into an idle state, called \newterm{Idle-Sleep}, where the \gls{kthrd} is blocked until the scheduler deems it is needed.
    105106
    106107Idle sleep effectively encompasses several challenges.
    107 First some data structure needs to keep track of all \procs that are in idle sleep.
    108 Because of idle sleep can be spurious, this data structure has strict performance requirements in addition to the strict correctness requirements.
    109 Next, some tool must be used to block kernel threads \glspl{kthrd}, \eg \texttt{pthread\_cond\_wait}, pthread semaphores.
    110 The complexity here is to support \at parking and unparking, timers, \io operations and all other \CFA features with minimal complexity.
    111 Finally, idle sleep also includes a heuristic to determine the appropriate number of \procs to be in idle sleep an any given time.
    112 This third challenge is however outside the scope of this thesis because developping a general heuristic is involved enough to justify its own work.
    113 The \CFA scheduler simply follows the ``Race-to-Idle'\cit{https://doi.org/10.1137/1.9781611973099.100}' approach where a sleeping \proc is woken any time an \at becomes ready and \procs go to idle sleep anytime they run out of work.
     108First, a data structure needs to keep track of all \procs that are in idle sleep.
     109Because idle sleep is spurious, this data structure has strict performance requirements, in addition to strict correctness requirements.
      110Next, some mechanism is needed to block \glspl{kthrd}, \eg @pthread_cond_wait@ on a pthread condition variable.
      111The challenge here is to support \at parking and unparking, user-level locking, timers, \io operations, and all other \CFA features with minimal complexity.
     112Finally, the scheduler needs a heuristic to determine when to block and unblock an appropriate number of \procs.
     113However, this third challenge is outside the scope of this thesis because developing a general heuristic is complex enough to justify its own work.
     114Therefore, the \CFA scheduler simply follows the ``Race-to-Idle''~\cite{Albers12} approach where a sleeping \proc is woken any time a \at becomes ready and \procs go to idle sleep anytime they run out of work.
     115An interesting sub-part of this heuristic is what to do with bursts of \ats that become ready.
     116Since waking up a sleeping \proc can have notable latency, it is possible multiple \ats become ready while a single \proc is waking up.
      117This fact begs the question: if many \procs are available, how many should be woken?
      118If the ready \ats run longer than the wake-up latency, waking one \proc per \at offers maximum parallelisation.
      119If the ready \ats run for a very short time, waking many \procs may be wasteful.
      120As mentioned, a heuristic to handle these complex cases is outside the scope of this thesis, so the behaviour of the scheduler in this particular case is left unspecified.
    114121
    115122\section{Sleeping}
    116123As usual, the corner-stone of any feature related to the kernel is the choice of system call.
    117 In terms of blocking a \gls{kthrd} until some event occurs the linux kernel has many available options:
    118 
    119 \paragraph{\texttt{pthread\_mutex}/\texttt{pthread\_cond}}
    120 The most classic option is to use some combination of \texttt{pthread\_mutex} and \texttt{pthread\_cond}.
    121 These serve as straight forward mutual exclusion and synchronization tools and allow a \gls{kthrd} to wait on a \texttt{pthread\_cond} until signalled.
    122 While this approach is generally perfectly appropriate for \glspl{kthrd} waiting after eachother, \io operations do not signal \texttt{pthread\_cond}s.
    123 For \io results to wake a \proc waiting on a \texttt{pthread\_cond} means that a different \glspl{kthrd} must be woken up first, and then the \proc can be signalled.
    124 
    125 \subsection{\texttt{io\_uring} and Epoll}
    126 An alternative is to flip the problem on its head and block waiting for \io, using \texttt{io\_uring} or even \texttt{epoll}.
    127 This creates the inverse situation, where \io operations directly wake sleeping \procs but waking \proc from a running \gls{kthrd} must use an indirect scheme.
    128 This generally takes the form of creating a file descriptor, \eg, a dummy file, a pipe or an event fd, and using that file descriptor when \procs need to wake eachother.
    129 This leads to additional complexity because there can be a race between these artificial \io operations and genuine \io operations.
    130 If not handled correctly, this can lead to the artificial files going out of sync.
     124In terms of blocking a \gls{kthrd} until some event occurs, the Linux kernel has many available options.
     125
     126\subsection{\lstinline{pthread_mutex}/\lstinline{pthread_cond}}
     127The classic option is to use some combination of the pthread mutual exclusion and synchronization locks, allowing a safe park/unpark of a \gls{kthrd} to/from a @pthread_cond@.
     128While this approach works for \glspl{kthrd} waiting among themselves, \io operations do not provide a mechanism to signal @pthread_cond@s.
      129For \io results to wake a \proc waiting on a @pthread_cond@, a different \gls{kthrd} must be woken up first, which then signals the \proc.
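
For completeness, a minimal park/unpark sketch over these primitives (the surrounding runtime state is elided):
\begin{cfa}
#include <pthread.h>
#include <stdbool.h>
pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
bool ready = false;
void park() { // called by a proc with no work
	pthread_mutex_lock( &mtx );
	while ( ! ready ) pthread_cond_wait( &cond, &mtx ); // tolerates spurious wake-ups
	ready = false;
	pthread_mutex_unlock( &mtx );
}
void unpark() { // called by a notifier
	pthread_mutex_lock( &mtx );
	ready = true;
	pthread_cond_signal( &cond );
	pthread_mutex_unlock( &mtx );
}
\end{cfa}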
     130
     131\subsection{\lstinline{io_uring} and Epoll}
     132An alternative is to flip the problem on its head and block waiting for \io, using @io_uring@ or @epoll@.
     133This creates the inverse situation, where \io operations directly wake sleeping \procs but waking blocked \procs must use an indirect scheme.
     134This generally takes the form of creating a file descriptor, \eg, dummy file, pipe, or event fd, and using that file descriptor when \procs need to wake each other.
     135This leads to additional complexity because there can be a race between these artificial \io and genuine \io operations.
     136If not handled correctly, this can lead to artificial files getting delayed too long behind genuine files, resulting in longer latency.
    131137
    132138\subsection{Event FDs}
    133139Another interesting approach is to use an event file descriptor\cit{eventfd}.
    134 This is a Linux feature that is a file descriptor that behaves like \io, \ie, uses \texttt{read} and \texttt{write}, but also behaves like a semaphore.
    135 Indeed, all read and writes must use 64bits large values\footnote{On 64-bit Linux, a 32-bit Linux would use 32 bits values.}.
    136 Writes add their values to the buffer, that is arithmetic addition and not buffer append, and reads zero out the buffer and return the buffer values so far\footnote{This is without the \texttt{EFD\_SEMAPHORE} flag. This flags changes the behavior of \texttt{read} but is not needed for this work.}.
     140This Linux feature is a file descriptor that behaves like \io, \ie, uses @read@ and @write@, but also behaves like a semaphore.
      141Indeed, all reads and writes must use word-sized values, \ie 64 or 32 bits.
     142Writes \emph{add} their values to a buffer using arithmetic addition versus buffer append, and reads zero out the buffer and return the buffer values so far.\footnote{
     143This behaviour is without the \lstinline{EFD_SEMAPHORE} flag, which changes the behaviour of \lstinline{read} but is not needed for this work.}
    137144If a read is made while the buffer is already 0, the read blocks until a non-0 value is added.
    138 What makes this feature particularly interesting is that \texttt{io\_uring} supports the \texttt{IORING\_REGISTER\_EVENTFD} command, to register an event fd to a particular instance.
    139 Once that instance is registered, any \io completion will result in \texttt{io\_uring} writing to the event FD.
    140 This means that a \proc waiting on the event FD can be \emph{directly} woken up by either other \procs or incomming \io.
     145What makes this feature particularly interesting is that @io_uring@ supports the @IORING_REGISTER_EVENTFD@ command to register an event @fd@ to a particular instance.
     146Once that instance is registered, any \io completion results in @io_uring@ writing to the event @fd@.
     147This means that a \proc waiting on the event @fd@ can be \emph{directly} woken up by either other \procs or incoming \io.
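
A sketch of the underlying system calls (error handling elided; @ring_fd@ is assumed to be an existing @io_uring@ instance):
\begin{cfa}
#include <stdint.h>
#include <unistd.h>
#include <sys/eventfd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>
int efd = eventfd( 0, 0 ); // counting, semaphore-like file descriptor
// register the event fd with the io_uring instance
syscall( __NR_io_uring_register, ring_fd, IORING_REGISTER_EVENTFD, &efd, 1 );
// idle proc: block until io_uring completes an operation or another proc writes
uint64_t val;
read( efd, &val, sizeof(val) ); // blocks while the count is 0, then zeroes it
// notifier: wake the proc blocked on efd
uint64_t one = 1;
write( efd, &one, sizeof(one) ); // adds 1 to the count, unblocking the reader
\end{cfa}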
     148
     149\section{Tracking Sleepers}
     150Tracking which \procs are in idle sleep requires a data structure holding all the sleeping \procs, but more importantly it requires a concurrent \emph{handshake} so that no \at is stranded on a ready-queue with no active \proc.
     151The classic challenge occurs when a \at is made ready while a \proc is going to sleep: there is a race where the new \at may not see the sleeping \proc and the sleeping \proc may not see the ready \at.
     152Since \ats can be made ready by timers, \io operations, or other events outside a cluster, this race can occur even if the \proc going to sleep is the only \proc awake.
     153As a result, improper handling of this race leads to all \procs going to sleep when there are ready \ats and the system deadlocks.
     154
     155The handshake closing the race is done with both the notifier and the idle \proc executing two ordered steps.
      156The notifier first makes sure the newly ready \at is visible to \procs searching for \ats, and then attempts to notify an idle \proc.
     157On the other side, \procs make themselves visible as idle \procs and then search for any \ats they may have missed.
      158Unlike regular work-stealing, this search must be exhaustive to make sure that no pre-existing \at is missed.
     159These steps from both sides guarantee that if the search misses a newly ready \at, then the notifier is guaranteed to see at least one idle \proc.
      160Conversely, if the notifier does not see any idle \proc, then a \proc is guaranteed to find the new \at in its exhaustive search.
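
The following pseudocode sketches the two ordered steps on each side (memory fences and data-structure details elided; all names are illustrative):
\begin{cfa}
void notify( cluster & cl, thread$ * at ) { // notifier side
	push( cl.ready_queue, at ); // step 1: make the at visible
	if ( ! empty( cl.idle_list ) ) wake_one( cl ); // step 2: notify an idle proc
}
void go_idle( cluster & cl, processor & p ) { // idle proc side
	push( cl.idle_list, p ); // step 1: become visible as idle
	if ( search_all_queues( cl ) ) { // step 2: exhaustive search
		remove( cl.idle_list, p ); // found work, cancel the sleep
	} else {
		sleep( p ); // safe: any at enqueued afterwards sees this proc
	}
}
\end{cfa}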
     161
     162Furthermore, the ``Race-to-Idle'' approach means that there may be contention on the data structure tracking sleepers.
     163Contention can be tolerated for \procs attempting to sleep or wake-up because these \procs are not doing useful work, and therefore, not contributing to overall performance.
      164However, notifying, \ie checking if a \proc must be woken up and doing so if needed, can significantly affect overall performance and must be low cost.
     165
     166\subsection{Sleepers List}
     167Each cluster maintains a list of idle \procs, organized as a stack.
      168This ordering allows \procs at the tail to stay in idle sleep for extended periods of time while those at the head of the list wake up for bursts of activity.
      169Because of the unbalanced performance requirements, the algorithm tracking sleepers is designed to have idle \procs handle as much of the work as possible.
      170The idle \procs maintain the stack of sleepers among themselves, and notifying a sleeping \proc takes as little work as possible.
     171This approach means that maintaining the list is fairly straightforward.
     172The list can simply use a single lock per cluster and only \procs that are getting in and out of the idle state contend for that lock.
     173
     174This approach also simplifies notification.
      175Indeed, \procs not only need to be notified when a new \at is readied, but also must be notified during manual resizing, so the \gls{kthrd} can be joined.
     176These requirements mean whichever entity removes idle \procs from the sleeper list must be able to do so in any order.
     177Using a simple lock over this data structure makes the removal much simpler than using a lock-free data structure.
      178The single lock also means the notification process simply needs to wake up the desired idle \proc, using @pthread_cond_signal@, @write@ on an @fd@, \etc, and the \proc handles the rest.
     179
     180\subsection{Reducing Latency}
     181As mentioned in this section, \procs going to sleep for extremely short periods of time is likely in certain scenarios.
      182Therefore, the latency of doing a system call to read from and write to an event @fd@ can negatively affect overall performance in a notable way.
     183Hence, it is important to reduce latency and contention of the notification as much as possible.
     184Figure~\ref{fig:idle1} shows the basic idle-sleep data structure.
     185For the notifiers, this data structure can cause contention on the lock and the event @fd@ syscall can cause notable latency.
    141186
    142187\begin{figure}
     
    144189        \input{idle1.pstex_t}
    145190        \caption[Basic Idle Sleep Data Structure]{Basic Idle Sleep Data Structure \smallskip\newline Each idle \proc is put onto a doubly-linked stack protected by a lock.
    146         Each \proc has a private event FD.}
     191        Each \proc has a private event \lstinline{fd}.}
    147192        \label{fig:idle1}
    148193\end{figure}
    149194
    150 
    151 \section{Tracking Sleepers}
    152 Tracking which \procs are in idle sleep requires a data structure holding all the sleeping \procs, but more importantly it requires a concurrent \emph{handshake} so that no \at is stranded on a ready-queue with no active \proc.
    153 The classic challenge is when a \at is made ready while a \proc is going to sleep, there is a race where the new \at may not see the sleeping \proc and the sleeping \proc may not see the ready \at.
    154 Since \ats can be made ready by timers, \io operations or other events outside a clusre, this race can occur even if the \proc going to sleep is the only \proc awake.
    155 As a result, improper handling of this race can lead to all \procs going to sleep and the system deadlocking.
    156 
    157 Furthermore, the ``Race-to-Idle'' approach means that there may be contention on the data structure tracking sleepers.
    158 Contention slowing down \procs attempting to sleep or wake-up can be tolerated.
    159 These \procs are not doing useful work and therefore not contributing to overall performance.
    160 However, notifying, checking if a \proc must be woken-up and doing so if needed, can significantly affect overall performance and must be low cost.
    161 
    162 \subsection{Sleepers List}
    163 Each cluster maintains a list of idle \procs, organized as a stack.
    164 This ordering hopefully allows \proc at the tail to stay in idle sleep for extended period of times.
    165 Because of these unbalanced performance requirements, the algorithm tracking sleepers is designed to have idle \proc handle as much of the work as possible.
    166 The idle \procs maintain the of sleepers among themselves and notifying a sleeping \proc takes as little work as possible.
    167 This approach means that maintaining the list is fairly straightforward.
    168 The list can simply use a single lock per cluster and only \procs that are getting in and out of idle state will contend for that lock.
    169 
    170 This approach also simplifies notification.
    171 Indeed, \procs need to be notify when a new \at is readied, but they also must be notified during resizing, so the \gls{kthrd} can be joined.
    172 This means that whichever entity removes idle \procs from the sleeper list must be able to do so in any order.
    173 Using a simple lock over this data structure makes the removal much simpler than using a lock-free data structure.
    174 The notification process then simply needs to wake-up the desired idle \proc, using \texttt{pthread\_cond\_signal}, \texttt{write} on an fd, etc., and the \proc will handle the rest.
    175 
    176 \subsection{Reducing Latency}
    177 As mentioned in this section, \procs going idle for extremely short periods of time is likely in certain common scenarios.
    178 Therefore, the latency of doing a system call to read from and writing to the event fd can actually negatively affect overall performance in a notable way.
    179 Is it important to reduce latency and contention of the notification as much as possible.
    180 Figure~\ref{fig:idle1} shoes the basic idle sleep data structure.
    181 For the notifiers, this data structure can cause contention on the lock and the event fd syscall can cause notable latency.
    182 
    183 \begin{figure}
     195Contention occurs because the idle-list lock must be held to access the idle list, \eg by \procs attempting to go to sleep, \procs waking, or notification attempts.
     196The contention from the \procs attempting to go to sleep can be mitigated slightly by using @try_acquire@, so the \procs simply busy wait again searching for \ats if the lock is held.
     197This trick cannot be used when waking \procs since the waker needs to return immediately to what it was doing.
     198
     199Interestingly, general notification, \ie waking any idle processor versus a specific one, does not strictly require modifying the list.
     200Here, contention can be reduced notably by having notifiers avoid the lock entirely by adding a pointer to the event @fd@ of the first idle \proc, as in Figure~\ref{fig:idle2}.
     201To avoid contention among notifiers, notifiers atomically exchange the pointer with @NULL@.
     202The first notifier succeeds on the exchange and obtains the @fd@ of an idle \proc;
     203hence, only one notifier contends on the system call.
     204This notifier writes to the @fd@ to wake a \proc.
      205The woken \proc then updates the atomic pointer to the new head of the list as it removes itself from the list.
      206Notifiers that obtain a @NULL@ in the exchange simply move on, knowing that another notifier is already waking a \proc.
      207This behaviour is equivalent to having multiple notifiers write to the @fd@ since reads consume all previous writes.
      208Note that with or without this atomic pointer, bursts of notification can lead to an unspecified number of \procs being woken up, depending on how the arrival rate of notifications compares with the latency of \procs waking up.
     209As mentioned in section~\ref{idlesleep}, there is no optimal approach to handle these bursts.
     210It is therefore difficult to justify the cost of any extra synchronization here.
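
A sketch of the notifier fast path under this scheme (field and operation names are illustrative):
\begin{cfa}
void wake_one( cluster & cl ) {
	// atomically claim the cached event fd of the head idle proc, if any
	int * fdp = __atomic_exchange_n( &cl.idle_fd, 0p, __ATOMIC_SEQ_CST );
	if ( fdp == 0p ) return; // another notifier is already waking a proc
	uint64_t one = 1;
	write( *fdp, &one, sizeof(one) ); // wake the head idle proc
	// the woken proc republishes the pointer as it removes itself from the list
}
\end{cfa}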
     211
     212\begin{figure}[t]
    184213        \centering
    185214        \input{idle2.pstex_t}
    186         \caption[Improved Idle Sleep Data Structure]{Improved Idle Sleep Data Structure \smallskip\newline An atomic pointer is added to the list, pointing to the Event FD of the first \proc on the list.}
     215        \caption[Improved Idle-Sleep Data Structure]{Improved Idle-Sleep Data Structure \smallskip\newline An atomic pointer is added to the list pointing to the Event FD of the first \proc on the list.}
    187216        \label{fig:idle2}
    188217\end{figure}
    189218
    190 The contention is mostly due to the lock on the list needing to be held to get to the head \proc.
    191 That lock can be contended by \procs attempting to go to sleep, \procs waking or notification attempts.
    192 The contentention from the \procs attempting to go to sleep can be mitigated slightly by using \texttt{try\_acquire} instead, so the \procs simply continue searching for \ats if the lock is held.
    193 This trick cannot be used for waking \procs since they are not in a state where they can run \ats.
    194 However, it is worth nothing that notification does not strictly require accessing the list or the head \proc.
    195 Therefore, contention can be reduced notably by having notifiers avoid the lock entirely and adding a pointer to the event fd of the first idle \proc, as in Figure~\ref{fig:idle2}.
    196 To avoid contention between the notifiers, instead of simply reading the atomic pointer, notifiers atomically exchange it to \texttt{null} so only only notifier will contend on the system call.
     219The next optimization is to avoid the latency of the event @fd@, which can be done by adding what is effectively a binary benaphore\cit{benaphore} in front of the event @fd@.
      220The benaphore over the event @fd@ logically provides a three-state flag to avoid unnecessary system calls, where the states are expressed explicitly in Figure~\ref{fig:idle:state}.
     221A \proc begins its idle sleep by adding itself to the idle list before searching for an \at.
     222In the process of adding itself to the idle list, it sets the state flag to @SEARCH@.
     223If no \ats can be found during the search, the \proc then confirms it is going to sleep by atomically swapping the state to @SLEEP@.
     224If the previous state is still @SEARCH@, then the \proc does read the event @fd@.
      226Meanwhile, notifiers atomically exchange the state to @AWAKE@.
     226If the previous state is @SLEEP@, then the notifier must write to the event @fd@.
     227However, if the notify arrives almost immediately after the \proc marks itself idle, then both reads and writes on the event @fd@ can be omitted, which reduces latency notably.
      228These extensions lead to the final data structure shown in Figure~\ref{fig:idle}.
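
The resulting protocol can be sketched as follows (structure and field names are illustrative):
\begin{cfa}
enum { SEARCH, SLEEP, AWAKE };
void confirm_sleep( processor & p ) {
	// p.state was set to SEARCH while p added itself to the idle list
	if ( __atomic_exchange_n( &p.state, SLEEP, __ATOMIC_SEQ_CST ) == SEARCH ) {
		uint64_t val;
		read( p.efd, &val, sizeof(val) ); // no notifier intervened, block
	} // otherwise a notifier already swapped in AWAKE, so skip the read
}
void notify( processor & p ) {
	if ( __atomic_exchange_n( &p.state, AWAKE, __ATOMIC_SEQ_CST ) == SLEEP ) {
		uint64_t one = 1;
		write( p.efd, &one, sizeof(one) ); // the proc is blocked, wake it
	} // otherwise the proc has not confirmed the sleep and skips its own read
}
\end{cfa}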
    197229
    198230\begin{figure}
    199231        \centering
    200232        \input{idle_state.pstex_t}
    201         \caption[Improved Idle Sleep Data Structure]{Improved Idle Sleep Data Structure \smallskip\newline An atomic pointer is added to the list, pointing to the Event FD of the first \proc on the list.}
     233        \caption[Improved Idle-Sleep Latency]{Improved Idle-Sleep Latency \smallskip\newline A three state flag is added to the event \lstinline{fd}.}
    202234        \label{fig:idle:state}
    203235\end{figure}
    204 
    205 The next optimization that can be done is to avoid the latency of the event fd when possible.
    206 This can be done by adding what is effectively a benaphore\cit{benaphore} in front of the event fd.
    207 A simple three state flag is added beside the event fd to avoid unnecessary system calls, as shown in Figure~\ref{fig:idle:state}.
    208 The flag starts in state \texttt{SEARCH}, while the \proc is searching for \ats to run.
    209 The \proc then confirms the sleep by atomically swaping the state to \texttt{SLEEP}.
    210 If the previous state was still \texttt{SEARCH}, then the \proc does read the event fd.
    211 Meanwhile, notifiers atomically exchange the state to \texttt{AWAKE} state.
    212 if the previous state was \texttt{SLEEP}, then the notifier must write to the event fd.
    213 However, if the notify arrives almost immediately after the \proc marks itself idle, then both reads and writes on the event fd can be omitted, which reduces latency notably.
    214 This leads to the final data structure shown in Figure~\ref{fig:idle}.
    215236
    216237\begin{figure}
     
    218239        \input{idle.pstex_t}
    219240        \caption[Low-latency Idle Sleep Data Structure]{Low-latency Idle Sleep Data Structure \smallskip\newline Each idle \proc is put onto a doubly-linked stack protected by a lock.
    220         Each \proc has a private event FD with a benaphore in front of it.
    221         The list also has an atomic pointer to the event fd and benaphore of the first \proc on the list.}
     241        Each \proc has a private event \lstinline{fd} with a benaphore in front of it.
     242        The list also has an atomic pointer to the event \lstinline{fd} and benaphore of the first \proc on the list.}
    222243        \label{fig:idle}
    223244\end{figure}
  • doc/theses/thierry_delisle_PhD/thesis/text/runtime.tex

    r9e23b446 rffec1bf  
    22This chapter presents an overview of the capabilities of the \CFA runtime prior to this thesis work.
    33
    4 \Celeven introduced threading features, such the @_Thread_local@ storage class, and libraries @stdatomic.h@ and @threads.h@. Interestingly, almost a decade after the \Celeven standard, the most recent versions of gcc, clang, and msvc do not support the \Celeven include @threads.h@, indicating no interest in the C11 concurrency approach (possibly because of the recent effort to add concurrency to \CC). While the \Celeven standard does not state a threading model, the historical association with pthreads suggests implementations would adopt kernel-level threading (1:1)~\cite{ThreadModel}, as for \CC. This model uses \glspl{kthrd} to achieve parallelism and concurrency. In this model, every thread of computation maps to an object in the kernel. The kernel then has the responsibility of managing these threads, \eg creating, scheduling, blocking. This also entails that the kernel has a perfect view of every thread executing in the system\footnote{This is not completely true due to primitives like \lstinline|futex|es, which have a significant portion of their logic in user space.}.
     4\section{C Threading}
     5
      6\Celeven introduced threading features, such as the @_Thread_local@ storage class, and libraries @stdatomic.h@ and @threads.h@.
     7Interestingly, almost a decade after the \Celeven standard, the most recent versions of gcc, clang, and msvc do not support the \Celeven include @threads.h@, indicating no interest in the C11 concurrency approach (possibly because of the recent effort to add concurrency to \CC).
     8While the \Celeven standard does not state a threading model, the historical association with pthreads suggests implementations would adopt kernel-level threading (1:1)~\cite{ThreadModel}, as for \CC.
     9This model uses \glspl{kthrd} to achieve parallelism and concurrency. In this model, every thread of computation maps to an object in the kernel.
     10The kernel then has the responsibility of managing these threads, \eg creating, scheduling, blocking.
     11A consequence of this approach is that the kernel has a perfect view of every thread executing in the system\footnote{This is not completely true due to primitives like \lstinline|futex|es, which have a significant portion of their logic in user space.}.
    512
    613\section{M:N Threading}\label{prev:model}
     
    815Threading in \CFA is based on \Gls{uthrding}, where \glspl{thrd} are the representation of a unit of work. As such, \CFA programmers should expect these units to be fairly inexpensive, \ie programmers should be able to create a large number of \glspl{thrd} and switch among \glspl{thrd} liberally without many concerns for performance.
    916
    10 The \CFA M:N threading models is implemented using many user-level threads mapped onto fewer \glspl{kthrd}. The user-level threads have the same semantic meaning as a \glspl{kthrd} in the 1:1 model: they represent an independent thread of execution with its own stack. The difference is that user-level threads do not have a corresponding object in the kernel, they are handled by the runtime in user space and scheduled onto \glspl{kthrd}, referred to as \glspl{proc} in this document. \Glspl{proc} run a \gls{thrd} until it context switches out, it then chooses a different \gls{thrd} to run.
      17The \CFA M:N threading model is implemented using many user-level threads mapped onto fewer \glspl{kthrd}.
      18The user-level threads have the same semantic meaning as \glspl{kthrd} in the 1:1 model: they represent an independent thread of execution with its own stack.
      19The difference is that user-level threads do not have a corresponding object in the kernel; they are handled by the runtime in user space and scheduled onto \glspl{kthrd}, referred to as \glspl{proc} in this document. A \gls{proc} runs a \gls{thrd} until it context switches out, and then chooses a different \gls{thrd} to run.
    1120
    1221\section{Clusters}
    13 \CFA allows the option to group user-level threading, in the form of clusters. Both \glspl{thrd} and \glspl{proc} belong to a specific cluster. \Glspl{thrd} are only scheduled onto \glspl{proc} in the same cluster and scheduling is done independently of other clusters. Figure~\ref{fig:system} shows an overview of the \CFA runtime, which allows programmers to tightly control parallelism. It also opens the door to handling effects like NUMA, by pining clusters to a specific NUMA node\footnote{This is not currently implemented in \CFA, but the only hurdle left is creating a generic interface for cpu masks.}.
      22\CFA allows user-level threading to be grouped into clusters.
     23Both \glspl{thrd} and \glspl{proc} belong to a specific cluster.
     24\Glspl{thrd} are only scheduled onto \glspl{proc} in the same cluster and scheduling is done independently of other clusters.
     25Figure~\ref{fig:system} shows an overview of the \CFA runtime, which allows programmers to tightly control parallelism.
     26It also opens the door to handling effects like NUMA, by pinning clusters to a specific NUMA node\footnote{This capability is not currently implemented in \CFA, but the only hurdle left is creating a generic interface for CPU masks.}.
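
A sketch of this grouping (the constructor signatures are illustrative and may differ from the current \CFA API):
\begin{cfa}
cluster cl; // separate scheduling domain
processor p1 = { cl }, p2 = { cl }; // two kernel threads serving cl
... // threads created on cl are scheduled only onto p1 and p2
\end{cfa}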
    1427
    1528\begin{figure}
     
    1730                \input{system.pstex_t}
    1831        \end{center}
    19         \caption[Overview of the \CFA runtime]{Overview of the \CFA runtime \newline \Glspl{thrd} are scheduled inside a particular cluster, where it only runs on the \glspl{proc} which belong to the cluster. The discrete-event manager, which handles preemption and timeout, is a \gls{kthrd} which lives outside any cluster and does not run \glspl{thrd}.}
     32        \caption[Overview of the \CFA runtime]{Overview of the \CFA runtime \newline \Glspl{thrd} are scheduled inside a particular cluster and run on the \glspl{proc} that belong to the cluster. The discrete-event manager, which handles preemption and timeout, is a \gls{proc} that lives outside any cluster and does not run \glspl{thrd}.}
    2033        \label{fig:system}
    2134\end{figure}
     
    2841
    2942\begin{quote}
    30 Given a simple network program with 2 \glspl{thrd} and a single \gls{proc}, one \gls{thrd} sends network requests to a server and the other \gls{thrd} waits for a response from the server. If the second \gls{thrd} races ahead, it may wait for responses to requests that have not been sent yet. In theory, this should not be a problem, even if the second \gls{thrd} waits, because the first \gls{thrd} is still ready to run and should be able to get CPU time to send the request. With M:N threading, while the first \gls{thrd} is ready, the lone \gls{proc} \emph{cannot} run the first \gls{thrd} if it is blocked in the \glsxtrshort{io} operation of the second \gls{thrd}. If this happen, the system is in a synchronization deadlock\footnote{In this example, the deadlocked could be resolved if the server sends unprompted messages to the client. However, this solution is not general and may not be appropriate even in this simple case.}.
     43Given a simple network program with 2 \glspl{thrd} and a single \gls{proc}, one \gls{thrd} sends network requests to a server and the other \gls{thrd} waits for a response from the server.
     44If the second \gls{thrd} races ahead, it may wait for responses to requests that have not been sent yet.
     45In theory, this should not be a problem, even if the second \gls{thrd} waits, because the first \gls{thrd} is still ready to run and should be able to get CPU time to send the request.
     46With M:N threading, while the first \gls{thrd} is ready, the lone \gls{proc} \emph{cannot} run the first \gls{thrd} if it is blocked in the \glsxtrshort{io} operation of the second \gls{thrd}.
      47If this happens, the system is in a synchronization deadlock\footnote{In this example, the deadlock could be resolved if the server sends unprompted messages to the client.
     48However, this solution is neither general nor appropriate even in this simple case.}.
    3149\end{quote}
    3250
    33 Therefore, one of the objective of this work is to introduce \emph{User-Level \glsxtrshort{io}}, like \glslink{uthrding}{User-Level \emph{Threading}} blocks \glspl{thrd} rather than \glspl{proc} when doing \glsxtrshort{io} operations, which entails multiplexing the \glsxtrshort{io} operations of many \glspl{thrd} onto fewer \glspl{proc}. This multiplexing requires that a single \gls{proc} be able to execute multiple \glsxtrshort{io} operations in parallel. This requirement cannot be done with operations that block \glspl{proc}, \ie \glspl{kthrd}, since the first operation would prevent starting new operations for its blocking duration. Executing \glsxtrshort{io} operations in parallel requires \emph{asynchronous} \glsxtrshort{io}, sometimes referred to as \emph{non-blocking}, since the \gls{kthrd} does not block.
      51Therefore, one of the objectives of this work is to introduce \emph{User-Level \glsxtrshort{io}}, which, like \glslink{uthrding}{User-Level \emph{Threading}}, blocks \glspl{thrd} rather than \glspl{proc} when doing \glsxtrshort{io} operations.
     52This feature entails multiplexing the \glsxtrshort{io} operations of many \glspl{thrd} onto fewer \glspl{proc}.
     53The multiplexing requires a single \gls{proc} to execute multiple \glsxtrshort{io} operations in parallel.
      54This requirement cannot be met with operations that block \glspl{proc}, \ie \glspl{kthrd}, since the first operation would prevent starting new operations for its blocking duration.
     55Executing \glsxtrshort{io} operations in parallel requires \emph{asynchronous} \glsxtrshort{io}, sometimes referred to as \emph{non-blocking}, since the \gls{kthrd} does not block.
    3456
    35 \section{Interoperating with \texttt{C}}
     57\section{Interoperating with C}
    3658While \glsxtrshort{io} operations are the classical example of operations that block \glspl{kthrd}, the non-blocking challenge extends to all blocking system-calls. The POSIX standard states~\cite[\S~2.9.1]{POSIX17}:
    3759\begin{quote}
    38 All functions defined by this volume of POSIX.1-2017 shall be thread-safe, except that the following functions1 need not be thread-safe. ... (list of 70+ potentially excluded functions)
     60All functions defined by this volume of POSIX.1-2017 shall be thread-safe, except that the following functions need not be thread-safe. ... (list of 70+ excluded functions)
    3961\end{quote}
    40 Only UNIX @man@ pages identify whether or not a library function is thread safe, and hence, may block on a pthread lock or system call; hence interoperability with UNIX library functions is a challenge for an M:N threading model.
      62Only UNIX @man@ pages identify whether or not a library function is thread safe, and hence may block on a pthreads lock or system call; as a result, interoperability with UNIX library functions is a challenge for an M:N threading model.
    4163
    4264Languages like Go and Java, which have strict interoperability with C\cit{JNI, GoLang with C}, can control operations in C by ``sandboxing'' them, \eg a blocking function may be delegated to a \gls{kthrd}. Sandboxing may help towards guaranteeing that the kind of deadlock mentioned above does not occur.
     
    4567\begin{enumerate}
    4668        \item Precisely identifying blocking C calls is difficult.
    47         \item Introducing control points code can have a significant impact on general performance.
     69        \item Introducing safe-point code (see Go~page~\pageref{GoSafePoint}) can have a significant impact on general performance.
    4870\end{enumerate}
    49 Because of these consequences, this work does not attempt to ``sandbox'' calls to C. Therefore, it is possible calls from an unidentified library will block a \gls{kthrd} leading to deadlocks in \CFA's M:N threading model, which would not occur in a traditional 1:1 threading model. Currently, all M:N thread systems interacting with UNIX without sandboxing suffer from this problem but manage to work very well in the majority of applications. Therefore, a complete solution to this problem is outside the scope of this thesis.
     71Because of these consequences, this work does not attempt to ``sandbox'' calls to C.
     72Therefore, it is possible calls to an unknown library function can block a \gls{kthrd} leading to deadlocks in \CFA's M:N threading model, which would not occur in a traditional 1:1 threading model.
     73Currently, all M:N thread systems interacting with UNIX without sandboxing suffer from this problem but manage to work very well in the majority of applications.
     74Therefore, a complete solution to this problem is outside the scope of this thesis.\footnote{\CFA does provide a pthreads emulation, so any library function using embedded pthreads locks are redirected to \CFA user-level locks. This capability further reduces the chances of blocking a \gls{kthrd}.}
  • doc/theses/thierry_delisle_PhD/thesis/thesis.tex

    r9e23b446 rffec1bf  
    8383\usepackage{graphicx} % For including graphics
    8484\usepackage{subcaption}
     85\usepackage{comment} % Removes large sections of the document.
    8586
    8687% Hyperlinks make it very easy to navigate an electronic document.
     
    107108        citecolor=OliveGreen,   % color of links to bibliography
    108109        filecolor=magenta,      % color of file links
    109         urlcolor=cyan           % color of external links
     110        urlcolor=blue,           % color of external links
     111        breaklinks=true
    110112}
    111113\ifthenelse{\boolean{PrintVersion}}{   % for improved print quality, change some hyperref options
  • libcfa/Makefile.am

    r9e23b446 rffec1bf  
    1818ACLOCAL_AMFLAGS  = -I automake
    1919SUBDIRS = prelude src      # order important
     20
     21DISTCLEANFILES = config.data
  • libcfa/configure.ac

    r9e23b446 rffec1bf  
    181181AH_TEMPLATE([CFA_HAVE_SPLICE_F_FD_IN_FIXED],[Defined if io_uring support is present when compiling libcfathread and supports the flag SPLICE_F_FD_IN_FIXED.])
    182182AH_TEMPLATE([CFA_HAVE_IORING_SETUP_ATTACH_WQ],[Defined if io_uring support is present when compiling libcfathread and supports the flag IORING_SETUP_ATTACH_WQ.])
     183AH_TEMPLATE([CFA_HAVE_IORING_REGISTER_IOWQ_MAX_WORKERS],[Defined if io_uring support is present when compiling libcfathread and supports the flag IORING_REGISTER_IOWQ_MAX_WORKERS.])
    183184AH_TEMPLATE([CFA_HAVE_PREADV2],[Defined if preadv2 support is present when compiling libcfathread.])
    184185AH_TEMPLATE([CFA_HAVE_PWRITEV2],[Defined if pwritev2 support is present when compiling libcfathread.])
     
    189190
    190191define(ioring_ops, [IORING_OP_NOP,IORING_OP_READV,IORING_OP_WRITEV,IORING_OP_FSYNC,IORING_OP_READ_FIXED,IORING_OP_WRITE_FIXED,IORING_OP_POLL_ADD,IORING_OP_POLL_REMOVE,IORING_OP_SYNC_FILE_RANGE,IORING_OP_SENDMSG,IORING_OP_RECVMSG,IORING_OP_TIMEOUT,IORING_OP_TIMEOUT_REMOVE,IORING_OP_ACCEPT,IORING_OP_ASYNC_CANCEL,IORING_OP_LINK_TIMEOUT,IORING_OP_CONNECT,IORING_OP_FALLOCATE,IORING_OP_OPENAT,IORING_OP_CLOSE,IORING_OP_FILES_UPDATE,IORING_OP_STATX,IORING_OP_READ,IORING_OP_WRITE,IORING_OP_FADVISE,IORING_OP_MADVISE,IORING_OP_SEND,IORING_OP_RECV,IORING_OP_OPENAT2,IORING_OP_EPOLL_CTL,IORING_OP_SPLICE,IORING_OP_PROVIDE_BUFFERS,IORING_OP_REMOVE_BUFFER,IORING_OP_TEE])
    191 define(ioring_flags, [IOSQE_FIXED_FILE,IOSQE_IO_DRAIN,IOSQE_IO_LINK,IOSQE_IO_HARDLINK,IOSQE_ASYNC,IOSQE_BUFFER_SELECT,SPLICE_F_FD_IN_FIXED,IORING_SETUP_ATTACH_WQ])
     192define(ioring_flags, [IOSQE_FIXED_FILE,IOSQE_IO_DRAIN,IOSQE_IO_LINK,IOSQE_IO_HARDLINK,IOSQE_ASYNC,IOSQE_BUFFER_SELECT,SPLICE_F_FD_IN_FIXED,IORING_SETUP_ATTACH_WQ,IORING_REGISTER_IOWQ_MAX_WORKERS])
    192193
    193194define(ioring_from_decls, [
  • libcfa/src/Makefile.am

    r9e23b446 rffec1bf  
    216216nobase_cfa_include_HEADERS = ${stdhdr} ${inst_headers_src} ${inst_headers_nosrc} ${inst_thread_headers_src} ${inst_thread_headers_nosrc}
    217217EXTRA_DIST = stdhdr
     218DISTCLEANFILES = $(libdeps) $(thread_libdeps)
    218219
    219220#----------------------------------------------------------------------------------------------------------------
     
    221222        -rm -rf ${CFA_INCDIR} ${CFA_LIBDIR}
    222223
    223 distclean-local:
    224         find ${builddir} -path '*.Plo' -delete
     224#distclean-local:
     225#       find ${builddir} -path '*.Plo' -delete
    225226
    226227
  • libcfa/src/concurrency/kernel/fwd.hfa

    r9e23b446 rffec1bf  
    254254                        // intented to be use by wait, wait_any, waitfor, etc. rather than used directly
    255255                        bool setup( future_t & this, oneshot & wait_ctx ) {
    256                                 /* paranoid */ verify( wait_ctx.ptr == 0p );
     256                                /* paranoid */ verify( wait_ctx.ptr == 0p || wait_ctx.ptr == 1p );
    257257                                // The future needs to set the wait context
    258258                                for() {
     
    274274                        // intented to be use by wait, wait_any, waitfor, etc. rather than used directly
    275275                        bool retract( future_t & this, oneshot & wait_ctx ) {
    276                                 for() {
    277                                         struct oneshot * expected = this.ptr;
    278 
    279                                         // expected == 0p: future was never actually setup, just return
    280                                         if( expected == 0p ) return false;
    281 
    282                                         // expected == 1p: the future is ready and the context was fully consumed
    283                                         // the server won't use the pointer again
    284                                         // It is safe to delete (which could happen after the return)
    285                                         if( expected == 1p ) return true;
    286 
    287                                         // expected == 2p: the future is ready but the context hasn't fully been consumed
    288                                         // spin until it is safe to move on
    289                                         if( expected == 2p ) {
    290                                                 while( this.ptr != 1p ) Pause();
    291                                                 /* paranoid */ verify( this.ptr == 1p );
    292                                                 return true;
    293                                         }
    294 
    295                                         // expected != wait_ctx: the future was setup with a different context ?!?!
    296                                         // something went wrong here, abort
    297                                         if( expected != &wait_ctx ) abort("Future in unexpected state");
    298 
     276                                struct oneshot * expected = this.ptr;
     277
     278                                // attempt to remove the context so it doesn't get consumed.
     279                                if(__atomic_compare_exchange_n( &this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
299280                                        // we still have the original context, so no one else saw it
    300                                         // attempt to remove the context so it doesn't get consumed.
    301                                         if(__atomic_compare_exchange_n( &this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
    302                                                 return false;
    303                                         }
    304                                 }
     281                                        return false;
     282                                }
     283
     284                                // expected == 0p: future was never actually setup, just return
     285                                if( expected == 0p ) return false;
     286
     287                                // expected == 1p: the future is ready and the context was fully consumed
     288                                // the server won't use the pointer again
     289                                // It is safe to delete (which could happen after the return)
     290                                if( expected == 1p ) return true;
     291
     292                                // expected == 2p: the future is ready but the context hasn't fully been consumed
     293                                // spin until it is safe to move on
     294                                if( expected == 2p ) {
     295                                        while( this.ptr != 1p ) Pause();
     296                                        /* paranoid */ verify( this.ptr == 1p );
     297                                        return true;
     298                                }
     299
     300                                // anything else: the future was setup with a different context ?!?!
     301                                // something went wrong here, abort
     302                                abort("Future in unexpected state");
    305303                        }
    306304
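
    Note on the retract change above: the retry loop is gone because a single
    compare-exchange against the waiter's own context is enough; on failure, the
    observed value fully identifies the future's state. A minimal sketch of the
    same sentinel-pointer protocol in portable C++ (the UNSET/FULFILLED/CONSUMING
    names stand in for the 0p/1p/2p sentinels and are not from the source):

        #include <atomic>
        #include <cstdlib>

        struct oneshot;                                    // waiter context (placeholder)

        static oneshot * const UNSET     = reinterpret_cast<oneshot *>(0); // 0p: never set up
        static oneshot * const FULFILLED = reinterpret_cast<oneshot *>(1); // 1p: ready, context consumed
        static oneshot * const CONSUMING = reinterpret_cast<oneshot *>(2); // 2p: ready, context in use

        struct future_t { std::atomic<oneshot *> ptr; };

        // Returns false if the waiter withdrew before fulfilment, true if the future completed.
        bool retract( future_t & f, oneshot & wait_ctx ) {
                oneshot * expected = &wait_ctx;
                // One CAS: if our context is still installed, remove it so it is never consumed.
                if ( f.ptr.compare_exchange_strong( expected, UNSET ) ) return false;
                if ( expected == UNSET ) return false;     // future was never set up
                if ( expected == FULFILLED ) return true;  // safe to reclaim wait_ctx
                if ( expected == CONSUMING ) {             // server still reading the context
                        while ( f.ptr.load() != FULFILLED ) { /* Pause() */ }
                        return true;
                }
                std::abort();                              // a different context: protocol violation
        }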
  • libcfa/src/concurrency/locks.cfa

    r9e23b446 rffec1bf  
237237                // This pthread_cond_var member is called from the kernel and therefore cannot block, but it can spin.
    238238                lock( cond->lock __cfaabi_dbg_ctx2 );
    239 
    240239                // this check is necessary to avoid a race condition since this timeout handler
    241240                //      may still be called after a thread has been removed from the queue but
     
    347346                size_t recursion_count = queue_and_get_recursion(this, &info);
    348347                alarm_node_wrap(L) node_wrap = { t, 0`s, callback, &this, &info };
     348                unlock( lock );
     349
     350                // registers alarm outside cond lock to avoid deadlock
    349351                register_self( &node_wrap.alarm_node );
    350                 unlock( lock );
    351352
    352353                // blocks here
     
    437438                if ( ret ) {
    438439                        info_thread(L) & popped = try_pop_front( blocked_threads );
     440                        popped.signalled = true;
    439441                        on_notify(*popped.lock, popped.t);
    440442                }
     
    448450                while( ! blocked_threads`isEmpty ) {
    449451                        info_thread(L) & popped = try_pop_front( blocked_threads );
     452                        popped.signalled = true;
    450453                        on_notify(*popped.lock, popped.t);
    451454                }
     
    469472                size_t recursion_count = queue_and_get_recursion(this, &info);
    470473                pthread_alarm_node_wrap(L) node_wrap = { t, 0`s, callback, &this, &info };
     474                unlock( lock );
     475
     476                // registers alarm outside cond lock to avoid deadlock
    471477                register_self( &node_wrap.alarm_node );
    472                 unlock( lock );
    473478
    474479                // blocks here
     
    500505                return i.signalled;
    501506
     507        Duration getDuration(timespec t) {
     508                timespec currTime;
     509                clock_gettime(CLOCK_REALTIME, &currTime);
     510                Duration waitUntil = { t };
     511                Duration currDur = { currTime };
 512                if ( waitUntil >= currDur ) return waitUntil - currDur;
     513                Duration zero = { 0 };
     514                return zero;
     515        }
     516
    502517        bool wait( pthread_cond_var(L) & this, L & l, timespec t ) {
    503                 Duration d = { t };
    504                 WAIT_TIME( 0, &l , d )
     518                PTHREAD_WAIT_TIME( 0, &l , getDuration( t ) )
    505519        }
    506520       
    507521        bool wait( pthread_cond_var(L) & this, L & l, uintptr_t info, timespec t  ) {
    508                 Duration d = { t };
    509                 WAIT_TIME( info, &l , d )
     522                PTHREAD_WAIT_TIME( info, &l , getDuration( t ) )
    510523        }
    511524}
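
    Note on the reordering above: unlock( lock ) now precedes register_self
    because the alarm subsystem may invoke the timeout handler immediately, and
    that handler takes the same condition lock. A small sketch of the ordering
    constraint, using std::mutex and stand-in names (registerAlarm is a stub,
    not the real API):

        #include <mutex>

        std::mutex condLock;

        void registerAlarm() { /* stand-in: may run the timeout handler, which locks condLock */ }

        void waitWithTimeout() {
                condLock.lock();
                // ... queue the waiting thread, build the alarm node ...
                condLock.unlock();   // release BEFORE registering: the handler also
                registerAlarm();     // takes condLock, so registering while held risks deadlock
                // ... block here until signalled or timed out ...
        }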
  • libcfa/src/concurrency/locks.hfa

    r9e23b446 rffec1bf  
    478478        #endif
    479479        lock( lock, node );
    480         while(held) Pause();
    481         held = true;
    482         // printf("locked\n");
     480        while(__atomic_load_n(&held, __ATOMIC_SEQ_CST)) Pause();
     481        __atomic_store_n(&held, true, __ATOMIC_SEQ_CST);
    483482        unlock( lock, node );
    484483        #ifdef __CFA_DEBUG__
     
    488487
    489488static inline void unlock(spin_queue_lock & this) with(this) {
    490         // printf("unlocked\n");
    491489        #ifdef __CFA_DEBUG__
    492490        owner = 0p;
    493491        #endif
    494         held = false;
     492        __atomic_store_n(&held, false, __ATOMIC_RELEASE);
    495493}
    496494
     
    535533        #endif
    536534        lock( lock, node );
    537         while(held) Pause();
    538         held = true;
     535        while(__atomic_load_n(&held, __ATOMIC_SEQ_CST)) Pause();
     536        __atomic_store_n(&held, true, __ATOMIC_SEQ_CST);
    539537        unlock( lock, node );
    540538        #ifdef __CFA_DEBUG__
     
    547545        owner = 0p;
    548546        #endif
    549         held = false;
     547        __atomic_store_n(&held, false, __ATOMIC_SEQ_CST);
    550548}
    551549
     
    588586        #endif
    589587        lock( lock );
    590         while(held) Pause();
    591         held = true;
     588        while(__atomic_load_n(&held, __ATOMIC_SEQ_CST)) Pause();
 589        __atomic_store_n(&held, true, __ATOMIC_SEQ_CST);
    592590        unlock( lock );
    593591        #ifdef __CFA_DEBUG__
     
    600598        owner = 0p;
    601599        #endif
    602         held = false;
     600        __atomic_store_n(&held, false, __ATOMIC_RELEASE);
    603601}
    604602
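
    Note on the atomic conversions above: the plain reads and writes of held
    were a data race under the C/C++ memory model, so the compiler was free to
    reorder or cache them; the diff replaces them with explicit atomics. The
    same flag protocol in portable C++, assuming (as in the source) that the
    surrounding queue lock already serialises contenders:

        #include <atomic>

        struct spin_flag { std::atomic<bool> held{false}; };

        // Caller holds the internal queue lock, so at most one thread spins here
        // at a time; the atomics provide visibility and ordering, not mutual exclusion.
        inline void acquire( spin_flag & f ) {
                while ( f.held.load( std::memory_order_seq_cst ) ) { /* Pause() */ }
                f.held.store( true, std::memory_order_seq_cst );
        }

        inline void release( spin_flag & f ) {
                f.held.store( false, std::memory_order_release );
        }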
  • libcfa/src/concurrency/ready_subqueue.hfa

    r9e23b446 rffec1bf  
    4949        // Get the relevant nodes locally
    5050        this.prev->link.next = node;
    51         this.prev->link.ts   = rdtscl();
     51        __atomic_store_n(&this.prev->link.ts, rdtscl(), __ATOMIC_RELAXED);
    5252        this.prev = node;
    5353        #if !defined(__CFA_NO_STATISTICS__)
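
    Note on the timestamp store above: the relaxed atomic store makes the
    formerly racy plain write well-defined while adding no fencing. A minimal
    sketch, assuming (as the relaxed ordering suggests) that readers sample the
    timestamp only as an advisory freshness hint:

        #include <atomic>
        #include <cstdint>

        struct link_t { std::atomic<uint64_t> ts; };

        // Publish a timestamp other processors may sample concurrently; relaxed
        // suffices because no other data is ordered against this value.
        inline void stamp( link_t & l, uint64_t now ) {
                l.ts.store( now, std::memory_order_relaxed );
        }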
  • libcfa/src/heap.cfa

    r9e23b446 rffec1bf  
    509509        checkHeader( header < (Heap.Storage.Header *)heapBegin || (Heap.Storage.Header *)heapEnd < header, name, addr ); // bad address ? (offset could be + or -)
    510510
    511         Heap * homeManager;
    512511        if ( unlikely( freeHead == 0p || // freed and only free-list node => null link
    513512                                   // freed and link points at another free block not to a bucket in the bucket array.
  • src/AST/Convert.cpp

    r9e23b446 rffec1bf  
    168168                auto attr = get<Attribute>().acceptL( node->attributes );
    169169
     170                // This field can be unset very early on (Pre-FixReturnTypes).
     171                auto newType = (type) ? type->clone() : nullptr;
     172
    170173                auto decl = new ObjectDecl(
    171174                        node->name,
     
    173176                        LinkageSpec::Spec( node->linkage.val ),
    174177                        bfwd,
    175                         type->clone(),
     178                        newType,
    176179                        nullptr, // prevent infinite loop
    177180                        attr,
     
    15791582
    15801583        virtual void visit( const ObjectDecl * old ) override final {
     1584                if ( inCache( old ) ) {
     1585                        return;
     1586                }
    15811587                auto&& type = GET_ACCEPT_1(type, Type);
    15821588                auto&& init = GET_ACCEPT_1(init, Init);
    15831589                auto&& bfwd = GET_ACCEPT_1(bitfieldWidth, Expr);
    15841590                auto&& attr = GET_ACCEPT_V(attributes, Attribute);
    1585                 if ( inCache( old ) ) {
    1586                         return;
    1587                 }
     1591
    15881592                auto decl = new ast::ObjectDecl(
    15891593                        old->location,
  • src/AST/Decl.hpp

    r9e23b446 rffec1bf  
    315315
    316316        EnumDecl( const CodeLocation& loc, const std::string& name,
    317                 std::vector<ptr<Attribute>>&& attrs = {}, Linkage::Spec linkage = Linkage::Cforall, Type * base = nullptr,
    318                  std::unordered_map< std::string, long long > enumValues = std::unordered_map< std::string, long long >() )
     317                std::vector<ptr<Attribute>>&& attrs = {}, Linkage::Spec linkage = Linkage::Cforall, Type const * base = nullptr,
     318                std::unordered_map< std::string, long long > enumValues = std::unordered_map< std::string, long long >() )
    319319        : AggregateDecl( loc, name, std::move(attrs), linkage ), base(base), enumValues(enumValues) {}
    320320
  • src/AST/Expr.cpp

    r9e23b446 rffec1bf  
    272272        // Adjust the length of the string for the terminator.
    273273        const Expr * strSize = from_ulong( loc, str.size() + 1 );
    274         const Type * strType = new ArrayType( charType, strSize, FixedLen, StaticDim );
     274        const Type * strType = new ArrayType( charType, strSize, FixedLen, DynamicDim );
    275275        const std::string strValue = "\"" + str + "\"";
    276276        return new ConstantExpr( loc, strType, strValue, std::nullopt );
  • src/AST/Pass.impl.hpp

    r9e23b446 rffec1bf  
    681681        if ( __visit_children() ) {
    682682                // unlike structs, traits, and unions, enums inject their members into the global scope
     683                maybe_accept( node, &EnumDecl::base );
    683684                maybe_accept( node, &EnumDecl::params     );
    684685                maybe_accept( node, &EnumDecl::members    );
  • src/AST/module.mk

    r9e23b446 rffec1bf  
    3737        AST/Init.cpp \
    3838        AST/Init.hpp \
     39        AST/Inspect.cpp \
     40        AST/Inspect.hpp \
    3941        AST/Label.hpp \
    4042        AST/LinkageSpec.cpp \
  • src/CodeGen/CodeGenerator.cc

    r9e23b446 rffec1bf  
    99// Author           : Richard C. Bilson
    1010// Created On       : Mon May 18 07:44:20 2015
    11 // Last Modified By : Peter A. Buhr
    12 // Last Modified On : Wed Feb  2 20:30:30 2022
    13 // Update Count     : 541
     11// Last Modified By : Andrew Beach
     12// Last Modified On : Wed Jun 29 14:34:00 2022
     13// Update Count     : 542
    1414//
    1515#include "CodeGenerator.h"
     
    1818#include <list>                      // for _List_iterator, list, list<>::it...
    1919
     20#include "AST/Decl.hpp"              // for DeclWithType
    2021#include "Common/UniqueName.h"       // for UniqueName
    2122#include "Common/utility.h"          // for CodeLocation, toString
     
    294295                                } else {
    295296                                        if ( obj->get_init() ) {
    296                                                 obj->get_init()->accept( *visitor ); 
     297                                                obj->get_init()->accept( *visitor );
    297298                                        } else {
    298299                                                // Should not reach here!
     
    683684                extension( variableExpr );
    684685                const OperatorInfo * opInfo;
    685                 if ( variableExpr->get_var()->get_linkage() == LinkageSpec::Intrinsic && (opInfo = operatorLookup( variableExpr->get_var()->get_name() )) && opInfo->type == OT_CONSTANT ) {
     686                if( dynamic_cast<ZeroType*>( variableExpr->get_var()->get_type() ) ) {
     687                        output << "0";
     688                } else if ( variableExpr->get_var()->get_linkage() == LinkageSpec::Intrinsic && (opInfo = operatorLookup( variableExpr->get_var()->get_name() )) && opInfo->type == OT_CONSTANT ) {
    686689                        output << opInfo->symbol;
    687690                } else {
    688                         // if (dynamic_cast<EnumInstType *>(variableExpr->get_var()->get_type()) 
     691                        // if (dynamic_cast<EnumInstType *>(variableExpr->get_var()->get_type())
    689692                        // && dynamic_cast<EnumInstType *>(variableExpr->get_var()->get_type())->baseEnum->base) {
    690693                        //      output << '(' <<genType(dynamic_cast<EnumInstType *>(variableExpr->get_var()->get_type())->baseEnum->base, "", options) << ')';
     
    12361239                } // if
    12371240        }
     1241
     1242std::string genName( ast::DeclWithType const * decl ) {
     1243        if ( const OperatorInfo * opInfo = operatorLookup( decl->name ) ) {
     1244                return opInfo->outputName;
     1245        } else {
     1246                return decl->name;
     1247        }
     1248}
     1249
    12381250} // namespace CodeGen
    12391251
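
    Note on the new genName overload above: it mirrors the old declaration-based
    overload, mapping operator names through the operator table to C-compatible
    identifiers and passing every other name through unchanged. The shape of
    that lookup, with an invented table (the real outputName values live in
    OperatorTable):

        #include <string>
        #include <unordered_map>

        // Hypothetical contents: Cforall operator name -> C-compatible name.
        static const std::unordered_map<std::string, std::string> opTable = {
                { "?+?", "_operator_add" },        // illustrative only
                { "?{}", "_operator_ctor" },
        };

        std::string genName( const std::string & declName ) {
                auto it = opTable.find( declName );
                return it != opTable.end() ? it->second : declName;
        }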
  • src/CodeGen/CodeGenerator.h

    r9e23b446 rffec1bf  
    99// Author           : Richard C. Bilson
    1010// Created On       : Mon May 18 07:44:20 2015
    11 // Last Modified By : Peter A. Buhr
    12 // Last Modified On : Tue Feb  1 09:23:21 2022
    13 // Update Count     : 64
     11// Last Modified By : Andrew Beach
     12// Last Modified On : Wed Jun 29 14:32:00 2022
     13// Update Count     : 65
    1414//
    1515
     
    2626#include "SynTree/Visitor.h"      // for Visitor
    2727#include "SynTree/SynTree.h"      // for Visitor Nodes
     28
     29namespace ast {
     30        class DeclWithType;
     31}
    2832
    2933namespace CodeGen {
     
    182186        /// returns C-compatible name of declaration
    183187        std::string genName( DeclarationWithType * decl );
     188        std::string genName( ast::DeclWithType const * decl );
    184189
    185190        inline std::ostream & operator<<( std::ostream & os, const CodeGenerator::LineEnder & endl ) {
  • src/CodeGen/FixNames.cc

    r9e23b446 rffec1bf  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // FixNames.cc --
     7// FixNames.cc -- Adjustments to typed declarations.
    88//
    99// Author           : Richard C. Bilson
    1010// Created On       : Mon May 18 07:44:20 2015
    1111// Last Modified By : Andrew Beach
    12 // Last Modified On : Fri Oct 29 15:49:00 2021
    13 // Update Count     : 23
     12// Last Modified On : Wed Jul 20 11:49:00 2022
     13// Update Count     : 24
    1414//
    1515
     
    8787
    8888/// Does work with the main function and scopeLevels.
    89 class FixNames_new : public ast::WithGuards {
     89class FixNames_new final {
    9090        int scopeLevel = 1;
    9191
     
    103103
    104104        const ast::FunctionDecl *postvisit( const ast::FunctionDecl *functionDecl ) {
    105                 // This store is used to ensure a maximum of one call to mutate.
    106                 ast::FunctionDecl * mutDecl = nullptr;
     105                if ( FixMain::isMain( functionDecl ) ) {
     106                        auto mutDecl = ast::mutate( functionDecl );
    107107
    108                 if ( shouldSetScopeLevel( functionDecl ) ) {
    109                         mutDecl = ast::mutate( functionDecl );
    110                         mutDecl->scopeLevel = scopeLevel;
    111                 }
    112 
    113                 if ( FixMain::isMain( functionDecl ) ) {
    114                         if ( !mutDecl ) { mutDecl = ast::mutate( functionDecl ); }
     108                        if ( shouldSetScopeLevel( mutDecl ) ) {
     109                                mutDecl->scopeLevel = scopeLevel;
     110                        }
    115111
    116112                        int nargs = mutDecl->params.size();
     
    124120                                )
    125121                        );
     122
     123                        return mutDecl;
     124                } else if ( shouldSetScopeLevel( functionDecl ) ) {
     125                        return ast::mutate_field( functionDecl, &ast::FunctionDecl::scopeLevel, scopeLevel );
     126                } else {
     127                        return functionDecl;
    126128                }
    127                 return mutDecl ? mutDecl : functionDecl;
    128129        }
    129130
    130131        void previsit( const ast::CompoundStmt * ) {
    131                 GuardValue( scopeLevel ) += 1;
     132                scopeLevel += 1;
     133        }
     134
     135        void postvisit( const ast::CompoundStmt * ) {
     136                scopeLevel -= 1;
    132137        }
    133138};
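
    Note on the postvisit restructuring above: the two independent "maybe
    mutate" paths are folded into three disjoint returns, so a mutable copy is
    made exactly once and untouched declarations pass through unchanged. The
    pattern, reduced to a self-contained sketch (Node and mutate are stand-ins
    for the ast types):

        struct Node { int scopeLevel = 0; bool isMain = false; };

        static Node * mutate( const Node * n ) { return new Node( *n ); } // copy-on-write stand-in

        const Node * postvisit( const Node * node, int scopeLevel ) {
                if ( node->isMain ) {
                        Node * mut = mutate( node );       // one copy carries every update
                        mut->scopeLevel = scopeLevel;
                        // ... fill in main's default return, etc. ...
                        return mut;
                } else if ( node->scopeLevel != scopeLevel ) {
                        Node * mut = mutate( node );       // scope level is the only change
                        mut->scopeLevel = scopeLevel;
                        return mut;
                }
                return node;                               // nothing to change
        }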
  • src/CodeGen/FixNames.h

    r9e23b446 rffec1bf  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // FixNames.h --
     7// FixNames.h -- Adjustments to typed declarations.
    88//
    99// Author           : Richard C. Bilson
     
    2626        /// mangles object and function names
    2727        void fixNames( std::list< Declaration* > & translationUnit );
    28         void fixNames( ast::TranslationUnit & translationUnit );
     28/// Sets scope levels and fills in main's default return.
     29void fixNames( ast::TranslationUnit & translationUnit );
    2930} // namespace CodeGen
    3031
  • src/CodeGen/GenType.cc

    r9e23b446 rffec1bf  
    254254
    255255        void GenType::postvisit( EnumInstType * enumInst ) {
    256                 if ( enumInst->baseEnum->base ) {
     256                if ( enumInst->baseEnum && enumInst->baseEnum->base ) {
    257257                        typeString = genType(enumInst->baseEnum->base, "", options) + typeString;
    258258                } else {
  • src/Common/Eval.cc

    r9e23b446 rffec1bf  
    1010// Created On       : Mon May 18 07:44:20 2015
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Wed Jul 24 15:09:06 2019
    13 // Update Count     : 64
     12// Last Modified On : Fri Jul  1 08:41:03 2022
     13// Update Count     : 117
    1414//
    1515
     
    1717
    1818#include "Common/PassVisitor.h"
     19#include "CodeGen/OperatorTable.h"                                              // access: OperatorInfo
    1920#include "AST/Pass.hpp"
    2021#include "InitTweak/InitTweak.h"
     
    2425// Old AST
    2526struct EvalOld : public WithShortCircuiting {
    26         long long int value = 0;
    27         bool valid = true;
     27        long long int value = 0;                                                        // compose the result of the constant expression
     28        bool valid = true;                                                                      // true => constant expression and value is the result
     29                                                                                                                // false => not constant expression, e.g., ++i
     30        bool cfavalid = true;                                                           // true => constant expression and value computable
     31                                                                                                                // false => constant expression but value not computable, e.g., sizeof(int)
    2832
    2933        void previsit( const BaseSyntaxNode * ) { visit_children = false; }
     
    8993// New AST
    9094struct EvalNew : public ast::WithShortCircuiting {
    91         long long int value = 0;
    92         bool valid = true;
     95        long long int value = 0;                                                        // compose the result of the constant expression
     96        bool valid = true;                                                                      // true => constant expression and value is the result
     97                                                                                                                // false => not constant expression, e.g., ++i
     98        bool cfavalid = true;                                                           // true => constant expression and value computable
     99                                                                                                                // false => constant expression but value not computable, e.g., sizeof(int)
    93100
    94101        void previsit( const ast::Node * ) { visit_children = false; }
    95         void postvisit( const ast::Node * ) { valid = false; }
    96 
    97         void postvisit( const ast::ConstantExpr * expr ) {
     102        void postvisit( const ast::Node * ) { cfavalid = valid = false; }
     103
     104        void postvisit( const ast::UntypedExpr * ) {
     105                assertf( false, "UntypedExpr in constant expression evaluation" ); // FIX ME, resolve variable
     106        }
     107
     108        void postvisit( const ast::ConstantExpr * expr ) {      // only handle int constants
    98109                value = expr->intValue();
    99110        }
    100111
    101         void postvisit( const ast::SizeofExpr * expr ) {
    102                 if ( expr->expr ) value = eval(expr->expr).first;
    103                 else if ( expr->type ) value = eval(expr->expr).first;
    104                 else SemanticError( expr->location, ::toString( "Internal error: SizeofExpr has no expression or type value" ) );
    105         }
    106 
    107         void postvisit( const ast::CastExpr * expr ) {
     112        void postvisit( const ast::SizeofExpr * ) {
     113                // do not change valid or value => let C figure it out
     114                cfavalid = false;
     115        }
     116
     117        void postvisit( const ast::AlignofExpr * ) {
     118                // do not change valid or value => let C figure it out
     119                cfavalid = false;
     120        }
     121
     122        void postvisit( const ast::OffsetofExpr * ) {
     123                // do not change valid or value => let C figure it out
     124                cfavalid = false;
     125        }
     126
     127        void postvisit( const ast::LogicalExpr * expr ) {
     128                std::pair<long long int, bool> arg1, arg2;
     129                arg1 = eval( expr->arg1 );
     130                valid &= arg1.second;
     131                if ( ! valid ) return;
     132                arg2 = eval( expr->arg2 );
     133                valid &= arg2.second;
     134                if ( ! valid ) return;
     135
     136                if ( expr->isAnd ) {
     137                        value = arg1.first && arg2.first;
     138                } else {
     139                        value = arg1.first || arg2.first;
     140                } // if
     141        }
     142
     143        void postvisit( const ast::ConditionalExpr * expr ) {
     144                std::pair<long long int, bool> arg1, arg2, arg3;
     145                arg1 = eval( expr->arg1 );
     146                valid &= arg1.second;
     147                if ( ! valid ) return;
     148                arg2 = eval( expr->arg2 );
     149                valid &= arg2.second;
     150                if ( ! valid ) return;
     151                arg3 = eval( expr->arg3 );
     152                valid &= arg3.second;
     153                if ( ! valid ) return;
     154
     155                value = arg1.first ? arg2.first : arg3.first;
     156        }
     157
     158        void postvisit( const ast::CastExpr * expr ) {         
 159                // cfa-cc generates a cast before every constant and in many other places, e.g., (int)3, so the cast argument must
     160                // be evaluated to get the constant value.
    108161                auto arg = eval(expr->arg);
    109162                valid = arg.second;
    110163                value = arg.first;
    111                 // TODO: perform type conversion on value if valid
    112         }
    113 
    114         void postvisit( const ast::VariableExpr * expr ) { // No hit
     164                cfavalid = false;
     165        }
     166
     167        void postvisit( const ast::VariableExpr * expr ) {
    115168                if ( const ast::EnumInstType * inst = dynamic_cast<const ast::EnumInstType *>(expr->result.get()) ) {
    116169                        if ( const ast::EnumDecl * decl = inst->base ) {
     
    128181                const std::string & fname = function->name;
    129182                assertf( expr->args.size() == 1 || expr->args.size() == 2, "Intrinsic function with %zd arguments: %s", expr->args.size(), fname.c_str() );
    130                 std::pair<long long int, bool> arg1, arg2;
    131                 arg1 = eval(expr->args.front());
    132                 valid = valid && arg1.second;
    133                 if ( ! valid ) return;
    134                 if ( expr->args.size() == 2 ) {
     183
     184                if ( expr->args.size() == 1 ) {
     185                        // pre/postfix operators ++ and -- => assignment, which is not constant
     186                        std::pair<long long int, bool> arg1;
     187                        arg1 = eval(expr->args.front());
     188                        valid &= arg1.second;
     189                        if ( ! valid ) return;
     190
     191                        if (fname == "+?") {
     192                                value = arg1.first;
     193                        } else if (fname == "-?") {
     194                                value = -arg1.first;
     195                        } else if (fname == "~?") {
     196                                value = ~arg1.first;
     197                        } else if (fname == "!?") {
     198                                value = ! arg1.first;
     199                        } else {
     200                                valid = false;
     201                        } // if
     202                } else { // => expr->args.size() == 2
     203                        // infix assignment operators => assignment, which is not constant
     204                        std::pair<long long int, bool> arg1, arg2;
     205                        arg1 = eval(expr->args.front());
     206                        valid &= arg1.second;
     207                        if ( ! valid ) return;
    135208                        arg2 = eval(expr->args.back());
    136                         valid = valid && arg2.second;
    137                         if ( ! valid ) return;
    138                 }
    139                 if (fname == "?+?") {
    140                         value = arg1.first + arg2.first;
    141                 } else if (fname == "?-?") {
    142                         value = arg1.first - arg2.first;
    143                 } else if (fname == "?*?") {
    144                         value = arg1.first * arg2.first;
    145                 } else if (fname == "?/?") {
    146                         value = arg1.first / arg2.first;
    147                 } else if (fname == "?%?") {
    148                         value = arg1.first % arg2.first;
    149                 } else {
    150                         valid = false;
    151                 }
     209                        valid &= arg2.second;
     210                        if ( ! valid ) return;
     211
     212                        if (fname == "?+?") {
     213                                value = arg1.first + arg2.first;
     214                        } else if (fname == "?-?") {
     215                                value = arg1.first - arg2.first;
     216                        } else if (fname == "?*?") {
     217                                value = arg1.first * arg2.first;
     218                        } else if (fname == "?/?") {
     219                                value = arg1.first / arg2.first;
     220                        } else if (fname == "?%?") {
     221                                value = arg1.first % arg2.first;
     222                        } else if (fname == "?<<?") {
     223                                value = arg1.first << arg2.first;
     224                        } else if (fname == "?>>?") {
     225                                value = arg1.first >> arg2.first;
     226                        } else if (fname == "?<?") {
     227                                value = arg1.first < arg2.first;
     228                        } else if (fname == "?>?") {
     229                                value = arg1.first > arg2.first;
     230                        } else if (fname == "?<=?") {
     231                                value = arg1.first <= arg2.first;
     232                        } else if (fname == "?>=?") {
     233                                value = arg1.first >= arg2.first;
     234                        } else if (fname == "?==?") {
     235                                value = arg1.first == arg2.first;
     236                        } else if (fname == "?!=?") {
     237                                value = arg1.first != arg2.first;
     238                        } else if (fname == "?&?") {
     239                                value = arg1.first & arg2.first;
     240                        } else if (fname == "?^?") {
     241                                value = arg1.first ^ arg2.first;
     242                        } else if (fname == "?|?") {
     243                                value = arg1.first | arg2.first;
     244                        } else {
     245                                valid = false;
     246                        }
     247                } // if
    152248                // TODO: implement other intrinsic functions
    153249        }
    154250};
    155251
    156 std::pair<long long int, bool> eval( const Expression * expr) {
     252std::pair<long long int, bool> eval( const Expression * expr ) {
    157253        PassVisitor<EvalOld> ev;
    158         if (expr) {
    159                 expr->accept(ev);
    160                 return std::make_pair(ev.pass.value, ev.pass.valid);
     254        if ( expr ) {
     255                expr->accept( ev );
     256                return std::make_pair( ev.pass.value, ev.pass.valid );
    161257        } else {
    162                 return std::make_pair(0, false);
     258                return std::make_pair( 0, false );
    163259        }
    164260}
    165261
    166 std::pair<long long int, bool> eval(const ast::Expr * expr) {
     262std::pair<long long int, bool> eval( const ast::Expr * expr ) {
    167263        ast::Pass<EvalNew> ev;
    168         if (expr) {
    169                 expr->accept(ev);
    170                 return std::make_pair(ev.core.value, ev.core.valid);
     264        if ( expr ) {
     265                expr->accept( ev );
     266                return std::make_pair( ev.core.value, ev.core.valid );
    171267        } else {
    172                 return std::make_pair(0, false);
     268                return std::make_pair( 0, false );
    173269        }
    174270}
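
    Note on the expanded evaluator above: EvalNew now folds the full set of C
    integer operators rather than just the five arithmetic ones. A standalone
    sketch of the binary fold, returning {value, valid} pairs like eval(); the
    divide-by-zero rejection is an added guard, not in the source:

        #include <string>
        #include <utility>

        std::pair<long long, bool> foldBinary( const std::string & fname,
                                               long long a, long long b ) {
                if ( fname == "?+?"  ) return { a + b,  true };
                if ( fname == "?-?"  ) return { a - b,  true };
                if ( fname == "?*?"  ) return { a * b,  true };
                if ( fname == "?/?"  ) return { b ? a / b : 0, b != 0 };
                if ( fname == "?%?"  ) return { b ? a % b : 0, b != 0 };
                if ( fname == "?<<?" ) return { a << b, true };
                if ( fname == "?>>?" ) return { a >> b, true };
                if ( fname == "?<?"  ) return { a <  b, true };
                if ( fname == "?>?"  ) return { a >  b, true };
                if ( fname == "?<=?" ) return { a <= b, true };
                if ( fname == "?>=?" ) return { a >= b, true };
                if ( fname == "?==?" ) return { a == b, true };
                if ( fname == "?!=?" ) return { a != b, true };
                if ( fname == "?&?"  ) return { a & b,  true };
                if ( fname == "?^?"  ) return { a ^ b,  true };
                if ( fname == "?|?"  ) return { a | b,  true };
                return { 0, false };                       // not constant-foldable
        }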
  • src/Common/ResolvProtoDump.cpp

    r9e23b446 rffec1bf  
    227227        }
    228228
    229         void previsit( const ast::EnumInstType * enumInst) {
     229        void previsit( const ast::EnumInstType * ) {
    230230                // TODO: Add the meaningful text representation of typed enum
    231231                ss << (int)ast::BasicType::SignedInt;
  • src/Concurrency/Keywords.h

    r9e23b446 rffec1bf  
    2828        void implementThreadStarter( std::list< Declaration * > & translationUnit );
    2929
    30 /// Implement the sue-like keywords and the suspend keyword.
     30/// Implement the sue-like keywords and the suspend keyword. Pre-Autogen
    3131void implementKeywords( ast::TranslationUnit & translationUnit );
    32 /// Implement the mutex parameters and mutex statement.
     32/// Implement the mutex parameters and mutex statement. Post-Autogen
    3333void implementMutex( ast::TranslationUnit & translationUnit );
    34 /// Add the thread starter code to constructors.
     34/// Add the thread starter code to constructors. Post-Autogen
    3535void implementThreadStarter( ast::TranslationUnit & translationUnit );
    3636};
  • src/ControlStruct/ExceptDecl.cc

    r9e23b446 rffec1bf  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // ExceptDecl.cc --
     7// ExceptDecl.cc -- Handles declarations of exception types.
    88//
    99// Author           : Henry Xue
  • src/ControlStruct/ExceptDecl.h

    r9e23b446 rffec1bf  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // ExceptDecl.h --
     7// ExceptDecl.h -- Handles declarations of exception types.
    88//
    99// Author           : Henry Xue
    1010// Created On       : Tue Jul 20 04:10:50 2021
    11 // Last Modified By : Henry Xue
    12 // Last Modified On : Tue Jul 20 04:10:50 2021
    13 // Update Count     : 1
     11// Last Modified By : Andrew Beach
     12// Last Modified On : Tue Jul 12 15:49:00 2022
     13// Update Count     : 2
    1414//
    1515
     
    2020class Declaration;
    2121
     22namespace ast {
     23        class TranslationUnit;
     24}
     25
    2226namespace ControlStruct {
    23         void translateExcept( std::list< Declaration *> & translationUnit );
     27/// Unfold exception declarations into raw structure declarations.
     28/// Also builds vtable declarations and converts vtable types.
     29void translateExcept( std::list< Declaration *> & translationUnit );
     30void translateExcept( ast::TranslationUnit & translationUnit );
    2431}
  • src/ControlStruct/HoistControlDecls.hpp

    r9e23b446 rffec1bf  
    2121
    2222namespace ControlStruct {
    23 // Hoist declarations out of control flow statements into compound statement.
 23/// Hoist declarations out of control flow statements into a compound statement.
     24/// Must happen before auto-gen routines are added.
    2425void hoistControlDecls( ast::TranslationUnit & translationUnit );
    2526} // namespace ControlStruct
  • src/ControlStruct/MultiLevelExit.cpp

    r9e23b446 rffec1bf  
    149149};
    150150
    151 NullStmt * labelledNullStmt(
    152         const CodeLocation & cl, const Label & label ) {
     151NullStmt * labelledNullStmt( const CodeLocation & cl, const Label & label ) {
    153152        return new NullStmt( cl, vector<Label>{ label } );
    154153}
     
    164163
    165164const CompoundStmt * MultiLevelExitCore::previsit(
    166         const CompoundStmt * stmt ) {
     165                const CompoundStmt * stmt ) {
    167166        visit_children = false;
    168167
     
    189188}
    190189
    191 size_t getUnusedIndex(
    192         const Stmt * stmt, const Label & originalTarget ) {
     190size_t getUnusedIndex( const Stmt * stmt, const Label & originalTarget ) {
    193191        const size_t size = stmt->labels.size();
    194192
     
    210208}
    211209
    212 const Stmt * addUnused(
    213         const Stmt * stmt, const Label & originalTarget ) {
     210const Stmt * addUnused( const Stmt * stmt, const Label & originalTarget ) {
    214211        size_t i = getUnusedIndex( stmt, originalTarget );
    215212        if ( i == stmt->labels.size() ) {
     
    356353
    357354// Mimic what the built-in push_front would do anyways. It is O(n).
    358 void push_front(
    359         vector<ptr<Stmt>> & vec, const Stmt * element ) {
     355void push_front( vector<ptr<Stmt>> & vec, const Stmt * element ) {
    360356        vec.emplace_back( nullptr );
    361357        for ( size_t i = vec.size() - 1 ; 0 < i ; --i ) {
     
    590586
    591587                ptr<Stmt> else_stmt = nullptr;
    592                 Stmt * loop_kid = nullptr;
     588                const Stmt * loop_kid = nullptr;
    593589                // check if loop node and if so add else clause if it exists
    594                 const WhileDoStmt * whilePtr = dynamic_cast<const WhileDoStmt *>(kid.get());
    595                 if ( whilePtr && whilePtr->else_) {
     590                const WhileDoStmt * whilePtr = kid.as<WhileDoStmt>();
     591                if ( whilePtr && whilePtr->else_ ) {
    596592                        else_stmt = whilePtr->else_;
    597                         WhileDoStmt * mutate_ptr = mutate(whilePtr);
    598                         mutate_ptr->else_ = nullptr;
    599                         loop_kid = mutate_ptr;
    600                 }
    601                 const ForStmt * forPtr = dynamic_cast<const ForStmt *>(kid.get());
    602                 if ( forPtr && forPtr->else_) {
     593                        loop_kid = mutate_field( whilePtr, &WhileDoStmt::else_, nullptr );
     594                }
     595                const ForStmt * forPtr = kid.as<ForStmt>();
     596                if ( forPtr && forPtr->else_ ) {
    603597                        else_stmt = forPtr->else_;
    604                         ForStmt * mutate_ptr = mutate(forPtr);
    605                         mutate_ptr->else_ = nullptr;
    606                         loop_kid = mutate_ptr;
     598                        loop_kid = mutate_field( forPtr, &ForStmt::else_, nullptr );
    607599                }
    608600
  • src/ControlStruct/module.mk

    r9e23b446 rffec1bf  
    1717SRC += \
    1818        ControlStruct/ExceptDecl.cc \
     19        ControlStruct/ExceptDeclNew.cpp \
    1920        ControlStruct/ExceptDecl.h \
    2021        ControlStruct/ExceptTranslateNew.cpp \
  • src/GenPoly/Box.cc

    r9e23b446 rffec1bf  
    189189                        /// Enters a new scope for type-variables, adding the type variables from ty
    190190                        void beginTypeScope( Type *ty );
    191                         /// Exits the type-variable scope
    192                         void endTypeScope();
193191                        /// Enters a new scope for knownLayouts and knownOffsets and queues exit calls
    194192                        void beginGenericScope();
     
    198196                        UniqueName bufNamer;                           ///< Namer for VLA buffers
    199197                        Expression * addrMember = nullptr;             ///< AddressExpr argument is MemberExpr?
     198                        bool expect_func_type = false;                 ///< used to avoid recursing too deep in type decls
    200199                };
    201200
     
    12771276                        FunctionType * ftype = functionDecl->type;
    12781277                        if ( ! ftype->returnVals.empty() && functionDecl->statements ) {
    1279                                 if ( ! isPrefix( functionDecl->name, "_thunk" ) && ! isPrefix( functionDecl->name, "_adapter" ) ) { // xxx - remove check for prefix once thunks properly use ctor/dtors
     1278                                // intrinsic functions won't be using the _retval so no need to generate it.
     1279                                if ( functionDecl->linkage != LinkageSpec::Intrinsic && !isPrefix( functionDecl->name, "_thunk" ) && ! isPrefix( functionDecl->name, "_adapter" ) ) { // xxx - remove check for prefix once thunks properly use ctor/dtors
    12801280                                        assert( ftype->returnVals.size() == 1 );
    12811281                                        DeclarationWithType * retval = ftype->returnVals.front();
     
    14181418                void PolyGenericCalculator::beginGenericScope() {
    14191419                        GuardScope( *this );
 1420                        // We expect the first function type seen to be the type of this declaration,
 1421                        // but any further function type is probably some unrelated function pointer,
 1422                        // so keep track of which is the first
     1423                        GuardValue( expect_func_type );
     1424                        expect_func_type = true;
    14201425                }
    14211426
     
    14671472                void PolyGenericCalculator::premutate( FunctionType *funcType ) {
    14681473                        beginTypeScope( funcType );
     1474
     1475                        GuardValue( expect_func_type );
     1476
     1477                        if(!expect_func_type) {
     1478                                GuardAction( [this]() {
     1479                                        knownLayouts.endScope();
     1480                                        knownOffsets.endScope();
     1481                                });
 1482                                // This is not the first function type, so it belongs to an unrelated
 1483                                // function pointer; scope its layout/offset info so it is discarded
     1484                                knownLayouts.beginScope();
     1485                                knownOffsets.beginScope();
     1486                        }
     1487
 1488                        // The other function types we see in this scope are probably function parameters;
 1489                        // they don't help us with the layout and offsets, so don't mark them as known in this scope
     1490                        expect_func_type = false;
    14691491
    14701492                        // make sure that any type information passed into the function is accounted for
     
    17451767                                }
    17461768
     1769                                // std::cout << "TRUE 2" << std::endl;
     1770
    17471771                                return true;
    17481772                        } else if ( UnionInstType *unionTy = dynamic_cast< UnionInstType* >( ty ) ) {
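
    Note on expect_func_type above: only the first FunctionType seen after
    beginGenericScope describes the declaration itself; any nested function
    type is a parameter (a function pointer) whose layout and offset facts must
    not be registered in the enclosing scope. For example, in

        void f( int (*g)(int), int x );   // only f's own type opens the scope;
                                          // g's type is just a parameter type

    the flag is consumed by f's type and is false by the time g's type is
    visited, so g's registrations land in a throwaway scope.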
  • src/GenPoly/GenPoly.cc

    r9e23b446 rffec1bf  
    6464                }
    6565
    66                 __attribute__((ununsed))
     66                __attribute__((unused))
    6767                bool hasPolyParams( const std::vector<ast::ptr<ast::Expr>> & params, const TyVarMap & tyVars, const ast::TypeSubstitution * env) {
    6868                        for (auto &param : params) {
  • src/InitTweak/GenInit.cc

    r9e23b446 rffec1bf  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // GenInit.cc --
     7// GenInit.cc -- Generate initializers, and other stuff.
    88//
    99// Author           : Rob Schluntz
     
    642642
    643643ast::ConstructorInit * genCtorInit( const CodeLocation & loc, const ast::ObjectDecl * objDecl ) {
    644         // call into genImplicitCall from Autogen.h to generate calls to ctor/dtor for each 
     644        // call into genImplicitCall from Autogen.h to generate calls to ctor/dtor for each
    645645        // constructable object
    646646        InitExpander_new srcParam{ objDecl->init }, nullParam{ (const ast::Init *)nullptr };
    647647        ast::ptr< ast::Expr > dstParam = new ast::VariableExpr(loc, objDecl);
    648        
    649         ast::ptr< ast::Stmt > ctor = SymTab::genImplicitCall( 
     648
     649        ast::ptr< ast::Stmt > ctor = SymTab::genImplicitCall(
    650650                srcParam, dstParam, loc, "?{}", objDecl );
    651         ast::ptr< ast::Stmt > dtor = SymTab::genImplicitCall( 
    652                 nullParam, dstParam, loc, "^?{}", objDecl, 
     651        ast::ptr< ast::Stmt > dtor = SymTab::genImplicitCall(
     652                nullParam, dstParam, loc, "^?{}", objDecl,
    653653                SymTab::LoopBackward );
    654        
     654
    655655        // check that either both ctor and dtor are present, or neither
    656656        assert( (bool)ctor == (bool)dtor );
    657657
    658658        if ( ctor ) {
    659                 // need to remember init expression, in case no ctors exist. If ctor does exist, want to 
     659                // need to remember init expression, in case no ctors exist. If ctor does exist, want to
    660660                // use ctor expression instead of init.
    661                 ctor.strict_as< ast::ImplicitCtorDtorStmt >(); 
     661                ctor.strict_as< ast::ImplicitCtorDtorStmt >();
    662662                dtor.strict_as< ast::ImplicitCtorDtorStmt >();
    663663
  • src/InitTweak/GenInit.h

    r9e23b446 rffec1bf  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // GenInit.h --
     7// GenInit.h -- Generate initializers, and other stuff.
    88//
    99// Author           : Rodolfo G. Esteves
     
    2929        void genInit( ast::TranslationUnit & translationUnit );
    3030
    31         /// Converts return statements into copy constructor calls on the hidden return variable
     31        /// Converts return statements into copy constructor calls on the hidden return variable.
     32        /// This pass must happen before auto-gen.
    3233        void fixReturnStatements( std::list< Declaration * > & translationUnit );
    3334        void fixReturnStatements( ast::TranslationUnit & translationUnit );
  • src/Parser/lex.ll

    r9e23b446 rffec1bf  
    8282// Stop warning due to incorrectly generated flex code.
    8383#pragma GCC diagnostic ignored "-Wsign-compare"
     84
     85// lex uses __null in a boolean context, it's fine.
     86#pragma GCC diagnostic ignored "-Wnull-conversion"
    8487%}
    8588
  • src/Parser/parser.yy

    r9e23b446 rffec1bf  
    1010// Created On       : Sat Sep  1 20:22:55 2001
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Sat May 14 09:16:22 2022
    13 // Update Count     : 5401
     12// Last Modified On : Fri Jul  1 15:35:08 2022
     13// Update Count     : 5405
    1414//
    1515
     
    5656
    5757#include "SynTree/Attribute.h"     // for Attribute
     58
 59// bison generates equality comparisons wrapped in extra parentheses, it's fine.
     60#pragma GCC diagnostic ignored "-Wparentheses-equality"
    5861
    5962extern DeclarationNode * parseTree;
     
    12401243                {
    12411244                        $$ = new StatementNode( build_while( new CondCtl( nullptr, new ExpressionNode( build_constantInteger( *new string( "1" ) ) ) ), maybe_build_compound( $4 ) ) );
    1242                         SemanticWarning( yylloc, Warning::SuperfluousElse );
     1245                        SemanticWarning( yylloc, Warning::SuperfluousElse, "" );
    12431246                }
    12441247        | WHILE '(' conditional_declaration ')' statement       %prec THEN
     
    12511254                {
    12521255                        $$ = new StatementNode( build_do_while( new ExpressionNode( build_constantInteger( *new string( "1" ) ) ), maybe_build_compound( $2 ) ) );
    1253                         SemanticWarning( yylloc, Warning::SuperfluousElse );
     1256                        SemanticWarning( yylloc, Warning::SuperfluousElse, "" );
    12541257                }
    12551258        | DO statement WHILE '(' comma_expression ')' ';'
     
    12621265                {
    12631266                        $$ = new StatementNode( build_for( new ForCtrl( (ExpressionNode * )nullptr, (ExpressionNode * )nullptr, (ExpressionNode * )nullptr ), maybe_build_compound( $4 ) ) );
    1264                         SemanticWarning( yylloc, Warning::SuperfluousElse );
     1267                        SemanticWarning( yylloc, Warning::SuperfluousElse, "" );
    12651268                }
    12661269        | FOR '(' for_control_expression_list ')' statement     %prec THEN
     
    23942397        | ENUM '(' cfa_abstract_parameter_declaration ')' attribute_list_opt '{' enumerator_list comma_opt '}'
    23952398                {
    2396                         if ( $3->storageClasses.val != 0 || $3->type->qualifiers.val != 0 ) 
     2399                        if ( $3->storageClasses.val != 0 || $3->type->qualifiers.val != 0 )
    23972400                        { SemanticError( yylloc, "storage-class and CV qualifiers are not meaningful for enumeration constants, which are const." ); }
    23982401
     
    24382441        // empty
    24392442                { $$ = nullptr; }
    2440         // | '=' constant_expression
    2441         //      { $$ = $2; }
    2442         | simple_assignment_operator initializer
    2443                 { $$ = $1 == OperKinds::Assign ? $2 : $2->set_maybeConstructed( false ); }
     2443        | '=' constant_expression                                       { $$ = new InitializerNode( $2 ); }
     2444        | '=' '{' initializer_list_opt comma_opt '}' { $$ = new InitializerNode( $3, true ); }
     2445        // | simple_assignment_operator initializer
     2446        //      { $$ = $1 == OperKinds::Assign ? $2 : $2->set_maybeConstructed( false ); }
    24442447        ;
    24452448
     
    28412844                        linkage = LinkageSpec::update( yylloc, linkage, $2 );
    28422845                }
    2843           up external_definition down 
     2846          up external_definition down
    28442847                {
    28452848                        linkage = linkageStack.top();
  • src/ResolvExpr/CandidateFinder.cpp

    r9e23b446 rffec1bf  
    4141#include "Common/utility.h"       // for move, copy
    4242#include "SymTab/Mangler.h"
    43 #include "SymTab/Validate.h"      // for validateType
    4443#include "Tuples/Tuples.h"        // for handleTupleAssignment
    4544#include "InitTweak/InitTweak.h"  // for getPointerBase
     
    10911090                        assert( toType );
    10921091                        toType = resolveTypeof( toType, context );
    1093                         // toType = SymTab::validateType( castExpr->location, toType, symtab );
    10941092                        toType = adjustExprType( toType, tenv, symtab );
    10951093
     
    15901588                                // calculate target type
    15911589                                const ast::Type * toType = resolveTypeof( initAlt.type, context );
    1592                                 // toType = SymTab::validateType( initExpr->location, toType, symtab );
    15931590                                toType = adjustExprType( toType, tenv, symtab );
    15941591                                // The call to find must occur inside this loop, otherwise polymorphic return
  • src/ResolvExpr/CurrentObject.cc

    r9e23b446 rffec1bf  
    99// Author           : Rob Schluntz
    1010// Created On       : Tue Jun 13 15:28:32 2017
    11 // Last Modified By : Rob Schluntz
    12 // Last Modified On : Tue Jun 13 15:28:44 2017
    13 // Update Count     : 2
     11// Last Modified By : Peter A. Buhr
     12// Last Modified On : Fri Jul  1 09:16:01 2022
     13// Update Count     : 15
    1414//
    1515
     
    7373                virtual void setPosition( std::list< Expression * > & designators ) = 0;
    7474
    75                 /// retrieve the list of possible Type/Designaton pairs for the current position in the currect object
 75                /// retrieve the list of possible Type/Designation pairs for the current position in the current object
    7676                virtual std::list<InitAlternative> operator*() const = 0;
    7777
     
    158158
    159159        private:
    160                 void setSize( Expression * expr ) { // replace this logic with an eval call
    161                         auto res = eval(expr);
     160                void setSize( Expression * expr ) {
     161                        auto res = eval( expr );
    162162                        if (res.second) {
    163163                                size = res.first;
     
    170170                void setPosition( Expression * expr ) {
    171171                        // need to permit integer-constant-expressions, including: integer constants, enumeration constants, character constants, sizeof expressions, _Alignof expressions, cast expressions
    172                         if ( ConstantExpr * constExpr = dynamic_cast< ConstantExpr * >( expr ) ) {
    173                                 try {
    174                                         index = constExpr->intValue();
    175                                 } catch( SemanticErrorException & ) {
    176                                         SemanticError( expr, "Constant expression of non-integral type in array designator: " );
    177                                 }
    178                         } else if ( CastExpr * castExpr = dynamic_cast< CastExpr * >( expr ) ) {
    179                                 setPosition( castExpr->get_arg() );
    180                         } else if ( VariableExpr * varExpr = dynamic_cast< VariableExpr * >( expr ) ) {
    181                                 EnumInstType * inst = dynamic_cast<EnumInstType *>( varExpr->get_result() );
    182                                 assertf( inst, "ArrayIterator given variable that isn't an enum constant : %s", toString( expr ).c_str() );
    183                                 long long int value;
    184                                 if ( inst->baseEnum->valueOf( varExpr->var, value ) ) {
    185                                         index = value;
    186                                 }
    187                         } else if ( dynamic_cast< SizeofExpr * >( expr ) || dynamic_cast< AlignofExpr * >( expr ) ) {
    188                                 index = 0; // xxx - get actual sizeof/alignof value?
    189                         } else {
    190                                 assertf( false, "bad designator given to ArrayIterator: %s", toString( expr ).c_str() );
    191                         }
     172                        auto arg = eval( expr );
     173                        index = arg.first;
     174                        return;
     175
     176                        // if ( ConstantExpr * constExpr = dynamic_cast< ConstantExpr * >( expr ) ) {
     177                        //      try {
     178                        //              index = constExpr->intValue();
     179                        //      } catch( SemanticErrorException & ) {
     180                        //              SemanticError( expr, "Constant expression of non-integral type in array designator: " );
     181                        //      }
     182                        // } else if ( CastExpr * castExpr = dynamic_cast< CastExpr * >( expr ) ) {
     183                        //      setPosition( castExpr->get_arg() );
     184                        // } else if ( VariableExpr * varExpr = dynamic_cast< VariableExpr * >( expr ) ) {
     185                        //      EnumInstType * inst = dynamic_cast<EnumInstType *>( varExpr->get_result() );
     186                        //      assertf( inst, "ArrayIterator given variable that isn't an enum constant : %s", toString( expr ).c_str() );
     187                        //      long long int value;
     188                        //      if ( inst->baseEnum->valueOf( varExpr->var, value ) ) {
     189                        //              index = value;
     190                        //      }
     191                        // } else if ( dynamic_cast< SizeofExpr * >( expr ) || dynamic_cast< AlignofExpr * >( expr ) ) {
     192                        //      index = 0; // xxx - get actual sizeof/alignof value?
     193                        // } else {
     194                        //      assertf( false, "4 bad designator given to ArrayIterator: %s", toString( expr ).c_str() );
     195                        // }
    192196                }
    193197
     
    329333                                        assertf( false, "could not find member in %s: %s", kind.c_str(), toString( varExpr ).c_str() );
    330334                                } else {
    331                                         assertf( false, "bad designator given to %s: %s", kind.c_str(), toString( designators.front() ).c_str() );
     335                                        assertf( false, "3 bad designator given to %s: %s", kind.c_str(), toString( designators.front() ).c_str() );
    332336                                } // if
    333337                        } // if
     
    637641
    638642                void setSize( const Expr * expr ) {
    639                         auto res = eval(expr);
     643                        auto res = eval( expr );
    640644                        if ( ! res.second ) {
    641                                 SemanticError( location,
    642                                         toString("Array designator must be a constant expression: ", expr ) );
     645                                SemanticError( location, toString( "Array designator must be a constant expression: ", expr ) );
    643646                        }
    644647                        size = res.first;
     
    646649
    647650        public:
    648                 ArrayIterator( const CodeLocation & loc, const ArrayType * at )
    649                 : location( loc ), array( at ), base( at->base ) {
     651                ArrayIterator( const CodeLocation & loc, const ArrayType * at ) : location( loc ), array( at ), base( at->base ) {
    650652                        PRINT( std::cerr << "Creating array iterator: " << at << std::endl; )
    651653                        memberIter.reset( createMemberIterator( loc, base ) );
     
    660662                        // enumeration constants, character constants, sizeof expressions, alignof expressions,
    661663                        // cast expressions
    662                         if ( auto constExpr = dynamic_cast< const ConstantExpr * >( expr ) ) {
    663                                 try {
    664                                         index = constExpr->intValue();
    665                                 } catch ( SemanticErrorException & ) {
    666                                         SemanticError( expr,
    667                                                 "Constant expression of non-integral type in array designator: " );
    668                                 }
    669                         } else if ( auto castExpr = dynamic_cast< const CastExpr * >( expr ) ) {
    670                                 setPosition( castExpr->arg );
    671                         } else if (
    672                                 dynamic_cast< const SizeofExpr * >( expr )
    673                                 || dynamic_cast< const AlignofExpr * >( expr )
    674                         ) {
    675                                 index = 0;
    676                         } else {
    677                                 assertf( false,
    678                                         "bad designator given to ArrayIterator: %s", toString( expr ).c_str() );
    679                         }
     664
     665                        auto arg = eval( expr );
     666                        index = arg.first;
     667                        return;
     668
     669                        // if ( auto constExpr = dynamic_cast< const ConstantExpr * >( expr ) ) {
     670                        //      try {
     671                        //              index = constExpr->intValue();
     672                        //      } catch ( SemanticErrorException & ) {
     673                        //              SemanticError( expr, "Constant expression of non-integral type in array designator: " );
     674                        //      }
     675                        // } else if ( auto castExpr = dynamic_cast< const CastExpr * >( expr ) ) {
     676                        //      setPosition( castExpr->arg );
     677                        // } else if ( dynamic_cast< const SizeofExpr * >( expr ) || dynamic_cast< const AlignofExpr * >( expr ) ) {
     678                        //      index = 0;
     679                        // } else {
     680                        //      assertf( false, "2 bad designator given to ArrayIterator: %s", toString( expr ).c_str() );
     681                        // }
    680682                }
    681683
     
    723725                                std::deque< InitAlternative > ret = memberIter->first();
    724726                                for ( InitAlternative & alt : ret ) {
    725                                         alt.designation.get_and_mutate()->designators.emplace_front(
    726                                                 ConstantExpr::from_ulong( location, index ) );
     727                                        alt.designation.get_and_mutate()->designators.emplace_front( ConstantExpr::from_ulong( location, index ) );
    727728                                }
    728729                                return ret;
     
    788789                                        return;
    789790                                }
    790                                 assertf( false,
    791                                         "could not find member in %s: %s", kind.c_str(), toString( varExpr ).c_str() );
     791                                assertf( false, "could not find member in %s: %s", kind.c_str(), toString( varExpr ).c_str() );
    792792                        } else {
    793                                 assertf( false,
    794                                         "bad designator given to %s: %s", kind.c_str(), toString( *begin ).c_str() );
     793                                assertf( false, "1 bad designator given to %s: %s", kind.c_str(), toString( *begin ).c_str() );
    795794                        }
    796795                }
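
Note on the hunks above: both the old-AST and new-AST designator code drop their hand-written dynamic_cast chains in favour of eval(), which returns a (value, known) pair; setSize() still checks the flag before using the value. A minimal sketch of that pair-returning contract, using hypothetical stand-in types rather than the compiler's Expression hierarchy:

    #include <utility>

    struct Expr { virtual ~Expr() = default; };        // stand-in for the AST expression base
    struct Constant : Expr {                           // stand-in for ConstantExpr
        long long value;
        explicit Constant( long long v ) : value( v ) {}
    };

    // Returns { value, true } when expr folds to an integer constant,
    // { 0, false } otherwise; callers test .second before trusting .first.
    std::pair<long long, bool> eval( const Expr * expr ) {
        if ( auto c = dynamic_cast<const Constant *>( expr ) ) return { c->value, true };
        return { 0, false };
    }

    int main() {
        Constant five( 5 );
        auto res = eval( &five );
        return res.second ? (int)res.first : -1;       // 5
    }
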
  • src/SymTab/FixFunction.cc

    r9e23b446 rffec1bf  
    99// Author           : Richard C. Bilson
    1010// Created On       : Sun May 17 16:19:49 2015
    11 // Last Modified By : Peter A. Buhr
    12 // Last Modified On : Mon Mar  6 23:36:59 2017
    13 // Update Count     : 6
     11// Last Modified By : Andrew Beach
     12// Last Modified On : Tue Jul 12 14:28:00 2022
     13// Update Count     : 7
    1414//
    1515
     
    122122                }
    123123
     124                void previsit( const ast::FunctionType * ) { visit_children = false; }
     125
     126                const ast::Type * postvisit( const ast::FunctionType * type ) {
     127                        return new ast::PointerType( type );
     128                }
     129
    124130                void previsit( const ast::VoidType * ) { isVoid = true; }
    125131
     
    145151}
    146152
     153const ast::Type * fixFunction( const ast::Type * type, bool & isVoid ) {
     154        ast::Pass< FixFunction_new > fixer;
     155        type = type->accept( fixer );
     156        isVoid |= fixer.core.isVoid;
     157        return type;
     158}
     159
    147160} // namespace SymTab
    148161
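
The added FixFunction_new visitor rewrites a function type into a pointer to that function type (postvisit returns new ast::PointerType( type ), and previsit sets visit_children = false so the wrapped type is not descended into again). This is presumably the standard C parameter adjustment, shown here as a plain standalone example:

    #include <cstdio>

    // A parameter declared with function type 'int op(int)' has the adjusted
    // type 'int (*op)(int)'; wrapping the FunctionType node in a PointerType
    // models exactly this at the AST level.
    void apply( int op( int ) ) { std::printf( "%d\n", op( 21 ) ); }

    int twice( int x ) { return 2 * x; }

    int main() { apply( twice ); return 0; }           // prints 42
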
  • src/SymTab/FixFunction.h

    r9e23b446 rffec1bf  
    1010// Created On       : Sun May 17 17:02:08 2015
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Sat Jul 22 09:45:55 2017
    13 // Update Count     : 4
     12// Last Modified On : Tue Jul 12 14:19:00 2022
     13// Update Count     : 5
    1414//
    1515
     
    2121namespace ast {
    2222        class DeclWithType;
     23        class Type;
    2324}
    2425
     
    3132        /// Sets isVoid to true if type is void
    3233        const ast::DeclWithType * fixFunction( const ast::DeclWithType * dwt, bool & isVoid );
     34        const ast::Type * fixFunction( const ast::Type * type, bool & isVoid );
    3335} // namespace SymTab
    3436
  • src/SymTab/Mangler.cc

    r9e23b446 rffec1bf  
    537537                }
    538538
     539                __attribute__((unused))
    539540                inline std::vector< ast::ptr< ast::Type > > getTypes( const std::vector< ast::ptr< ast::DeclWithType > > & decls ) {
    540541                        std::vector< ast::ptr< ast::Type > > ret;
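
The attribute added above exists to keep warning-clean builds: an internal helper that is not referenced in every configuration can draw -Wunused-function (compiler-dependent). A minimal illustration of the same fix, with a hypothetical helper rather than the mangler's getTypes:

    // Some compilers warn under -Wall if no caller of this survives in the TU;
    // the attribute suppresses that warning without deleting the helper.
    __attribute__((unused))
    static inline int helper( int x ) { return x + 1; }

    int main() { return 0; }                           // helper never called: no warning
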
  • src/SymTab/Validate.cc

    r9e23b446 rffec1bf  
    1010// Created On       : Sun May 17 21:50:04 2015
    1111// Last Modified By : Andrew Beach
    12 // Last Modified On : Tue May 17 14:36:00 2022
    13 // Update Count     : 366
     12// Last Modified On : Tue Jul 12 15:00:00 2022
     13// Update Count     : 367
    1414//
    1515
     
    294294        };
    295295
    296         void validate_A( std::list< Declaration * > & translationUnit ) {
     296        void validate( std::list< Declaration * > &translationUnit, __attribute__((unused)) bool doDebug ) {
    297297                PassVisitor<HoistTypeDecls> hoistDecls;
    298298                {
     
    305305                        decayEnumsAndPointers( translationUnit ); // must happen before VerifyCtorDtorAssign, because void return objects should not exist; before LinkReferenceToTypes_old because it is an indexer and needs correct types for mangling
    306306                }
    307         }
    308 
    309         void validate_B( std::list< Declaration * > & translationUnit ) {
    310307                PassVisitor<FixQualifiedTypes> fixQual;
    311308                {
    312309                        Stats::Heap::newPass("validate-B");
    313310                        Stats::Time::BlockGuard guard("validate-B");
    314                         //linkReferenceToTypes( translationUnit );
     311                        linkReferenceToTypes( translationUnit ); // Must happen before auto-gen, because it uses the sized flag.
    315312                        mutateAll( translationUnit, fixQual ); // must happen after LinkReferenceToTypes_old, because aggregate members are accessed
    316313                        HoistStruct::hoistStruct( translationUnit );
    317314                        EliminateTypedef::eliminateTypedef( translationUnit );
    318315                }
    319         }
    320 
    321         void validate_C( std::list< Declaration * > & translationUnit ) {
    322316                PassVisitor<ValidateGenericParameters> genericParams;
    323317                PassVisitor<ResolveEnumInitializers> rei( nullptr );
     
    343337                        });
    344338                }
    345         }
    346 
    347         void validate_D( std::list< Declaration * > & translationUnit ) {
    348339                {
    349340                        Stats::Heap::newPass("validate-D");
     
    362353                        });
    363354                }
    364         }
    365 
    366         void validate_E( std::list< Declaration * > & translationUnit ) {
    367355                PassVisitor<CompoundLiteral> compoundliteral;
    368356                {
     
    384372                        }
    385373                }
    386         }
    387 
    388         void validate_F( std::list< Declaration * > & translationUnit ) {
    389374                PassVisitor<LabelAddressFixer> labelAddrFixer;
    390375                {
     
    410395                        }
    411396                }
    412         }
    413 
    414         void validate( std::list< Declaration * > &translationUnit, __attribute__((unused)) bool doDebug ) {
    415                 validate_A( translationUnit );
    416                 validate_B( translationUnit );
    417                 validate_C( translationUnit );
    418                 validate_D( translationUnit );
    419                 validate_E( translationUnit );
    420                 validate_F( translationUnit );
    421397        }
    422398
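
The sub-passes folded back into the single validate() each keep the per-phase bookkeeping pattern visible above: a fresh Stats::Heap pass plus a Stats::Time::BlockGuard, i.e. an RAII scope timer. A self-contained sketch of that guard idiom (a stand-in, not the Stats implementation):

    #include <chrono>
    #include <cstdio>

    struct BlockGuard {                                // stand-in for Stats::Time::BlockGuard
        const char * name;
        std::chrono::steady_clock::time_point start;
        explicit BlockGuard( const char * n )
            : name( n ), start( std::chrono::steady_clock::now() ) {}
        ~BlockGuard() {                                // stops timing when the block ends
            auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(
                std::chrono::steady_clock::now() - start ).count();
            std::printf( "%s: %lld ms\n", name, (long long)ms );
        }
    };

    int main() {
        {
            BlockGuard guard( "validate-B" );          // times everything in this scope
            // ... run the phase ...
        }
        return 0;
    }
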
  • src/SymTab/Validate.h

    r9e23b446 rffec1bf  
    1111// Created On       : Sun May 17 21:53:34 2015
    1212// Last Modified By : Andrew Beach
    13 // Last Modified On : Tue May 17 14:35:00 2022
    14 // Update Count     : 5
     13// Last Modified On : Tue Jul 12 15:30:00 2022
     14// Update Count     : 6
    1515//
    1616
     
    1919#include <list>  // for list
    2020
    21 struct CodeLocation;
    22 class  Declaration;
    23 class  Type;
    24 
    25 namespace ast {
    26         class Type;
    27         class SymbolTable;
    28 }
     21class Declaration;
    2922
    3023namespace SymTab {
    31         class Indexer;
    32 
    3324        /// Normalizes struct and function declarations
    3425        void validate( std::list< Declaration * > &translationUnit, bool doDebug = false );
    35 
    36         // Sub-passes of validate.
    37         void validate_A( std::list< Declaration * > &translationUnit );
    38         void validate_B( std::list< Declaration * > &translationUnit );
    39         void validate_C( std::list< Declaration * > &translationUnit );
    40         void validate_D( std::list< Declaration * > &translationUnit );
    41         void validate_E( std::list< Declaration * > &translationUnit );
    42         void validate_F( std::list< Declaration * > &translationUnit );
    4326} // namespace SymTab
    4427
  • src/SymTab/ValidateType.cc

    r9e23b446 rffec1bf  
    222222        // visit enum members first so that the types of self-referencing members are updated properly
    223223        // Replace the enum base; right now it works only for StructEnum
    224         if ( enumDecl->base && dynamic_cast<TypeInstType*>(enumDecl->base) ) {
    225                 std::string baseName = static_cast<TypeInstType*>(enumDecl->base)->name;
    226                 const StructDecl * st = local_indexer->lookupStruct( baseName );
    227                 if ( st ) {
    228                         enumDecl->base = new StructInstType(Type::Qualifiers(),const_cast<StructDecl *>(st)); // Just linking in the node
     224        if ( enumDecl->base ) {
     225                if ( const TypeInstType * base = dynamic_cast< TypeInstType * >(enumDecl->base) ) {
     226                        if ( const StructDecl * decl = local_indexer->lookupStruct( base->name ) ) {
     227                                enumDecl->base = new StructInstType( Type::Qualifiers(), const_cast< StructDecl * >( decl ) ); // Just linking in the node
     228                        }
     229                } else if ( const PointerType * ptr = dynamic_cast< PointerType * >(enumDecl->base) ) {
     230                        if ( const TypeInstType * ptrBase = dynamic_cast< TypeInstType * >( ptr->base ) ) {
     231                                if ( const StructDecl * decl = local_indexer->lookupStruct( ptrBase->name ) ) {
     232                                        enumDecl->base = new PointerType( Type::Qualifiers(),
     233                                                new StructInstType( Type::Qualifiers(), const_cast< StructDecl * >( decl ) ) );
     234                                }
     235                        }
    229236                }
    230237        }
     238       
    231239        if ( enumDecl->body ) {
    232240                ForwardEnumsType::iterator fwds = forwardEnums.find( enumDecl->name );
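
The extended base fix-up above now handles both a direct struct base and a pointer-to-struct base: peel at most one pointer level, resolve the named type to its struct declaration, and relink. The control flow, reduced to a hypothetical miniature type hierarchy:

    #include <string>

    struct Type    { virtual ~Type() = default; };
    struct Named   : Type { std::string name; };       // stand-in for TypeInstType
    struct Pointer : Type { Type * base = nullptr; };  // stand-in for PointerType

    // Find the name to look up: directly, or through one pointer level.
    const Named * namedBase( const Type * t ) {
        if ( auto n = dynamic_cast<const Named *>( t ) ) return n;        // struct base
        if ( auto p = dynamic_cast<const Pointer *>( t ) )                // pointer-to-struct base
            return dynamic_cast<const Named *>( p->base );
        return nullptr;                                                   // anything else: leave alone
    }

    int main() {
        Named n; n.name = "S";
        Pointer p; p.base = &n;
        return namedBase( &p ) ? 0 : 1;
    }
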
  • src/SynTree/AggregateDecl.cc

    r9e23b446 rffec1bf  
    1010// Created On       : Sun May 17 23:56:39 2015
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Mon Dec 16 15:07:20 2019
    13 // Update Count     : 31
     12// Last Modified On : Fri Jul  1 09:12:33 2022
     13// Update Count     : 32
    1414//
    1515
     
    125125                                SingleInit * init = strict_dynamic_cast< SingleInit * >( field->init );
    126126                                auto result = eval( init->value );
    127                                 if ( ! result.second ) SemanticError( init->location, toString( "Non-constexpr in initialization of enumerator: ", field ) );
     127                                if ( ! result.second ) SemanticError( init->location, toString( "Enumerator value for '", field, "' is not an integer constant" ) );
    128128                                currentValue = result.first;
    129129                        }
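
The reworded diagnostic fires when an enumerator's initializer does not fold to an integer constant. A plain C-level illustration (the message text in the comment paraphrases the hunk above):

    int runtimeValue( void );            // declared, but not a constant expression

    enum E {
        A = 1,                           // fine: integer constant expression
        B = 1 + 2,                       // fine: still folds at compile time
        // C = runtimeValue()            // rejected: "Enumerator value for 'C' is not an integer constant"
    };

    int main() { return A + B; }
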
  • src/SynTree/Type.h

    r9e23b446 rffec1bf  
    274274class PointerType : public Type {
    275275  public:
    276         Type *base;
     276        Type * base;
    277277
    278278        // In C99, pointer types can be qualified in many ways e.g., int f( int a[ static 3 ] )
     
    516516        typedef ReferenceToType Parent;
    517517  public:
    518         // this decl is not "owned" by the union inst; it is merely a pointer to elsewhere in the tree,
    519         // where the union used in this type is actually defined
     518        // this decl is not "owned" by the enum inst; it is merely a pointer to elsewhere in the tree,
     519        // where the enum used in this type is actually defined
    520520        EnumDecl *baseEnum = nullptr;
    521521
  • src/Tuples/Tuples.cc

    r9e23b446 rffec1bf  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // Tuples.h --
     7// Tuples.cc -- A collection of tuple operations.
    88//
    99// Author           : Andrew Beach
  • src/Tuples/Tuples.h

    r9e23b446 rffec1bf  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // Tuples.h --
     7// Tuples.h -- A collection of tuple operations.
    88//
    99// Author           : Rodolfo G. Esteves
  • src/Validate/Autogen.cpp

    r9e23b446 rffec1bf  
    2828#include "AST/DeclReplacer.hpp"
    2929#include "AST/Expr.hpp"
     30#include "AST/Inspect.hpp"
    3031#include "AST/Pass.hpp"
    3132#include "AST/Stmt.hpp"
     
    121122
    122123        // Built-ins do not use autogeneration.
    123         bool shouldAutogen() const final { return !decl->linkage.is_builtin; }
     124        bool shouldAutogen() const final { return !decl->linkage.is_builtin && !structHasFlexibleArray(decl); }
    124125private:
    125126        void genFuncBody( ast::FunctionDecl * decl ) final;
     
    183184        {
    184185                // TODO: These functions are somewhere between intrinsic and autogen,
    185                 // could possibly use a new linkage type. For now we just make them
    186                 // intrinsic to code-gen them as C assignments.
    187                 proto_linkage = ast::Linkage::Intrinsic;
     186                // could possibly use a new linkage type. For now we just make the
     187                // basic ones intrinsic to code-gen them as C assignments.
     188                const auto & real_type = decl->base;
     189                const auto & basic = real_type.as<ast::BasicType>();
     190                if(!real_type || (basic && basic->isInteger())) proto_linkage = ast::Linkage::Intrinsic;
    188191        }
    189192
     
    402405        auto retval = srcParam();
    403406        retval->name = "_ret";
    404         // xxx - Adding this unused attribute can silence unused variable warnings
    405         // However, some code might not be compiled as expected
    406         // Temporarily disabled
    407         // retval->attributes.push_back(new ast::Attribute("unused"));
    408407        return genProto( "?=?", { dstParam(), srcParam() }, { retval } );
    409408}
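
Two behavioural changes sit in this file: autogen is now skipped for structs with a flexible array member, and enum assignment is only made intrinsic when the enum has no base type or an integral one. The first rule in plain terms (a sketch; 'char data[]' is the C flexible array member, a GNU extension in C++):

    struct Packet {
        int len;
        char data[];     // flexible array member: sizeof(Packet) excludes it
    };
    // A generated field-by-field ?=? would copy only 'len' and silently drop
    // the trailing bytes, so no copy/assign routines should be autogenerated.

    int main() { return (int)sizeof( Packet ); }
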
  • src/Validate/Autogen.hpp

    r9e23b446 rffec1bf  
    2222namespace Validate {
    2323
     24/// Generate routines for all data types in the translation unit.
     25/// Many passes must run either before or after this pass.
    2426void autogenerateRoutines( ast::TranslationUnit & translationUnit );
    2527
  • src/Validate/CompoundLiteral.hpp

    r9e23b446 rffec1bf  
    2323
    2424/// Use variables to implement compound literals.
     25/// Must happen after auto-gen routines are added.
    2526void handleCompoundLiterals( ast::TranslationUnit & translationUnit );
    2627
  • src/Validate/EliminateTypedef.cpp

    r9e23b446 rffec1bf  
    1010// Created On       : Wed Apr 20 16:37:00 2022
    1111// Last Modified By : Andrew Beach
    12 // Last Modified On : Mon Apr 25 14:26:00 2022
    13 // Update Count     : 0
     12// Last Modified On : Mon Jul 11 16:30:00 2022
     13// Update Count     : 1
    1414//
    1515
     
    2828
    2929struct EliminateTypedefCore {
     30        // Remove typedefs from inside aggregates.
    3031        ast::StructDecl const * previsit( ast::StructDecl const * decl );
    3132        ast::UnionDecl const * previsit( ast::UnionDecl const * decl );
     33        // Remove typedefs from statement lists.
    3234        ast::CompoundStmt const * previsit( ast::CompoundStmt const * stmt );
     35        // Remove typedefs from control structure initializers.
     36        ast::IfStmt const * previsit( ast::IfStmt const * stmt );
     37        ast::ForStmt const * previsit( ast::ForStmt const * stmt );
     38        ast::WhileDoStmt const * previsit( ast::WhileDoStmt const * stmt );
    3339};
    3440
     
    6369}
    6470
     71ast::IfStmt const * EliminateTypedefCore::previsit( ast::IfStmt const * stmt ) {
     72        return field_erase_if( stmt, &ast::IfStmt::inits, isTypedefStmt );
     73}
     74
     75ast::ForStmt const * EliminateTypedefCore::previsit( ast::ForStmt const * stmt ) {
     76        return field_erase_if( stmt, &ast::ForStmt::inits, isTypedefStmt );
     77}
     78
     79ast::WhileDoStmt const * EliminateTypedefCore::previsit( ast::WhileDoStmt const * stmt ) {
     80        return field_erase_if( stmt, &ast::WhileDoStmt::inits, isTypedefStmt );
     81}
     82
    6583} // namespace
    6684
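
The pass's core rewrite, now extended by the three new previsits to if/for/while initializers, erases a typedef declaration once its uses are replaced. A hypothetical before/after for the pre-existing CompoundStmt case, which is plain C:

    /* before the pass */
    void before( void ) {
        typedef int T;               // typedef statement inside a compound statement
        T x = 0;                     // use of the typedef
        (void)x;
    }

    /* after the pass (equivalent output, sketched by hand) */
    void after( void ) {
        int x = 0;                   // typedef erased, use replaced by the base type
        (void)x;
    }

    int main( void ) { before(); after(); return 0; }
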
  • src/Validate/FindSpecialDecls.h

    r9e23b446 rffec1bf  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // FindSpecialDeclarations.h --
     7// FindSpecialDeclarations.h -- Find special declarations used in the compiler.
    88//
    99// Author           : Rob Schluntz
     
    4343        void findSpecialDecls( std::list< Declaration * > & translationUnit );
    4444
    45 /// find and remember some of the special declarations that are useful for
     45/// Find and remember some of the special declarations that are useful for
    4646/// generating code, so that they do not have to be discovered multiple times.
    4747void findGlobalDecls( ast::TranslationUnit & translationUnit );
  • src/Validate/FixQualifiedTypes.cpp

    r9e23b446 rffec1bf  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // FixQualifiedTypes.cpp --
     7// FixQualifiedTypes.cpp -- Replace the qualified type with a direct type.
    88//
    99// Author           : Andrew Beach
     
    7676                                                        ret->qualifiers = type->qualifiers;
    7777                                                        ast::TypeSubstitution sub( aggr->params, instp->params );
    78                                                         // = parent->genericSubstitution();
    7978                                                        auto result = sub.apply(ret);
    8079                                                        return result.node.release();
  • src/Validate/FixQualifiedTypes.hpp

    r9e23b446 rffec1bf  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // FixQualifiedTypes.hpp --
     7// FixQualifiedTypes.hpp -- Replace the qualified type with a direct type.
    88//
    99// Author           : Andrew Beach
     
    2222namespace Validate {
    2323
     24/// Replaces qualified types with an unqualified NamedTypeDecl.
     25/// Must happen after Link References To Types,
     26/// because aggregate members are accessed.
    2427void fixQualifiedTypes( ast::TranslationUnit & translationUnit );
    2528
  • src/Validate/ForallPointerDecay.hpp

    r9e23b446 rffec1bf  
    2929/// Also checks that operator names are used properly on functions and
    3030/// assigns unique IDs. This is a "legacy" pass.
     31/// Must run after Implement Concurrent Keywords, because uniqueIds must be
     32/// set on declarations before resolution.
     33/// Must happen before auto-gen routines are added.
    3134void decayForallPointers( ast::TranslationUnit & transUnit );
    3235
  • src/Validate/GenericParameter.cpp

    r9e23b446 rffec1bf  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // GenericParameter.hpp --
     7// GenericParameter.hpp -- Generic parameter related passes.
    88//
    99// Author           : Andrew Beach
  • src/Validate/GenericParameter.hpp

    r9e23b446 rffec1bf  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // GenericParameter.hpp --
     7// GenericParameter.hpp -- Generic parameter related passes.
    88//
    99// Author           : Andrew Beach
     
    2323
    2424        /// Perform substitutions for generic parameters and fill in defaults.
     25/// Check as early as possible; this cannot happen before Link References to
     26/// Types, and it was observed failing when attempted before Eliminate Typedef.
    2527void fillGenericParameters( ast::TranslationUnit & translationUnit );
    2628
  • src/Validate/HoistStruct.hpp

    r9e23b446 rffec1bf  
    2222namespace Validate {
    2323
    24 /// Flattens nested type declarations.
     24/// Flattens nested type declarations. (Run right after Fix Qualified Types.)
    2525void hoistStruct( ast::TranslationUnit & translationUnit );
    2626
  • src/Validate/LabelAddressFixer.cpp

    r9e23b446 rffec1bf  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // LabelAddressFixer.cpp --
     7// LabelAddressFixer.cpp -- Create label address expressions.
    88//
    99// Author           : Andrew Beach
  • src/Validate/LabelAddressFixer.hpp

    r9e23b446 rffec1bf  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // LabelAddressFixer.hpp --
     7// LabelAddressFixer.hpp -- Create label address expressions.
    88//
    99// Author           : Andrew Beach
     
    2020namespace Validate {
    2121
     22/// Label addresses are not actually created in the parser; this pass finds
     23/// the patterns that represent label address expressions.
    2224void fixLabelAddresses( ast::TranslationUnit & translationUnit );
    2325
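
The construct being recovered is GCC's label-address extension: '&&label' reaches the parser looking like an ordinary address-of applied to a name, and this pass pattern-matches that shape into a proper label-address expression. The feature itself, as GNU C/C++:

    #include <cstdio>

    int main() {
        void * where = &&done;       // GNU extension: take the address of a label
        goto *where;                 // computed goto through that address
        std::printf( "skipped\n" );  // never runs
    done:
        std::printf( "done\n" );
        return 0;
    }
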
  • src/Validate/module.mk

    r9e23b446 rffec1bf  
    2626        Validate/EliminateTypedef.cpp \
    2727        Validate/EliminateTypedef.hpp \
     28        Validate/EnumAndPointerDecay.cpp \
     29        Validate/EnumAndPointerDecay.hpp \
    2830        Validate/FindSpecialDeclsNew.cpp \
    2931        Validate/FixQualifiedTypes.cpp \
    3032        Validate/FixQualifiedTypes.hpp \
     33        Validate/FixReturnTypes.cpp \
     34        Validate/FixReturnTypes.hpp \
    3135        Validate/ForallPointerDecay.cpp \
    3236        Validate/ForallPointerDecay.hpp \
     
    3741        Validate/HoistStruct.cpp \
    3842        Validate/HoistStruct.hpp \
     43        Validate/HoistTypeDecls.cpp \
     44        Validate/HoistTypeDecls.hpp \
    3945        Validate/InitializerLength.cpp \
    4046        Validate/InitializerLength.hpp \
    4147        Validate/LabelAddressFixer.cpp \
    4248        Validate/LabelAddressFixer.hpp \
     49        Validate/LinkReferenceToTypes.cpp \
     50        Validate/LinkReferenceToTypes.hpp \
    4351        Validate/NoIdSymbolTable.hpp \
     52        Validate/ReplaceTypedef.cpp \
     53        Validate/ReplaceTypedef.hpp \
    4454        Validate/ReturnCheck.cpp \
    45         Validate/ReturnCheck.hpp
     55        Validate/ReturnCheck.hpp \
     56        Validate/VerifyCtorDtorAssign.cpp \
     57        Validate/VerifyCtorDtorAssign.hpp
    4658
    4759SRCDEMANGLE += $(SRC_VALIDATE)
  • src/Virtual/Tables.h

    r9e23b446 rffec1bf  
    1919#include "AST/Fwd.hpp"
    2020class Declaration;
     21class Expression;
     22class FunctionDecl;
     23class Initializer;
     24class ObjectDecl;
    2125class StructDecl;
    22 class Expression;
     26class StructInstType;
     27class Type;
    2328
    2429namespace Virtual {
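
The names added here are forward declarations only: the header forms pointers and references to these AST classes, so complete definitions are unnecessary and would only add include weight. The general rule, in miniature:

    class ObjectDecl;                     // forward declaration: type is incomplete here

    ObjectDecl * makeVtableInstance();    // fine: pointers to incomplete types are allowed
    void take( const ObjectDecl & );      // fine: references too
    // int n = sizeof(ObjectDecl);        // error: would need the complete definition
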
  • src/main.cc

    r9e23b446 rffec1bf  
    1010// Created On       : Fri May 15 23:12:02 2015
    1111// Last Modified By : Andrew Beach
    12 // Last Modified On : Tue Jun  7 13:29:00 2022
    13 // Update Count     : 674
     12// Last Modified On : Mon Jul 18 11:08:00 2022
     13// Update Count     : 676
    1414//
    1515
     
    7878#include "Validate/CompoundLiteral.hpp"     // for handleCompoundLiterals
    7979#include "Validate/EliminateTypedef.hpp"    // for eliminateTypedef
     80#include "Validate/EnumAndPointerDecay.hpp" // for decayEnumsAndPointers
    8081#include "Validate/FindSpecialDecls.h"      // for findGlobalDecls
    8182#include "Validate/FixQualifiedTypes.hpp"   // for fixQualifiedTypes
     83#include "Validate/FixReturnTypes.hpp"      // for fixReturnTypes
    8284#include "Validate/ForallPointerDecay.hpp"  // for decayForallPointers
    8385#include "Validate/GenericParameter.hpp"    // for fillGenericParameters, tr...
    8486#include "Validate/HoistStruct.hpp"         // for hoistStruct
     87#include "Validate/HoistTypeDecls.hpp"      // for hoistTypeDecls
    8588#include "Validate/InitializerLength.hpp"   // for setLengthFromInitializer
    8689#include "Validate/LabelAddressFixer.hpp"   // for fixLabelAddresses
     90#include "Validate/LinkReferenceToTypes.hpp" // for linkReferenceToTypes
     91#include "Validate/ReplaceTypedef.hpp"      // for replaceTypedef
    8792#include "Validate/ReturnCheck.hpp"         // for checkReturnStatements
     93#include "Validate/VerifyCtorDtorAssign.hpp" // for verifyCtorDtorAssign
    8894#include "Virtual/ExpandCasts.h"            // for expandCasts
    8995
     
    324330                Stats::Time::StopBlock();
    325331
    326                 PASS( "Translate Exception Declarations", ControlStruct::translateExcept( translationUnit ) );
    327                 if ( exdeclp ) {
    328                         dump( translationUnit );
    329                         return EXIT_SUCCESS;
    330                 } // if
    331 
    332                 // add the assignment statement after the initialization of a type parameter
    333                 PASS( "Validate-A", SymTab::validate_A( translationUnit ) );
    334 
    335                 // Must happen before auto-gen, because it uses the sized flag.
    336                 PASS( "Link Reference To Types", SymTab::linkReferenceToTypes( translationUnit ) );
    337 
    338                 CodeTools::fillLocations( translationUnit );
    339 
    340332                if( useNewAST ) {
    341                         CodeTools::fillLocations( translationUnit );
    342 
    343333                        if (Stats::Counters::enabled) {
    344334                                ast::pass_visitor_stats.avg = Stats::Counters::build<Stats::Counters::AverageCounter<double>>("Average Depth - New");
     
    349339                        forceFillCodeLocations( transUnit );
    350340
    351                         // Must happen after Link References To Types,
    352                         // because aggregate members are accessed.
     341                        PASS( "Translate Exception Declarations", ControlStruct::translateExcept( transUnit ) );
     342                        if ( exdeclp ) {
     343                                dump( move( transUnit ) );
     344                                return EXIT_SUCCESS;
     345                        }
     346
     347                        PASS( "Verify Ctor, Dtor & Assign", Validate::verifyCtorDtorAssign( transUnit ) );
     348                        PASS( "Hoist Type Decls", Validate::hoistTypeDecls( transUnit ) );
     349                        // Hoist Type Decls pulls some declarations out of contexts where
     350                        // locations are not tracked. Perhaps they should be, but for now
     351                        // the full fill of code locations below works around it.
     352                        forceFillCodeLocations( transUnit );
     353
     354                        PASS( "Replace Typedefs", Validate::replaceTypedef( transUnit ) );
     355                        PASS( "Fix Return Types", Validate::fixReturnTypes( transUnit ) );
     356                        PASS( "Enum and Pointer Decay", Validate::decayEnumsAndPointers( transUnit ) );
     357
     358                        PASS( "Link Reference To Types", Validate::linkReferenceToTypes( transUnit ) );
     359
    353360                        PASS( "Fix Qualified Types", Validate::fixQualifiedTypes( transUnit ) );
    354 
    355361                        PASS( "Hoist Struct", Validate::hoistStruct( transUnit ) );
    356362                        PASS( "Eliminate Typedef", Validate::eliminateTypedef( transUnit ) );
    357 
    358                         // Check as early as possible. Can't happen before
    359                         // LinkReferenceToType, observed failing when attempted
    360                         // before eliminateTypedef
    361363                        PASS( "Validate Generic Parameters", Validate::fillGenericParameters( transUnit ) );
    362 
    363364                        PASS( "Translate Dimensions", Validate::translateDimensionParameters( transUnit ) );
    364365                        PASS( "Check Function Returns", Validate::checkReturnStatements( transUnit ) );
    365 
    366                         // Must happen before Autogen.
    367366                        PASS( "Fix Return Statements", InitTweak::fixReturnStatements( transUnit ) );
    368 
    369367                        PASS( "Implement Concurrent Keywords", Concurrency::implementKeywords( transUnit ) );
    370 
    371                         // Must be after implement concurrent keywords; because uniqueIds
    372                         //   must be set on declaration before resolution.
    373                         // Must happen before autogen routines are added.
    374368                        PASS( "Forall Pointer Decay", Validate::decayForallPointers( transUnit ) );
    375 
    376                         // Must happen before autogen routines are added.
    377369                        PASS( "Hoist Control Declarations", ControlStruct::hoistControlDecls( transUnit ) );
    378370
    379                         // Must be after enum and pointer decay.
    380                         // Must be before compound literals.
    381371                        PASS( "Generate Autogen Routines", Validate::autogenerateRoutines( transUnit ) );
    382372
     
    454444                        translationUnit = convert( move( transUnit ) );
    455445                } else {
    456                         PASS( "Validate-B", SymTab::validate_B( translationUnit ) );
    457                         PASS( "Validate-C", SymTab::validate_C( translationUnit ) );
    458                         PASS( "Validate-D", SymTab::validate_D( translationUnit ) );
    459                         PASS( "Validate-E", SymTab::validate_E( translationUnit ) );
    460                         PASS( "Validate-F", SymTab::validate_F( translationUnit ) );
     446                        PASS( "Translate Exception Declarations", ControlStruct::translateExcept( translationUnit ) );
     447                        if ( exdeclp ) {
     448                                dump( translationUnit );
     449                                return EXIT_SUCCESS;
     450                        } // if
     451
     452                        // add the assignment statement after the initialization of a type parameter
     453                        PASS( "Validate", SymTab::validate( translationUnit ) );
    461454
    462455                        if ( symtabp ) {
  • tests/.expect/attributes.nast.arm64.txt

    r9e23b446 rffec1bf  
    13341334    }
    13351335    inline enum __anonymous4 _X16_operator_assignFM12__anonymous4_M12__anonymous4M12__anonymous4_intrinsic___2(enum __anonymous4 *_X4_dstM12__anonymous4_2, enum __anonymous4 _X4_srcM12__anonymous4_2){
    1336         enum __anonymous4 _X4_retM12__anonymous4_2;
    13371336        {
    13381337            ((void)((*_X4_dstM12__anonymous4_2)=_X4_srcM12__anonymous4_2));
    13391338        }
    13401339
    1341         {
    1342             ((void)(_X4_retM12__anonymous4_2=(*_X4_dstM12__anonymous4_2)) /* ?{} */);
    1343         }
    1344 
    1345         return _X4_retM12__anonymous4_2;
     1340        return (*_X4_dstM12__anonymous4_2);
    13461341    }
    13471342    {
  • tests/.expect/attributes.nast.x64.txt

    r9e23b446 rffec1bf  
    13341334    }
    13351335    inline enum __anonymous4 _X16_operator_assignFM12__anonymous4_M12__anonymous4M12__anonymous4_intrinsic___2(enum __anonymous4 *_X4_dstM12__anonymous4_2, enum __anonymous4 _X4_srcM12__anonymous4_2){
    1336         enum __anonymous4 _X4_retM12__anonymous4_2;
    13371336        {
    13381337            ((void)((*_X4_dstM12__anonymous4_2)=_X4_srcM12__anonymous4_2));
  • tests/.expect/attributes.nast.x86.txt

    r9e23b446 rffec1bf  
    13341334    }
    13351335    inline enum __anonymous4 _X16_operator_assignFM12__anonymous4_M12__anonymous4M12__anonymous4_intrinsic___2(enum __anonymous4 *_X4_dstM12__anonymous4_2, enum __anonymous4 _X4_srcM12__anonymous4_2){
    1336         enum __anonymous4 _X4_retM12__anonymous4_2;
    13371336        {
    13381337            ((void)((*_X4_dstM12__anonymous4_2)=_X4_srcM12__anonymous4_2));
  • tests/.expect/attributes.oast.x64.txt

    r9e23b446 rffec1bf  
    13341334    }
    13351335    inline enum __anonymous4 _X16_operator_assignFM12__anonymous4_M12__anonymous4M12__anonymous4_intrinsic___2(enum __anonymous4 *_X4_dstM12__anonymous4_2, enum __anonymous4 _X4_srcM12__anonymous4_2){
    1336         enum __anonymous4 _X4_retM12__anonymous4_2;
    13371336        {
    13381337            ((void)((*_X4_dstM12__anonymous4_2)=_X4_srcM12__anonymous4_2));
  • tests/alloc2.cfa

    r9e23b446 rffec1bf  
    1111typedef struct S1 T1;
    1212
    13 void test_base( void * ip, size_t size, size_t align) {
     13void test_base( void * ip, size_t size, size_t align ) {
    1414        tests_total += 1;
    15 //      printf("DEBUG: starting test %d\n", tests_total);
    16         bool passed = (malloc_size(ip) == size) && (malloc_usable_size(ip) >= size) && (malloc_alignment(ip) == align) && ((uintptr_t)ip % align  == 0);
    17         if (!passed) {
    18                 printf("failed test %3d: %4zu %4zu but got %4zu ( %3zu ) %4zu\n", tests_total, size, align, malloc_size(ip), malloc_usable_size(ip), malloc_alignment(ip));
     15//      printf( "DEBUG: starting test %d\n", tests_total);
     16        bool passed = (malloc_size( ip ) == size) && (malloc_usable_size( ip ) >= size) && (malloc_alignment( ip ) == align) && ((uintptr_t)ip % align  == 0);
     17        if ( ! passed ) {
     18                printf( "failed test %3d: %4zu %4zu but got %4zu ( %3zu ) %4zu\n", tests_total, size, align, malloc_size( ip ), malloc_usable_size( ip ), malloc_alignment( ip ) );
    1919                tests_failed += 1;
    20         }
    21 //      printf("DEBUG: done test %d\n", tests_total);
     20        } // if
     21//      printf( "DEBUG: done test %d\n", tests_total);
    2222}
    2323
    24 void test_fill( void * ip_, size_t start, size_t end, char fill) {
     24void test_fill( void * ip_, size_t start, size_t end, char fill ) {
    2525        tests_total += 1;
    26 //      printf("DEBUG: starting test %d\n", tests_total);
     26//      printf( "DEBUG: starting test %d\n", tests_total );
    2727        bool passed = true;
    2828        char * ip = (char *) ip_;
    29         for (i; start ~ end) passed = passed && (ip[i] == fill);
    30         if (!passed) {
    31                 printf("failed test %3d: fill C\n", tests_total);
     29        for ( i; start ~ end ) passed = passed && (ip[i] == fill);
     30        if ( ! passed ) {
     31                printf( "failed test %3d: fill C\n", tests_total );
    3232                tests_failed += 1;
    33         }
    34 //      printf("DEBUG: done test %d\n", tests_total);
     33        } // if
     34//      printf( "DEBUG: done test %d\n", tests_total );
    3535}
    3636
    37 void test_fill( void * ip_, size_t start, size_t end, int fill) {
     37void test_fill( void * ip_, size_t start, size_t end, int fill ) {
    3838        tests_total += 1;
    39 //      printf("DEBUG: starting test %d\n", tests_total);
     39//      printf( "DEBUG: starting test %d\n", tests_total );
    4040        bool passed = true;
    41         int * ip = (int *) ip_;
    42         for (i; start ~ end) passed = passed && (ip[i] == fill);
    43         if (!passed) {
    44                 printf("failed test %3d: fill int\n", tests_total);
     41        int * ip = (int *)ip_;
     42        for ( i; start ~ end ) passed = passed && (ip[i] == fill);
     43        if ( ! passed ) {
     44                printf( "failed test %3d: fill int\n", tests_total );
    4545                tests_failed += 1;
    46         }
    47 //      printf("DEBUG: done test %d\n", tests_total);
     46        } // if
     47//      printf( "DEBUG: done test %d\n", tests_total );
    4848}
    4949
    50 void test_fill( void * ip_, size_t start, size_t end, int * fill) {
     50void test_fill( void * ip_, size_t start, size_t end, int * fill ) {
    5151        tests_total += 1;
    52 //      printf("DEBUG: starting test %d\n", tests_total);
    53         bool passed = (memcmp((void*)((uintptr_t)ip_ + start), (void*)fill, end) == 0);
    54         if (!passed) {
    55                 printf("failed test %3d: fill int A\n", tests_total);
     52//      printf( "DEBUG: starting test %d\n", tests_total );
     53        bool passed = memcmp( (void*)((uintptr_t)ip_ + start), (void*)fill, end ) == 0;
     54        if ( ! passed ) {
     55                printf( "failed test %3d: fill int A\n", tests_total );
    5656                tests_failed += 1;
    57         }
    58 //      printf("DEBUG: done test %d\n", tests_total);
     57        } // if
     58//      printf( "DEBUG: done test %d\n", tests_total );
    5959}
    6060
    61 void test_fill( void * ip_, size_t start, size_t end, T1 fill) {
     61void test_fill( void * ip_, size_t start, size_t end, T1 fill ) {
    6262        tests_total += 1;
    63 //      printf("DEBUG: starting test %d\n", tests_total);
     63//      printf( "DEBUG: starting test %d\n", tests_total );
    6464        bool passed = true;
    6565        T1 * ip = (T1 *) ip_;
    66         for (i; start ~ end) passed = passed && (ip[i].data == fill.data);
    67         if (!passed) {
    68                 printf("failed test %3d: fill T1\n", tests_total);
     66        for ( i; start ~ end ) passed = passed && (ip[i].data == fill.data );
     67        if ( ! passed ) {
     68                printf( "failed test %3d: fill T1\n", tests_total );
    6969                tests_failed += 1;
    70         }
    71 //      printf("DEBUG: done test %d\n", tests_total);
     70        } // if
     71//      printf( "DEBUG: done test %d\n", tests_total );
    7272}
    7373
    74 void test_fill( void * ip_, size_t start, size_t end, T1 * fill) {
     74void test_fill( void * ip_, size_t start, size_t end, T1 * fill ) {
    7575        tests_total += 1;
    76 //      printf("DEBUG: starting test %d\n", tests_total);
    77         bool passed = (memcmp((void*)((uintptr_t)ip_ + start), (void*)fill, end) == 0);
    78         if (!passed) {
    79                 printf("failed test %3d: fill T1 A\n", tests_total);
     76//      printf( "DEBUG: starting test %d\n", tests_total );
     77        bool passed = memcmp( (void*)((uintptr_t)ip_ + start), (void*)fill, end ) == 0;
     78        if ( ! passed ) {
     79                printf( "failed test %3d: fill T1 A\n", tests_total );
    8080                tests_failed += 1;
    81         }
    82 //      printf("DEBUG: done test %d\n", tests_total);
     81        } // if
     82//      printf( "DEBUG: done test %d\n", tests_total );
    8383}
    8484
    85 void test_use( int * ip, size_t dim) {
     85void test_use( int * ip, size_t dim ) {
    8686        tests_total += 1;
    87 //      printf("DEBUG: starting test %d\n", tests_total);
     87//      printf( "DEBUG: starting test %d\n", tests_total );
    8888        bool passed = true;
    89         for (i; 0 ~ dim) ip[i] = 0xdeadbeef;
    90         for (i; 0 ~ dim) passed = passed &&  (ip[i] == 0xdeadbeef);
    91         if (!passed) {
    92                 printf("failed test %3d: use int\n", tests_total);
     89        for ( i; 0 ~ dim ) ip[i] = 0xdeadbeef;
     90        for ( i; 0 ~ dim ) passed = passed &&  (ip[i] == 0xdeadbeef);
     91        if ( ! passed ) {
     92                printf( "failed test %3d: use int\n", tests_total );
    9393                tests_failed += 1;
    94         }
    95 //      printf("DEBUG: done test %d\n", tests_total);
     94        } // if
     95//      printf( "DEBUG: done test %d\n", tests_total );
    9696}
    9797
    98 void test_use( T1 * ip, size_t dim) {
     98void test_use( T1 * ip, size_t dim ) {
    9999        tests_total += 1;
    100 //      printf("DEBUG: starting test %d\n", tests_total);
     100//      printf( "DEBUG: starting test %d\n", tests_total );
    101101        bool passed = true;
    102         for (i; 0 ~ dim) ip[i].data = 0xdeadbeef;
    103         for (i; 0 ~ dim) passed = passed &&  (ip[i].data == 0xdeadbeef);
    104         if (!passed) {
    105                 printf("failed test %3d: use T1\n", tests_total);
     102        for ( i; 0 ~ dim ) ip[i].data = 0xdeadbeef;
     103        for ( i; 0 ~ dim ) passed = passed &&  (ip[i].data == 0xdeadbeef);
     104        if ( ! passed ) {
     105                printf( "failed test %3d: use T1\n", tests_total );
    106106                tests_failed += 1;
    107         }
    108 //      printf("DEBUG: done test %d\n", tests_total);
     107        } // if
     108//      printf( "DEBUG: done test %d\n", tests_total );
    109109}
    110110
    111111int main( void ) {
     112        enum { dim = 8, align = 64, libAlign = libAlign() };
    112113        size_t elemSize = sizeof(int);
    113         size_t dim = 8;
    114114        size_t size = dim * elemSize;
    115         size_t align = 64;
    116         const size_t libAlign = libAlign();
    117 
    118         int     FillT = 9;
    119         char    FillC = 'a';
    120         int   * FillA = calloc(dim / 4);
    121         T1          FillT1 = { FillT };
    122         T1        * FillT1A = (T1 *)(void *) malloc( (dim / 4) * sizeof(T1) );
    123         for (i; 0 ~ (dim / 4) ) FillT1A[i] = FillT1;
    124 
    125         int             * ip;
    126         int     * op;
    127         double  * dp;
    128         T1      * t1p;
    129         T1          * t1op;
     115
     116        int FillT = 9;
     117        char FillC = 'a';
     118        int * FillA = calloc( dim / 4 );
     119        T1 FillT1 = { FillT };
     120        T1 * FillT1A = (T1 *)(void *) malloc( (dim / 4) * sizeof(T1) );
     121        for ( i; 0 ~ (dim / 4) ) FillT1A[i] = FillT1;
     122
     123        int * ip;
     124        int * op;
     125        double * dp;
     126        T1 * t1p;
     127        T1 * t1op;
    130128
    131129        // testing alloc
     
    136134
    137135        ip = alloc();
    138         test_base(ip, elemSize, libAlign);
    139         test_use(ip, elemSize / elemSize);
    140         free(ip);
     136        test_base( ip, elemSize, libAlign );
     137        test_use( ip, elemSize / elemSize );
     138        free( ip );
    141139
    142140        ip = alloc( dim );
    143         test_base(ip, size, libAlign);
    144         test_use(ip, size / elemSize);
    145         free(ip);
     141        test_base( ip, size, libAlign );
     142        test_use( ip, size / elemSize );
     143        free( ip );
    146144
    147145        ip = alloc( 0 );
    148         test_base(ip, 0, libAlign);
    149         free(ip);
     146        test_base( ip, 0, libAlign );
     147        free( ip );
    150148
    151149        dp = alloc( dim );
    152150        ip = alloc( dp`resize );
    153         test_base(ip, elemSize, libAlign);
    154         test_use(ip, elemSize / elemSize);
    155         free(ip);
    156 
    157         ip = alloc( ((double*)0p)`resize );
    158         test_base(ip, elemSize, libAlign);
    159         test_use(ip, elemSize / elemSize);
    160         free(ip);
     151        test_base( ip, elemSize, libAlign );
     152        test_use( ip, elemSize / elemSize );
     153        free( ip );
     154
     155        ip = alloc( ((double *)0p)`resize );
     156        test_base( ip, elemSize, libAlign );
     157        test_use( ip, elemSize / elemSize );
     158        free( ip );
    161159
    162160        dp = alloc( dim );
    163161        ip = alloc( dim, dp`resize );
    164         test_base(ip, size, libAlign);
    165         test_use(ip, size / elemSize);
    166         free(ip);
     162        test_base( ip, size, libAlign );
     163        test_use( ip, size / elemSize );
     164        free( ip );
    167165
    168166        dp = alloc( dim );
    169167        ip = alloc( 0, dp`resize );
    170         test_base(ip, 0, libAlign);
    171         free(ip);
    172 
    173         ip = alloc( dim, ((double*)0p)`resize );
    174         test_base(ip, size, libAlign);
    175         test_use(ip, size / elemSize);
    176         free(ip);
    177 
    178         ip = alloc( 0, ((double*)0p)`resize );
    179         test_base(ip, 0, libAlign);
    180         free(ip);
    181 
    182         op = alloc( dim, ((int)0xdeadbeef)`fill );
     168        test_base( ip, 0, libAlign );
     169        free( ip );
     170
     171        ip = alloc( dim, 0p`resize );
     172        test_base( ip, size, libAlign );
     173        test_use( ip, size / elemSize );
     174        free( ip );
     175
     176        ip = alloc( 0, 0p`resize );
     177        test_base( ip, 0, libAlign );
     178        free( ip );
     179
     180        op = alloc( dim, 0xdeadbeefN`fill );
    183181        ip = alloc( dim, op`realloc );
    184         test_base(ip, size, libAlign);
    185         test_fill(ip, 0, dim, (int)0xdeadbeef);
    186         test_use(ip, size / elemSize);
    187         free(ip);
    188 
    189         op = alloc( dim, ((int)0xdeadbeef)`fill );
     182        test_base( ip, size, libAlign );
     183        test_fill( ip, 0, dim, 0xdeadbeefN );
     184        test_use( ip, size / elemSize );
     185        free( ip );
     186
     187        op = alloc( dim, 0xdeadbeefN`fill );
    190188        ip = alloc( 0, op`realloc );
    191         test_base(ip, 0, libAlign);
    192         free(ip);
    193 
    194         ip = alloc( dim, ((int*)0p)`realloc );
    195         test_base(ip, size, libAlign);
    196         test_use(ip, size / elemSize);
    197         free(ip);
    198 
    199         ip = alloc( 0, ((int*)0p)`realloc );
    200         test_base(ip, 0, libAlign);
    201         free(ip);
    202 
    203         op = alloc( dim, ((int)0xdeadbeef)`fill );
     189        test_base( ip, 0, libAlign );
     190        free( ip );
     191
     192        ip = alloc( dim, 0p`realloc );
     193        test_base( ip, size, libAlign );
     194        test_use( ip, size / elemSize );
     195        free( ip );
     196
     197        ip = alloc( 0, 0p`realloc );
     198        test_base( ip, 0, libAlign );
     199        free( ip );
     200
     201        op = alloc( dim, 0xdeadbeefN`fill );
    204202        ip = alloc( dim, op`resize );
    205         test_base(ip, size, libAlign);
    206         test_use(ip, size / elemSize);
    207         free(ip);
     203        test_base( ip, size, libAlign );
     204        test_use( ip, size / elemSize );
     205        free( ip );
    208206
    209207        ip = alloc( FillC`fill );
    210         test_base(ip, elemSize, libAlign);
    211         test_fill(ip, 0, elemSize, FillC);
    212         test_use(ip, elemSize / elemSize);
    213         free(ip);
     208        test_base( ip, elemSize, libAlign );
     209        test_fill( ip, 0, elemSize, FillC );
     210        test_use( ip, elemSize / elemSize );
     211        free( ip );
    214212
    215213        ip = alloc( FillT`fill );
    216         test_base(ip, elemSize, libAlign);
    217         test_fill(ip, 0, 1, FillT);
    218         test_use(ip, elemSize / elemSize);
    219         free(ip);
     214        test_base( ip, elemSize, libAlign );
     215        test_fill( ip, 0, 1, FillT );
     216        test_use( ip, elemSize / elemSize );
     217        free( ip );
    220218
    221219        ip = alloc( dim, FillC`fill );
    222         test_base(ip, size, libAlign);
    223         test_fill(ip, 0, size, FillC);
    224         test_use(ip, size / elemSize);
    225         free(ip);
     220        test_base( ip, size, libAlign );
     221        test_fill( ip, 0, size, FillC );
     222        test_use( ip, size / elemSize );
     223        free( ip );
    226224
    227225        ip = alloc( 0, FillC`fill );
    228         test_base(ip, 0, libAlign);
    229         free(ip);
     226        test_base( ip, 0, libAlign );
     227        free( ip );
    230228
    231229        ip = alloc( dim, FillT`fill );
    232         test_base(ip, size, libAlign);
    233         test_fill(ip, 0, dim, FillT);
    234         test_use(ip, size / elemSize);
    235         free(ip);
     230        test_base( ip, size, libAlign );
     231        test_fill( ip, 0, dim, FillT );
     232        test_use( ip, size / elemSize );
     233        free( ip );
    236234
    237235        ip = alloc( 0, FillT`fill );
    238         test_base(ip, 0, libAlign);
    239         free(ip);
     236        test_base( ip, 0, libAlign );
     237        free( ip );
    240238
    241239        ip = alloc( dim, [FillA, dim/4]`fill );
    242         test_base(ip, size, libAlign);
    243         test_fill(ip, 0, size/4, FillA);
    244         test_use(ip, size / elemSize);
    245         free(ip);
     240        test_base( ip, size, libAlign );
     241        test_fill( ip, 0, size/4, FillA );
     242        test_use( ip, size / elemSize );
     243        free( ip );
    246244
    247245        ip = alloc( 0, [FillA, dim/4]`fill );
    248         test_base(ip, 0, libAlign);
    249         free(ip);
    250 
    251         op = alloc( dim, ((int)0xdeadbeef)`fill );
     246        test_base( ip, 0, libAlign );
     247        free( ip );
     248
     249        op = alloc( dim, 0xdeadbeefN`fill );
    252250        ip = alloc( dim, op`realloc, FillC`fill );
    253         test_base(ip, size, libAlign);
    254         test_fill(ip, 0, dim, (int)0xdeadbeef);
    255         test_use(ip, size / elemSize);
    256         free(ip);
    257 
    258         op = alloc( dim, ((int)0xdeadbeef)`fill );
     251        test_base( ip, size, libAlign );
     252        test_fill( ip, 0, dim, 0xdeadbeefN );
     253        test_use( ip, size / elemSize );
     254        free( ip );
     255
     256        op = alloc( dim, 0xdeadbeefN`fill );
    259257        ip = alloc( dim / 4, op`realloc, FillC`fill );
    260         test_base(ip, size / 4, libAlign);
    261         test_fill(ip, 0, dim / 4, (int)0xdeadbeef);
    262         test_use(ip, size / 4 / elemSize);
    263         free(ip);
    264 
    265         op = alloc( dim, ((int)0xdeadbeef)`fill );
     258        test_base( ip, size / 4, libAlign );
     259        test_fill( ip, 0, dim / 4, 0xdeadbeefN );
     260        test_use( ip, size / 4 / elemSize );
     261        free( ip );
     262
     263        op = alloc( dim, 0xdeadbeefN`fill );
    266264        ip = alloc( dim * 4, op`realloc, FillC`fill );
    267         test_base(ip, size * 4, libAlign);
    268         test_fill(ip, 0, dim, (int)0xdeadbeef);
    269         test_fill(ip, size, size * 4, FillC);
    270         test_use(ip, size * 4 / elemSize);
    271         free(ip);
    272 
    273         op = alloc( dim, ((int)0xdeadbeef)`fill );
     265        test_base( ip, size * 4, libAlign );
     266        test_fill( ip, 0, dim, 0xdeadbeefN );
     267        test_fill( ip, size, size * 4, FillC );
     268        test_use( ip, size * 4 / elemSize );
     269        free( ip );
     270
     271        op = alloc( dim, 0xdeadbeefN`fill );
    274272        ip = alloc( 0, op`realloc, FillC`fill );
    275         test_base(ip, 0, libAlign);
    276         free(ip);
    277 
    278         ip = alloc( dim, ((int*)0p)`realloc, FillC`fill );
    279         test_base(ip, size, libAlign);
    280         test_fill(ip, 0, size, FillC);
    281         test_use(ip, size / elemSize);
    282         free(ip);
    283 
    284         ip = alloc( 0, ((int*)0p)`realloc, FillC`fill );
    285         test_base(ip, 0, libAlign);
    286         free(ip);
    287 
    288         op = alloc( dim, ((int)0xdeadbeef)`fill );
     273        test_base( ip, 0, libAlign );
     274        free( ip );
     275
     276        ip = alloc( dim, 0p`realloc, FillC`fill );
     277        test_base( ip, size, libAlign );
     278        test_fill( ip, 0, size, FillC );
     279        test_use( ip, size / elemSize );
     280        free( ip );
     281
     282        ip = alloc( 0, 0p`realloc, FillC`fill );
     283        test_base( ip, 0, libAlign );
     284        free( ip );
     285
     286        op = alloc( dim, 0xdeadbeefN`fill );
    289287        ip = alloc( dim, op`realloc, FillT`fill );
    290         test_base(ip, size, libAlign);
    291         test_fill(ip, 0, dim, (int)0xdeadbeef);
    292         test_use(ip, size / elemSize);
    293         free(ip);
    294 
    295         op = alloc( dim, ((int)0xdeadbeef)`fill );
     288        test_base( ip, size, libAlign );
     289        test_fill( ip, 0, dim, 0xdeadbeefN );
     290        test_use( ip, size / elemSize );
     291        free( ip );
     292
     293        op = alloc( dim, 0xdeadbeefN`fill );
    296294        ip = alloc( dim / 4, op`realloc, FillT`fill );
    297         test_base(ip, size / 4, libAlign);
    298         test_fill(ip, 0, dim / 4, (int)0xdeadbeef);
    299         test_use(ip, size / 4 / elemSize);
    300         free(ip);
    301 
    302         op = alloc( dim, ((int)0xdeadbeef)`fill );
     295        test_base( ip, size / 4, libAlign );
     296        test_fill( ip, 0, dim / 4, 0xdeadbeefN );
     297        test_use( ip, size / 4 / elemSize );
     298        free( ip );
     299
     300        op = alloc( dim, 0xdeadbeefN`fill );
    303301        ip = alloc( dim * 4, op`realloc, FillT`fill );
    304         test_base(ip, size * 4, libAlign);
    305         test_fill(ip, 0, dim, (int)0xdeadbeef);
    306         test_fill(ip, dim, dim * 4, FillT);
    307         test_use(ip, size * 4 / elemSize);
    308         free(ip);
    309 
    310         op = alloc( dim, ((int)0xdeadbeef)`fill );
     302        test_base( ip, size * 4, libAlign );
     303        test_fill( ip, 0, dim, 0xdeadbeefN );
     304        test_fill( ip, dim, dim * 4, FillT );
     305        test_use( ip, size * 4 / elemSize );
     306        free( ip );
     307
     308        op = alloc( dim, 0xdeadbeefN`fill );
    311309        ip = alloc( 0, op`realloc, FillT`fill );
    312         test_base(ip, 0, libAlign);
    313         free(ip);
    314 
    315         ip = alloc( dim, ((int*)0p)`realloc, FillT`fill );
    316         test_base(ip, size, libAlign);
    317         test_fill(ip, 0, dim, FillT);
    318         test_use(ip, size / elemSize);
    319         free(ip);
    320 
    321         ip = alloc( 0, ((int*)0p)`realloc, FillT`fill );
    322         test_base(ip, 0, libAlign);
    323         free(ip);
     310        test_base( ip, 0, libAlign );
     311        free( ip );
     312
     313        ip = alloc( dim, 0p`realloc, FillT`fill );
     314        test_base( ip, size, libAlign );
     315        test_fill( ip, 0, dim, FillT );
     316        test_use( ip, size / elemSize );
     317        free( ip );
     318
     319        ip = alloc( 0, 0p`realloc, FillT`fill );
     320        test_base( ip, 0, libAlign );
     321        free( ip );
    324322
    325323        ip = alloc( align`align );
    326         test_base(ip, elemSize, align);
    327         test_use(ip, elemSize / elemSize);
    328         free(ip);
     324        test_base( ip, elemSize, align );
     325        test_use( ip, elemSize / elemSize );
     326        free( ip );
    329327
    330328        ip = alloc( dim, align`align );
    331         test_base(ip, size, align);
    332         test_use(ip, size / elemSize);
    333         free(ip);
     329        test_base( ip, size, align );
     330        test_use( ip, size / elemSize );
     331        free( ip );
    334332
    335333        ip = alloc( 0, align`align );
    336         test_base(ip, 0, libAlign);
    337         free(ip);
    338 
    339         op = alloc( dim, ((int)0xdeadbeef)`fill );
     334        test_base( ip, 0, libAlign );
     335        free( ip );
     336
     337        op = alloc( dim, 0xdeadbeefN`fill );
    340338        ip = alloc( op`realloc, align`align );
    341         test_base(ip, elemSize, align);
    342         test_fill(ip, 0, 1, (int)0xdeadbeef);
    343         test_use(ip, elemSize / elemSize);
    344         free(ip);
    345 
    346         ip = alloc( ((int*)0p)`realloc, align`align );
    347         test_base(ip, elemSize, align);
    348         test_use(ip, elemSize / elemSize);
    349         free(ip);
     339        test_base( ip, elemSize, align );
     340        test_fill( ip, 0, 1, 0xdeadbeefN );
     341        test_use( ip, elemSize / elemSize );
     342        free( ip );
     343
     344        ip = alloc( 0p`realloc, align`align );
     345        test_base( ip, elemSize, align );
     346        test_use( ip, elemSize / elemSize );
     347        free( ip );
    350348
    351349        dp = alloc( dim );
    352350        ip = alloc( dp`resize, align`align );
    353         test_base(ip, elemSize, align);
    354         test_use(ip, elemSize / elemSize);
    355         free(ip);
    356 
    357         ip = alloc( ((double*)0p)`resize, align`align );
    358         test_base(ip, elemSize, align);
    359         test_use(ip, elemSize / elemSize);
    360         free(ip);
    361 
    362         op = alloc( dim, ((int)0xdeadbeef)`fill);
     351        test_base( ip, elemSize, align );
     352        test_use( ip, elemSize / elemSize );
     353        free( ip );
     354
     355        ip = alloc( 0p`resize, align`align );
     356        test_base( ip, elemSize, align );
     357        test_use( ip, elemSize / elemSize );
     358        free( ip );
     359
     360        op = alloc( dim, 0xdeadbeefN`fill );
    363361        ip = alloc( dim, op`realloc, align`align );
    364         test_base(ip, size, align);
    365         test_fill(ip, 0, dim, (int)0xdeadbeef);
    366         test_use(ip, size / elemSize);
    367         free(ip);
    368 
    369         op = alloc( dim, ((int)0xdeadbeef)`fill );
     362        test_base( ip, size, align );
     363        test_fill( ip, 0, dim, 0xdeadbeefN );
     364        test_use( ip, size / elemSize );
     365        free( ip );
     366
     367        op = alloc( dim, 0xdeadbeefN`fill );
    370368        ip = alloc( 0, op`realloc, align`align );
    371         test_base(ip, 0, libAlign);
    372         free(ip);
    373 
    374         ip = alloc( dim, ((int*)0p)`realloc, align`align );
    375         test_base(ip, size, align);
    376         test_use(ip, size / elemSize);
    377         free(ip);
    378 
    379         ip = alloc( 0, ((int*)0p)`realloc, align`align );
    380         test_base(ip, 0, libAlign);
    381         free(ip);
     369        test_base( ip, 0, libAlign );
     370        free( ip );
     371
     372        ip = alloc( dim, 0p`realloc, align`align );
     373        test_base( ip, size, align );
     374        test_use( ip, size / elemSize );
     375        free( ip );
     376
     377        ip = alloc( 0, 0p`realloc, align`align );
     378        test_base( ip, 0, libAlign );
     379        free( ip );
    382380
    383381        ip = alloc( align`align, FillC`fill );
    384         test_base(ip, elemSize, align);
    385         test_fill(ip, 0, elemSize, FillC);
    386         test_use(ip, elemSize / elemSize);
    387         free(ip);
     382        test_base( ip, elemSize, align );
     383        test_fill( ip, 0, elemSize, FillC );
     384        test_use( ip, elemSize / elemSize );
     385        free( ip );
    388386
    389387        ip = alloc( align`align, FillT`fill );
    390         test_base(ip, elemSize, align);
    391         test_fill(ip, 0, 1, FillT);
    392         test_use(ip, elemSize / elemSize);
    393         free(ip);
     388        test_base( ip, elemSize, align );
     389        test_fill( ip, 0, 1, FillT );
     390        test_use( ip, elemSize / elemSize );
     391        free( ip );
    394392
    395393        ip = alloc( dim, align`align, FillC`fill );
    396         test_base(ip, size, align);
    397         test_fill(ip, 0, size, FillC);
    398         test_use(ip, size / elemSize);
    399         free(ip);
     394        test_base( ip, size, align );
     395        test_fill( ip, 0, size, FillC );
     396        test_use( ip, size / elemSize );
     397        free( ip );
    400398
    401399        ip = alloc( 0, align`align, FillC`fill );
    402         test_base(ip, 0, libAlign);
    403         free(ip);
     400        test_base( ip, 0, libAlign );
     401        free( ip );
    404402
    405403        ip = alloc( dim, align`align, FillT`fill );
    406         test_base(ip, size, align);
    407         test_fill(ip, 0, dim, FillT);
    408         test_use(ip, size / elemSize);
    409         free(ip);
     404        test_base( ip, size, align );
     405        test_fill( ip, 0, dim, FillT );
     406        test_use( ip, size / elemSize );
     407        free( ip );
    410408
    411409        ip = alloc( 0, align`align, FillT`fill );
    412         test_base(ip, 0, libAlign);
    413         free(ip);
     410        test_base( ip, 0, libAlign );
     411        free( ip );
    414412
    415413        ip = alloc( dim, align`align, [FillA, dim/4]`fill );
    416         test_base(ip, size, align);
    417         test_fill(ip, 0, size/4, FillA);
    418         test_use(ip, size / elemSize);
    419         free(ip);
     414        test_base( ip, size, align );
     415        test_fill( ip, 0, size/4, FillA );
     416        test_use( ip, size / elemSize );
     417        free( ip );
    420418
    421419        ip = alloc( 0, align`align, [FillA, dim/4]`fill );
    422         test_base(ip, 0, libAlign);
    423         free(ip);
    424 
    425         op = alloc( dim, ((int)0xdeadbeef)`fill );
     420        test_base( ip, 0, libAlign );
     421        free( ip );
     422
     423        op = alloc( dim, 0xdeadbeefN`fill );
    426424        ip = alloc( dim, op`realloc, align`align, FillC`fill );
    427         test_base(ip, size, align);
    428         test_fill(ip, 0, dim, (int)0xdeadbeef);
    429         test_use(ip, size / elemSize);
    430         free(ip);
    431 
    432         op = alloc( dim, ((int)0xdeadbeef)`fill );
     425        test_base( ip, size, align );
     426        test_fill( ip, 0, dim, 0xdeadbeefN );
     427        test_use( ip, size / elemSize );
     428        free( ip );
     429
     430        op = alloc( dim, 0xdeadbeefN`fill );
    433431        ip = alloc( dim / 4, op`realloc, align`align, FillC`fill );
    434         test_base(ip, size / 4, align);
    435         test_fill(ip, 0, dim / 4, (int)0xdeadbeef);
    436         test_use(ip, size / 4 / elemSize);
    437         free(ip);
    438 
    439         op = alloc( dim, ((int)0xdeadbeef)`fill );
     432        test_base( ip, size / 4, align );
     433        test_fill( ip, 0, dim / 4, 0xdeadbeefN );
     434        test_use( ip, size / 4 / elemSize );
     435        free( ip );
     436
     437        op = alloc( dim, 0xdeadbeefN`fill );
    440438        ip = alloc( dim * 4, op`realloc, align`align, FillC`fill );
    441         test_base(ip, size * 4, align);
    442         test_fill(ip, 0, dim, (int)0xdeadbeef);
    443         test_fill(ip, size, size * 4, FillC);
    444         test_use(ip, size * 4 / elemSize);
    445         free(ip);
    446 
    447         op = alloc( dim, ((int)0xdeadbeef)`fill );
     439        test_base( ip, size * 4, align );
     440        test_fill( ip, 0, dim, 0xdeadbeefN );
     441        test_fill( ip, size, size * 4, FillC );
     442        test_use( ip, size * 4 / elemSize );
     443        free( ip );
     444
     445        op = alloc( dim, 0xdeadbeefN`fill );
    448446        ip = alloc( 0, op`realloc, align`align, FillC`fill );
    449         test_base(ip, 0, libAlign);
    450         free(ip);
    451 
    452         ip = alloc( dim, ((int*)0p)`realloc, align`align, FillC`fill );
    453         test_base(ip, size, align);
    454         test_fill(ip, 0, size, FillC);
    455         test_use(ip, size / elemSize);
    456         free(ip);
    457 
    458         ip = alloc( 0, ((int*)0p)`realloc, align`align, FillC`fill );
    459         test_base(ip, 0, libAlign);
    460         free(ip);
    461 
    462         op = alloc( dim, ((int)0xdeadbeef)`fill );
     447        test_base( ip, 0, libAlign );
     448        free( ip );
     449
     450        ip = alloc( dim, 0p`realloc, align`align, FillC`fill );
     451        test_base( ip, size, align );
     452        test_fill( ip, 0, size, FillC );
     453        test_use( ip, size / elemSize );
     454        free( ip );
     455
     456        ip = alloc( 0, 0p`realloc, align`align, FillC`fill );
     457        test_base( ip, 0, libAlign );
     458        free( ip );
     459
     460        op = alloc( dim, 0xdeadbeefN`fill );
    463461        ip = alloc( dim, op`realloc, align`align, FillT`fill );
    464         test_base(ip, size, align);
    465         test_fill(ip, 0, dim, (int)0xdeadbeef);
    466         test_use(ip, size / elemSize);
    467         free(ip);
    468 
    469         op = alloc( dim, ((int)0xdeadbeef)`fill );
     462        test_base( ip, size, align );
     463        test_fill( ip, 0, dim, 0xdeadbeefN );
     464        test_use( ip, size / elemSize );
     465        free( ip );
     466
     467        op = alloc( dim, 0xdeadbeefN`fill );
    470468        ip = alloc( dim / 4, op`realloc, align`align, FillT`fill );
    471         test_base(ip, size / 4, align);
    472         test_fill(ip, 0, dim / 4, (int)0xdeadbeef);
    473         test_use(ip, size / 4 / elemSize);
    474         free(ip);
    475 
    476         op = alloc( dim, ((int)0xdeadbeef)`fill );
     469        test_base( ip, size / 4, align );
     470        test_fill( ip, 0, dim / 4, 0xdeadbeefN );
     471        test_use( ip, size / 4 / elemSize );
     472        free( ip );
     473
     474        op = alloc( dim, 0xdeadbeefN`fill );
    477475        ip = alloc( dim * 4, op`realloc, align`align, FillT`fill );
    478         test_base(ip, size * 4, align);
    479         test_fill(ip, 0, dim, (int)0xdeadbeef);
    480         test_fill(ip, dim, dim * 4, FillT);
    481         test_use(ip, size * 4 / elemSize);
    482         free(ip);
    483 
    484         op = alloc( dim, ((int)0xdeadbeef)`fill );
     476        test_base( ip, size * 4, align );
     477        test_fill( ip, 0, dim, 0xdeadbeefN );
     478        test_fill( ip, dim, dim * 4, FillT );
     479        test_use( ip, size * 4 / elemSize );
     480        free( ip );
     481
     482        op = alloc( dim, 0xdeadbeefN`fill );
    485483        ip = alloc( 0, op`realloc, align`align, FillT`fill );
    486         test_base(ip, 0, libAlign);
    487         free(ip);
    488 
    489         ip = alloc( dim, ((int*)0p)`realloc, align`align, FillT`fill );
    490         test_base(ip, size, align);
    491         test_fill(ip, 0, dim, FillT);
    492         test_use(ip, size / elemSize);
    493         free(ip);
    494 
    495         ip = alloc( 0, ((int*)0p)`realloc, align`align, FillT`fill );
    496         test_base(ip, 0, libAlign);
    497         free(ip);
    498 
    499         if (tests_failed == 0) printf("PASSED alloc tests\n\n");
    500         else printf("failed alloc tests : %d/%d\n\n", tests_failed, tests_total);
    501 
    502         // testing alloc (aligned struct)
     484        test_base( ip, 0, libAlign );
     485        free( ip );
     486
     487        ip = alloc( dim, 0p`realloc, align`align, FillT`fill );
     488        test_base( ip, size, align );
     489        test_fill( ip, 0, dim, FillT );
     490        test_use( ip, size / elemSize );
     491        free( ip );
     492
     493        ip = alloc( 0, 0p`realloc, align`align, FillT`fill );
     494        test_base( ip, 0, libAlign );
     495        free( ip );
     496
     497        if ( tests_failed == 0 ) printf( "PASSED alloc tests\n\n" );
     498        else printf( "failed alloc tests : %d/%d\n\n", tests_failed, tests_total );
     499
     500        // testing alloc ( aligned struct )
    503501
    504502        elemSize = sizeof(T1);
     
    509507
    510508        t1p = alloc();
    511         test_base(t1p, elemSize, tAlign);
    512         test_use(t1p, elemSize / elemSize);
    513         free(t1p);
     509        test_base( t1p, elemSize, tAlign );
     510        test_use( t1p, elemSize / elemSize );
     511        free( t1p );
    514512
    515513        t1p = alloc( dim );
    516         test_base(t1p, size, tAlign);
    517         test_use(t1p, size / elemSize);
    518         free(t1p);
     514        test_base( t1p, size, tAlign );
     515        test_use( t1p, size / elemSize );
     516        free( t1p );
    519517
    520518        t1p = alloc( 0 );
    521         test_base(t1p, 0, libAlign);
    522         free(t1p);
     519        test_base( t1p, 0, libAlign );
     520        free( t1p );
    523521
    524522        dp = alloc( dim );
    525523        t1p = alloc( dp`resize );
    526         test_base(t1p, elemSize, tAlign);
    527         test_use(t1p, elemSize / elemSize);
    528         free(t1p);
    529 
    530         t1p = alloc( ((double*)0p)`resize );
    531         test_base(t1p, elemSize, tAlign);
    532         test_use(t1p, elemSize / elemSize);
    533         free(t1p);
     524        test_base( t1p, elemSize, tAlign );
     525        test_use( t1p, elemSize / elemSize );
     526        free( t1p );
     527
     528        t1p = alloc( 0p`resize );
     529        test_base( t1p, elemSize, tAlign );
     530        test_use( t1p, elemSize / elemSize );
     531        free( t1p );
    534532
    535533        dp = alloc( dim );
    536534        t1p = alloc( dim, dp`resize );
    537         test_base(t1p, size, tAlign);
    538         test_use(t1p, size / elemSize);
    539         free(t1p);
     535        test_base( t1p, size, tAlign );
     536        test_use( t1p, size / elemSize );
     537        free( t1p );
    540538
    541539        dp = alloc( dim );
    542540        t1p = alloc( 0, dp`resize );
    543         test_base(t1p, 0, libAlign);
    544         free(t1p);
    545 
    546         t1p = alloc( dim, ((double*)0p)`resize );
    547         test_base(t1p, size, tAlign);
    548         test_use(t1p, size / elemSize);
    549         free(t1p);
    550 
    551         t1p = alloc( 0, ((double*)0p)`resize );
    552         test_base(t1p, 0, libAlign);
    553         free(t1p);
     541        test_base( t1p, 0, libAlign );
     542        free( t1p );
     543
     544        t1p = alloc( dim, 0p`resize );
     545        test_base( t1p, size, tAlign );
     546        test_use( t1p, size / elemSize );
     547        free( t1p );
     548
     549        t1p = alloc( 0, 0p`resize );
     550        test_base( t1p, 0, libAlign );
     551        free( t1p );
    554552
    555553        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    556554        t1p = alloc( dim, t1op`realloc );
    557         test_base(t1p, size, tAlign);
    558         test_fill(t1p, 0, dim, (T1){0xdeadbeef});
    559         test_use(t1p, size / elemSize);
    560         free(t1p);
     555        test_base( t1p, size, tAlign );
     556        test_fill( t1p, 0, dim, (T1){0xdeadbeef});
     557        test_use( t1p, size / elemSize );
     558        free( t1p );
    561559
    562560        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    563561        t1p = alloc( 0, t1op`realloc );
    564         test_base(t1p, 0, libAlign);
    565         free(t1p);
    566 
    567         t1p = alloc( dim, ((T1*)0p)`realloc );
    568         test_base(t1p, size, tAlign);
    569         test_use(t1p, size / elemSize);
    570         free(t1p);
    571 
    572         t1p = alloc( 0, ((T1*)0p)`realloc );
    573         test_base(t1p, 0, libAlign);
    574         free(t1p);
     562        test_base( t1p, 0, libAlign );
     563        free( t1p );
     564
     565        t1p = alloc( dim, 0p`realloc );
     566        test_base( t1p, size, tAlign );
     567        test_use( t1p, size / elemSize );
     568        free( t1p );
     569
     570        t1p = alloc( 0, 0p`realloc );
     571        test_base( t1p, 0, libAlign );
     572        free( t1p );
    575573
    576574        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    577575        t1p = alloc( dim, t1op`resize );
    578         test_base(t1p, size, tAlign);
    579         test_use(t1p, size / elemSize);
    580         free(t1p);
     576        test_base( t1p, size, tAlign );
     577        test_use( t1p, size / elemSize );
     578        free( t1p );
    581579
    582580        t1p = alloc( FillC`fill );
    583         test_base(t1p, elemSize, tAlign);
    584         test_fill(t1p, 0, elemSize, FillC);
    585         test_use(t1p, elemSize / elemSize);
    586         free(t1p);
     581        test_base( t1p, elemSize, tAlign );
     582        test_fill( t1p, 0, elemSize, FillC );
     583        test_use( t1p, elemSize / elemSize );
     584        free( t1p );
    587585
    588586        t1p = alloc( FillT1`fill );
    589         test_base(t1p, elemSize, tAlign);
    590         test_fill(t1p, 0, 1, FillT1);
    591         test_use(t1p, elemSize / elemSize);
    592         free(t1p);
     587        test_base( t1p, elemSize, tAlign );
     588        test_fill( t1p, 0, 1, FillT1);
     589        test_use( t1p, elemSize / elemSize );
     590        free( t1p );
    593591
    594592        t1p = alloc( dim, FillC`fill );
    595         test_base(t1p, size, tAlign);
    596         test_fill(t1p, 0, size, FillC);
    597         test_use(t1p, size / elemSize);
    598         free(t1p);
     593        test_base( t1p, size, tAlign );
     594        test_fill( t1p, 0, size, FillC );
     595        test_use( t1p, size / elemSize );
     596        free( t1p );
    599597
    600598        t1p = alloc( 0, FillC`fill );
    601         test_base(t1p, 0, libAlign);
    602         free(t1p);
     599        test_base( t1p, 0, libAlign );
     600        free( t1p );
    603601
    604602        t1p = alloc( dim, FillT1`fill );
    605         test_base(t1p, size, tAlign);
    606         test_fill(t1p, 0, dim, FillT1);
    607         test_use(t1p, size / elemSize);
    608         free(t1p);
     603        test_base( t1p, size, tAlign );
     604        test_fill( t1p, 0, dim, FillT1);
     605        test_use( t1p, size / elemSize );
     606        free( t1p );
    609607
    610608        t1p = alloc( 0, FillT1`fill );
    611         test_base(t1p, 0, libAlign);
    612         free(t1p);
     609        test_base( t1p, 0, libAlign );
     610        free( t1p );
    613611
    614612        t1p = alloc( dim, [FillT1A, dim / 4]`fill );
    615         test_base(t1p, size, tAlign);
    616         test_fill(t1p, 0, size/4, FillT1A);
    617         test_use(t1p, size / elemSize);
    618         free(t1p);
     613        test_base( t1p, size, tAlign );
     614        test_fill( t1p, 0, size/4, FillT1A );
     615        test_use( t1p, size / elemSize );
     616        free( t1p );
    619617
    620618        t1p = alloc( 0, [FillT1A, dim / 4]`fill );
    621         test_base(t1p, 0, libAlign);
    622         free(t1p);
     619        test_base( t1p, 0, libAlign );
     620        free( t1p );
    623621
    624622        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    625623        t1p = alloc( dim, t1op`realloc, FillC`fill );
    626         test_base(t1p, size, tAlign);
    627         test_fill(t1p, 0, dim, (T1){0xdeadbeef});
    628         test_use(t1p, size / elemSize);
    629         free(t1p);
     624        test_base( t1p, size, tAlign );
     625        test_fill( t1p, 0, dim, (T1){0xdeadbeef});
     626        test_use( t1p, size / elemSize );
     627        free( t1p );
    630628
    631629        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    632630        t1p = alloc( dim / 4, t1op`realloc, FillC`fill );
    633         test_base(t1p, size / 4, tAlign);
    634         test_fill(t1p, 0, dim / 4, (T1){0xdeadbeef});
    635         test_use(t1p, size / 4 / elemSize);
    636         free(t1p);
     631        test_base( t1p, size / 4, tAlign );
     632        test_fill( t1p, 0, dim / 4, (T1){0xdeadbeef});
     633        test_use( t1p, size / 4 / elemSize );
     634        free( t1p );
    637635
    638636        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    639637        t1p = alloc( dim * 4, t1op`realloc, FillC`fill );
    640         test_base(t1p, size * 4, tAlign);
    641         test_fill(t1p, 0, dim, (T1){0xdeadbeef});
    642         test_fill(t1p, size, size * 4, FillC);
    643         test_use(t1p, size * 4 / elemSize);
    644         free(t1p);
     638        test_base( t1p, size * 4, tAlign );
     639        test_fill( t1p, 0, dim, (T1){0xdeadbeef});
     640        test_fill( t1p, size, size * 4, FillC );
     641        test_use( t1p, size * 4 / elemSize );
     642        free( t1p );
    645643
    646644        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    647645        t1p = alloc( 0, t1op`realloc, FillC`fill );
    648         test_base(t1p, 0, libAlign);
    649         free(t1p);
    650 
    651         t1p = alloc( dim, ((T1*)0p)`realloc, FillC`fill );
    652         test_base(t1p, size, tAlign);
    653         test_fill(t1p, 0, size, FillC);
    654         test_use(t1p, size / elemSize);
    655         free(t1p);
    656 
    657         t1p = alloc( 0, ((T1*)0p)`realloc, FillC`fill );
    658         test_base(t1p, 0, libAlign);
    659         free(t1p);
     646        test_base( t1p, 0, libAlign );
     647        free( t1p );
     648
     649        t1p = alloc( dim, 0p`realloc, FillC`fill );
     650        test_base( t1p, size, tAlign );
     651        test_fill( t1p, 0, size, FillC );
     652        test_use( t1p, size / elemSize );
     653        free( t1p );
     654
     655        t1p = alloc( 0, 0p`realloc, FillC`fill );
     656        test_base( t1p, 0, libAlign );
     657        free( t1p );
    660658
    661659        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    662660        t1p = alloc( dim, t1op`realloc, FillT1`fill );
    663         test_base(t1p, size, tAlign);
    664         test_fill(t1p, 0, dim, (T1){0xdeadbeef});
    665         test_use(t1p, size / elemSize);
    666         free(t1p);
     661        test_base( t1p, size, tAlign );
     662        test_fill( t1p, 0, dim, (T1){0xdeadbeef});
     663        test_use( t1p, size / elemSize );
     664        free( t1p );
    667665
    668666        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    669667        t1p = alloc( dim / 4, t1op`realloc, FillT1`fill );
    670         test_base(t1p, size / 4, tAlign);
    671         test_fill(t1p, 0, dim / 4, (T1){0xdeadbeef});
    672         test_use(t1p, size / 4 / elemSize);
    673         free(t1p);
     668        test_base( t1p, size / 4, tAlign );
     669        test_fill( t1p, 0, dim / 4, (T1){0xdeadbeef});
     670        test_use( t1p, size / 4 / elemSize );
     671        free( t1p );
    674672
    675673        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    676674        t1p = alloc( dim * 4, t1op`realloc, FillT1`fill );
    677         test_base(t1p, size * 4, tAlign);
    678         test_fill(t1p, 0, dim, (T1){0xdeadbeef});
    679         test_fill(t1p, dim, dim * 4, FillT1);
    680         test_use(t1p, size * 4 / elemSize);
    681         free(t1p);
     675        test_base( t1p, size * 4, tAlign );
     676        test_fill( t1p, 0, dim, (T1){0xdeadbeef});
     677        test_fill( t1p, dim, dim * 4, FillT1);
     678        test_use( t1p, size * 4 / elemSize );
     679        free( t1p );
    682680
    683681        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    684682        t1p = alloc( 0, t1op`realloc, FillT1`fill );
    685         test_base(t1p, 0, libAlign);
    686         free(t1p);
    687 
    688         t1p = alloc( dim, ((T1*)0p)`realloc, FillT1`fill );
    689         test_base(t1p, size, tAlign);
    690         test_fill(t1p, 0, dim, FillT1);
    691         test_use(t1p, size / elemSize);
    692         free(t1p);
    693 
    694         t1p = alloc( 0, ((T1*)0p)`realloc, FillT1`fill );
    695         test_base(t1p, 0, libAlign);
    696         free(t1p);
     683        test_base( t1p, 0, libAlign );
     684        free( t1p );
     685
     686        t1p = alloc( dim, 0p`realloc, FillT1`fill );
     687        test_base( t1p, size, tAlign );
     688        test_fill( t1p, 0, dim, FillT1);
     689        test_use( t1p, size / elemSize );
     690        free( t1p );
     691
     692        t1p = alloc( 0, 0p`realloc, FillT1`fill );
     693        test_base( t1p, 0, libAlign );
     694        free( t1p );
    697695
    698696        t1p = alloc( align`align );
    699         test_base(t1p, elemSize, align);
    700         test_use(t1p, elemSize / elemSize);
    701         free(t1p);
     697        test_base( t1p, elemSize, align );
     698        test_use( t1p, elemSize / elemSize );
     699        free( t1p );
    702700
    703701        t1p = alloc( dim, align`align );
    704         test_base(t1p, size, align);
    705         test_use(t1p, size / elemSize);
    706         free(t1p);
     702        test_base( t1p, size, align );
     703        test_use( t1p, size / elemSize );
     704        free( t1p );
    707705
    708706        t1p = alloc( 0, align`align );
    709         test_base(t1p, 0, libAlign);
    710         free(t1p);
     707        test_base( t1p, 0, libAlign );
     708        free( t1p );
    711709
    712710        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    713711        t1p = alloc( t1op`realloc, align`align );
    714         test_base(t1p, elemSize, align);
    715         test_fill(t1p, 0, 1, (T1){0xdeadbeef});
    716         test_use(t1p, elemSize / elemSize);
    717         free(t1p);
    718 
    719         t1p = alloc( ((T1*)0p)`realloc, align`align );
    720         test_base(t1p, elemSize, align);
    721         test_use(t1p, elemSize / elemSize);
    722         free(t1p);
     712        test_base( t1p, elemSize, align );
     713        test_fill( t1p, 0, 1, (T1){0xdeadbeef});
     714        test_use( t1p, elemSize / elemSize );
     715        free( t1p );
     716
     717        t1p = alloc( 0p`realloc, align`align );
     718        test_base( t1p, elemSize, align );
     719        test_use( t1p, elemSize / elemSize );
     720        free( t1p );
    723721
    724722        dp = alloc( dim );
    725723        t1p = alloc( dp`resize, align`align );
    726         test_base(t1p, elemSize, align);
    727         test_use(t1p, elemSize / elemSize);
    728         free(t1p);
    729 
    730         t1p = alloc( ((double*)0p)`resize, align`align );
    731         test_base(t1p, elemSize, align);
    732         test_use(t1p, elemSize / elemSize);
    733         free(t1p);
     724        test_base( t1p, elemSize, align );
     725        test_use( t1p, elemSize / elemSize );
     726        free( t1p );
     727
     728        t1p = alloc( 0p`resize, align`align );
     729        test_base( t1p, elemSize, align );
     730        test_use( t1p, elemSize / elemSize );
     731        free( t1p );
    734732
    735733        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    736734        t1p = alloc( dim, t1op`realloc, align`align );
    737         test_base(t1p, size, align);
    738         test_fill(t1p, 0, dim, (T1){0xdeadbeef});
    739         test_use(t1p, size / elemSize);
    740         free(t1p);
     735        test_base( t1p, size, align );
     736        test_fill( t1p, 0, dim, (T1){0xdeadbeef});
     737        test_use( t1p, size / elemSize );
     738        free( t1p );
    741739
    742740        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    743741        t1p = alloc( 0, t1op`realloc, align`align );
    744         test_base(t1p, 0, libAlign);
    745         free(t1p);
    746 
    747         t1p = alloc( dim, ((T1*)0p)`realloc, align`align );
    748         test_base(t1p, size, align);
    749         test_use(t1p, size / elemSize);
    750         free(t1p);
    751 
    752         t1p = alloc( 0, ((T1*)0p)`realloc, align`align );
    753         test_base(t1p, 0, libAlign);
    754         free(t1p);
     742        test_base( t1p, 0, libAlign );
     743        free( t1p );
     744
     745        t1p = alloc( dim, 0p`realloc, align`align );
     746        test_base( t1p, size, align );
     747        test_use( t1p, size / elemSize );
     748        free( t1p );
     749
     750        t1p = alloc( 0, 0p`realloc, align`align );
     751        test_base( t1p, 0, libAlign );
     752        free( t1p );
    755753
    756754        t1p = alloc( align`align, FillC`fill );
    757         test_base(t1p, elemSize, align);
    758         test_fill(t1p, 0, elemSize, FillC);
    759         test_use(t1p, elemSize / elemSize);
    760         free(t1p);
     755        test_base( t1p, elemSize, align );
     756        test_fill( t1p, 0, elemSize, FillC );
     757        test_use( t1p, elemSize / elemSize );
     758        free( t1p );
    761759
    762760        t1p = alloc( align`align, FillT1`fill );
    763         test_base(t1p, elemSize, align);
    764         test_fill(t1p, 0, 1, FillT1);
    765         test_use(t1p, elemSize / elemSize);
    766         free(t1p);
     761        test_base( t1p, elemSize, align );
     762        test_fill( t1p, 0, 1, FillT1);
     763        test_use( t1p, elemSize / elemSize );
     764        free( t1p );
    767765
    768766        t1p = alloc( dim, align`align, FillC`fill );
    769         test_base(t1p, size, align);
    770         test_fill(t1p, 0, size, FillC);
    771         test_use(t1p, size / elemSize);
    772         free(t1p);
     767        test_base( t1p, size, align );
     768        test_fill( t1p, 0, size, FillC );
     769        test_use( t1p, size / elemSize );
     770        free( t1p );
    773771
    774772        t1p = alloc( 0, align`align, FillC`fill );
    775         test_base(t1p, 0, libAlign);
    776         free(t1p);
     773        test_base( t1p, 0, libAlign );
     774        free( t1p );
    777775
    778776        t1p = alloc( dim, align`align, FillT1`fill );
    779         test_base(t1p, size, align);
    780         test_fill(t1p, 0, dim, FillT1);
    781         test_use(t1p, size / elemSize);
    782         free(t1p);
     777        test_base( t1p, size, align );
     778        test_fill( t1p, 0, dim, FillT1);
     779        test_use( t1p, size / elemSize );
     780        free( t1p );
    783781
    784782        t1p = alloc( 0, align`align, FillT1`fill );
    785         test_base(t1p, 0, libAlign);
    786         free(t1p);
     783        test_base( t1p, 0, libAlign );
     784        free( t1p );
    787785
    788786        t1p = alloc( dim, align`align, [FillT1A, dim / 4]`fill );
    789         test_base(t1p, size, align);
    790         test_fill(t1p, 0, size/4, FillT1A);
    791         test_use(t1p, size / elemSize);
    792         free(t1p);
     787        test_base( t1p, size, align );
     788        test_fill( t1p, 0, size/4, FillT1A );
     789        test_use( t1p, size / elemSize );
     790        free( t1p );
    793791
    794792        t1p = alloc( 0, align`align, [FillT1A, dim / 4]`fill );
    795         test_base(t1p, 0, libAlign);
    796         free(t1p);
     793        test_base( t1p, 0, libAlign );
     794        free( t1p );
    797795
    798796        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    799797        t1p = alloc( dim, t1op`realloc, align`align, FillC`fill );
    800         test_base(t1p, size, align);
    801         test_fill(t1p, 0, dim, (T1){0xdeadbeef});
    802         test_use(t1p, size / elemSize);
    803         free(t1p);
     798        test_base( t1p, size, align );
     799        test_fill( t1p, 0, dim, (T1){0xdeadbeef});
     800        test_use( t1p, size / elemSize );
     801        free( t1p );
    804802
    805803        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    806804        t1p = alloc( dim / 4, t1op`realloc, align`align, FillC`fill );
    807         test_base(t1p, size / 4, align);
    808         test_fill(t1p, 0, dim / 4, (T1){0xdeadbeef});
    809         test_use(t1p, size / 4 / elemSize);
    810         free(t1p);
     805        test_base( t1p, size / 4, align );
     806        test_fill( t1p, 0, dim / 4, (T1){0xdeadbeef});
     807        test_use( t1p, size / 4 / elemSize );
     808        free( t1p );
    811809
    812810        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    813811        t1p = alloc( dim * 4, t1op`realloc, align`align, FillC`fill );
    814         test_base(t1p, size * 4, align);
    815         test_fill(t1p, 0, dim, (T1){0xdeadbeef});
    816         test_fill(t1p, size, size * 4, FillC);
    817         test_use(t1p, size * 4 / elemSize);
    818         free(t1p);
     812        test_base( t1p, size * 4, align );
     813        test_fill( t1p, 0, dim, (T1){0xdeadbeef});
     814        test_fill( t1p, size, size * 4, FillC );
     815        test_use( t1p, size * 4 / elemSize );
     816        free( t1p );
    819817
    820818        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    821819        t1p = alloc( 0, t1op`realloc, align`align, FillC`fill );
    822         test_base(t1p, 0, libAlign);
    823         free(t1p);
    824 
    825         t1p = alloc( dim, ((T1*)0p)`realloc, align`align, FillC`fill );
    826         test_base(t1p, size, align);
    827         test_fill(t1p, 0, size, FillC);
    828         test_use(t1p, size / elemSize);
    829         free(t1p);
    830 
    831         t1p = alloc( 0, ((T1*)0p)`realloc, align`align, FillC`fill );
    832         test_base(t1p, 0, libAlign);
    833         free(t1p);
    834 
    835         t1op = alloc( dim, ((T1){0xdeadbeef})`fill);
     820        test_base( t1p, 0, libAlign );
     821        free( t1p );
     822
     823        t1p = alloc( dim, 0p`realloc, align`align, FillC`fill );
     824        test_base( t1p, size, align );
     825        test_fill( t1p, 0, size, FillC );
     826        test_use( t1p, size / elemSize );
     827        free( t1p );
     828
     829        t1p = alloc( 0, 0p`realloc, align`align, FillC`fill );
     830        test_base( t1p, 0, libAlign );
     831        free( t1p );
     832
     833        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    836834        t1p = alloc( dim, t1op`realloc, align`align, FillT1`fill );
    837         test_base(t1p, size, align);
    838         test_fill(t1p, 0, dim, (T1){0xdeadbeef});
    839         test_use(t1p, size / elemSize);
    840         free(t1p);
     835        test_base( t1p, size, align );
     836        test_fill( t1p, 0, dim, (T1){0xdeadbeef});
     837        test_use( t1p, size / elemSize );
     838        free( t1p );
    841839
    842840        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    843841        t1p = alloc( dim / 4, t1op`realloc, align`align, FillT1`fill );
    844         test_base(t1p, size / 4, align);
    845         test_fill(t1p, 0, dim / 4, (T1){0xdeadbeef});
    846         test_use(t1p, size / 4 / elemSize);
    847         free(t1p);
     842        test_base( t1p, size / 4, align );
     843        test_fill( t1p, 0, dim / 4, (T1){0xdeadbeef});
     844        test_use( t1p, size / 4 / elemSize );
     845        free( t1p );
    848846
    849847        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    850848        t1p = alloc( dim * 4, t1op`realloc, align`align, FillT1`fill );
    851         test_base(t1p, size * 4, align);
    852         test_fill(t1p, 0, dim, (T1){0xdeadbeef});
    853         test_fill(t1p, dim, dim * 4, FillT1);
    854         test_use(t1p, size * 4 / elemSize);
    855         free(t1p);
     849        test_base( t1p, size * 4, align );
     850        test_fill( t1p, 0, dim, (T1){0xdeadbeef});
     851        test_fill( t1p, dim, dim * 4, FillT1);
     852        test_use( t1p, size * 4 / elemSize );
     853        free( t1p );
    856854
    857855        t1op = alloc( dim, ((T1){0xdeadbeef})`fill );
    858856        t1p = alloc( 0, t1op`realloc, align`align, FillT1`fill );
    859         test_base(t1p, 0, libAlign);
    860         free(t1p);
    861 
    862         t1p = alloc( dim, ((T1*)0p)`realloc, align`align, FillT1`fill );
    863         test_base(t1p, size, align);
    864         test_fill(t1p, 0, dim, FillT1);
    865         test_use(t1p, size / elemSize);
    866         free(t1p);
    867 
    868         t1p = alloc( 0, ((T1*)0p)`realloc, align`align, FillT1`fill );
    869         test_base(t1p, 0, libAlign);
    870         free(t1p);
    871 
    872         if (tests_failed == 0) printf("PASSED alloc tests (aligned struct)\n\n");
    873         else printf("failed alloc tests (aligned struct) : %d/%d\n\n", tests_failed, tests_total);
    874 
    875         printf("(if applicable) alignment error below indicates memory trashing caused by test_use.\n\n");
    876         free(FillA);
    877         free(FillT1A);
    878         return 0;
     857        test_base( t1p, 0, libAlign );
     858        free( t1p );
     859
     860        t1p = alloc( dim, 0p`realloc, align`align, FillT1`fill );
     861        test_base( t1p, size, align );
     862        test_fill( t1p, 0, dim, FillT1);
     863        test_use( t1p, size / elemSize );
     864        free( t1p );
     865
     866        t1p = alloc( 0, 0p`realloc, align`align, FillT1`fill );
     867        test_base( t1p, 0, libAlign );
     868        free( t1p );
     869
      870        if ( tests_failed == 0 ) printf( "PASSED alloc tests (aligned struct)\n\n" );
      871        else printf( "failed alloc tests (aligned struct) : %d/%d\n\n", tests_failed, tests_total );
     872
      873        printf( "(if applicable) alignment error below indicates memory trashing caused by test_use.\n\n" );
     874        free( FillA );
     875        free( FillT1A );
    879876} // main
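
The hunks above only re-space the allocation test, but together they catalogue CFA's single alloc routine, whose behaviour is selected by postfix-named arguments (`fill, `align, `realloc, `resize) rather than by separate malloc/calloc/memalign/realloc entry points. A minimal sketch of that interface, with hypothetical values standing in for the test's dim, align, and FillC variables (illustrative only, not part of this changeset):

        #include <stdlib.hfa>                           // CFA dynamic allocation: alloc, free

        int main() {
                int dim = 10;                           // hypothetical dimension
                size_t align = 64;                      // hypothetical alignment
                char fc = 0xde;                         // hypothetical fill byte
                int * ip = alloc();                     // one int at default alignment
                free( ip );
                ip = alloc( dim, fc`fill );             // dim ints, storage byte-filled with fc
                ip = alloc( dim * 2, ip`realloc, align`align ); // grow, now 64-byte aligned
                free( ip );
                ip = alloc( dim, 0p`realloc );          // realloc of the null pointer acts like alloc( dim )
                free( ip );
        }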
  • tests/enum.cfa

    r9e23b446 rffec1bf  
    2424}
    2525
     26// test constant-expressions
     27
     28struct S {
     29    int i;
     30};
     31enum K { P = 3 + 4 };
     32enum Y { W = 9 + (3 && 4 || 7)};
     33int p[W];
     34enum { X = W + -3 + ~1 / 2 * (int)4 + sizeof(struct S) + _Alignof(struct S) || 3 && 5 + (3 ? 1 : 2 ) + __builtin_offsetof(struct S, i ) };
     35int x[X];
     36enum { B = 3 + 4 - 7 * 20 / 34 << 3 >> 4 > 8 < 9 <= 23 >= 42 == 12 != 13  & 4 ^ 2 | 8 + sizeof(struct S) + _Alignof(struct S) };
     37int y[B];
     38enum { J = +3 + -4 / ~20 * ! 0 };
     39int z[J] = { 1, 2, 3 };
     40int aa[41] @= { [3] : 3, [1] : 6 };
     41
    2642//Dummy main
    2743int main(int argc, char const *argv[]) {
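
The constant-expression block added above pins down that enumerator initializers are full integer constant expressions, evaluated at compile time, and therefore usable to dimension file-scope arrays. A stripped-down illustration of the property under test (hypothetical names, plain C subset of CFA):

        struct S { int i; };
        enum K { P = 3 + 4 };                   // initializer is a constant expression
        enum { X = P + sizeof(struct S) };      // enumerators compose into new constants
        int arr[X];                             // legal: X is known at compile time

        int main() { return 0; }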
  • tests/enum_tests/structEnum.cfa

    r9e23b446 rffec1bf  
    2424int main() {
    2525    printf("%d %c\n", apple.x, apple.y);
    26     // Failed; enumInstType is now not a real type and not instantiated. 
     26    // Failed; enumInstType is now not a real type and not instantiated.
    2727    // Not sure if we want that
    2828    // printf("%d %c\n", second.x, second.y);
    2929    return 0;
    3030}
    31 
    32 
    33 
  • tests/pybin/tools.py

    r9e23b446 rffec1bf  
    4646
    4747                        print(cmd)
    48                         return 0, None
     48                        return 0, None, None
    4949
    5050                with contextlib.ExitStack() as onexit:
     
    291291################################################################################
    292292def jobserver_version():
    293         make_ret, out, err = sh('make', '.test_makeflags', '-j2', output_file=subprocess.PIPE, error=subprocess.PIPE)
     293        make_ret, out, err = sh('make', '.test_makeflags', '-j2', ignore_dry_run = True, output_file=subprocess.PIPE, error=subprocess.PIPE)
    294294        if make_ret != 0:
    295295                print("ERROR: cannot find Makefile jobserver version", file=sys.stderr)
  • tests/unified_locking/.expect/pthread_locks.txt

    r9e23b446 rffec1bf  
    55Start Test 3: lock and condition variable multiple acquire and wait/notify
    66Done Test 3
     7Start Test 4: lock and condition variable single timed wait/notify
     8Done Test 4
  • tests/unified_locking/mutex_test.hfa

    r9e23b446 rffec1bf  
    2222}
    2323
    24 uint32_t cs() {
     24uint32_t cs(uint32_t & entries) {
    2525        thread$ * me = active_thread();
    2626        uint32_t value;
    2727        lock(mo.l);
    2828        {
     29                entries++;
    2930                uint32_t tsum = mo.sum;
    3031                uint32_t cnt = mo.cnt;
     
    4243thread LockCheck {
    4344        uint32_t sum;
     45        uint32_t entries;
    4446};
    4547
    4648void main(LockCheck & this) {
    4749        this.sum = 0;
     50        this.entries = 0;
    4851        for(num_times) {
    4952                trash();
    50                 this.sum += cs();
     53                this.sum += cs( this.entries );
    5154                trash();
    5255                yield(random(10));
     
    5861        mo.sum = -32;
    5962        mo.cnt = 0;
     63        uint32_t real_entries = 0;
    6064        processor p[2];
    6165        sout | "Starting";
     
    6367                LockCheck checkers[13];
    6468                for(i;13) {
    65                         sum += join(checkers[i]).sum;
     69                        LockCheck & curr = join(checkers[i]);
     70                        sum += curr.sum;
     71                        real_entries += curr.entries;
    6672                }
    6773        }
    6874        sout | "Done!";
    69         if(mo.cnt != (13 * num_times)) sout | "Invalid cs count!" | mo.cnt | "vs "| (13 * num_times) | "(13 *" | num_times | ')';
      75        if(real_entries != (13 * num_times)) sout | "Invalid real cs count!" | real_entries | "vs" | (13 * num_times) | "(13 *" | num_times | ')';
      76        if(mo.cnt != (13 * num_times)) sout | "Invalid concurrent cs count!" | mo.cnt | "vs" | (13 * num_times) | "(13 *" | num_times | ')';
    7077        if(sum == mo.sum) sout | "Match!";
    7178        else sout | "No Match!" | sum | "vs" | mo.sum;
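
The new entries counter gives this test a trusted baseline: each thread tallies its own critical-section entries privately, and after the joins the sum of those tallies must equal the shared mo.cnt if the lock really serialized every update. A reduced sketch of the pattern, assuming simple_owner_lock from <locks.hfa> in place of the test's mutex object:

        #include <locks.hfa>                    // assumed home of simple_owner_lock
        #include <thread.hfa>

        simple_owner_lock l;
        volatile unsigned shared_cnt = 0;       // updated only while holding l

        void cs( unsigned & entries ) {
                lock( l );
                entries += 1;                   // private tally: trustworthy even if the lock is broken
                shared_cnt += 1;                // shared tally: correct only under mutual exclusion
                unlock( l );
        }

        int main() {
                unsigned mine = 0;
                cs( mine );                     // with many threads, the sum of all private tallies
                return shared_cnt == mine ? 0 : 1; // should equal shared_cnt iff the lock works
        }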
  • tests/unified_locking/pthread_locks.cfa

    r9e23b446 rffec1bf  
    33#include <stdlib.hfa>
    44#include <thread.hfa>
     5#include <time.h>
     6#include <stdlib.hfa>
    57
    6 const unsigned int num_times = 50000;
     8const unsigned int num_times = 50;
    79
    810simple_owner_lock l;
    911pthread_cond_var( simple_owner_lock ) c;
     12
     13owner_lock l2;
     14condition_variable( owner_lock ) c2;
    1015
    1116volatile int counter = 0;
     
    5964}
    6065
     66thread Wait_Time_Signal_1 {};
     67
     68void main( Wait_Time_Signal_1 & this ) {
     69        for (unsigned int i = 0; i < num_times; i++) {
     70                lock(l);
     71                if(empty(c) || random(10) >= 9 ) {
     72                        timespec t;
     73                        clock_gettime(CLOCK_REALTIME, &t);
     74                        timespec waitTime{0,1};
     75                        bool woken = wait(c,l, t + waitTime);
     76                }else{
     77                        notify_one(c);
     78                }
     79                unlock(l);
     80        }
     81}
     82
    6183int main() {
    62         processor p[3];
     84        processor p[1];
    6385        printf("Start Test 1: lock and condition variable single wait/notify\n");
    6486        {
     
    78100        }
    79101        printf("Done Test 3\n");
     102
     103        printf("Start Test 4: lock and condition variable single timed wait/notify\n");
     104        {
     105                Wait_Time_Signal_1 t1[2];
     106        }
     107        printf("Done Test 4\n");
    80108}
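
Test 4 above exercises the timed form of wait, which takes an absolute deadline and returns whether the waiter was notified before the deadline expired. The core pattern, lifted from the hunk into a self-contained sketch (assuming the lock and condition types come from <locks.hfa>; the 1 ns offset makes the deadline effectively immediate, so both the timeout path and the notified path get exercised):

        #include <locks.hfa>                    // assumed home of simple_owner_lock, pthread_cond_var
        #include <thread.hfa>
        #include <time.h>

        simple_owner_lock l;
        pthread_cond_var( simple_owner_lock ) c;

        int main() {
                lock( l );
                timespec t;
                clock_gettime( CLOCK_REALTIME, &t );        // deadline base: now
                timespec waitTime{ 0, 1 };                  // offset: 0 s, 1 ns
                bool woken = wait( c, l, t + waitTime );    // false => the deadline passed first
                unlock( l );
                return 0;
        }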
  • tools/gdb/utils-gdb.py

    r9e23b446 rffec1bf  
    8989        return argv
    9090
    91 def get_cluster_root():
    92         """
    93         Return: gdb.Value of globalClusters.root (is an address)
    94         """
     91class ClusterIter:
     92        def __init__(self, root):
     93                self.curr = None
     94                self.root = root
     95
     96        def __iter__(self):
     97                return self
     98
     99        def __next__(self):
     100                # Clusters form a cycle
     101                # If we haven't seen the root yet, then the root is the first
     102                if not self.curr:
     103                        self.curr = self.root
     104                        return self.curr
     105
     106                # if we already saw the root, then go forward
     107                self.curr = self.curr['_X4nodeS26__cluster____dbg_node_cltr_1']['_X4nextPS7cluster_1']
     108
     109                # if we reached the root again, then we are done
     110                if self.curr == self.root:
     111                        raise StopIteration
     112
     113                # otherwise return the next
     114                return self.curr
     115
     116def all_clusters():
     117        """
     118        Return: a list of all the clusters as an iterator.
     119        obtained from gdb.Value of globalClusters.root (is an address)
     120        """
     121        if not is_cforall():
     122                return []
     123
    95124        cluster_root = gdb.parse_and_eval('_X11mainClusterPS7cluster_1')
    96125        if cluster_root.address == 0x0:
    97126                print('No clusters, program terminated')
    98         return cluster_root
    99 
    100 def get_sched_lock():
    101         """
    102         Return: gdb.Value of __scheduler_lock
    103         """
    104         lock = gdb.parse_and_eval('_X16__scheduler_lockPS20__scheduler_RWLock_t_1')
    105         if lock.address == 0x0:
    106                 print('No scheduler lock, program terminated')
    107         return lock
    108 
    109 def all_clusters():
    110         if not is_cforall():
    111                 return None
    112 
    113         cluster_root = get_cluster_root()
    114         if cluster_root.address == 0x0:
    115                 return
    116 
    117         curr = cluster_root
    118         ret = [curr]
    119 
    120         while True:
    121                 curr = curr['_X4nodeS26__cluster____dbg_node_cltr_1']['_X4nextPS7cluster_1']
    122                 if curr == cluster_root:
    123                         break
    124 
    125                 ret.append(curr)
    126 
    127         return ret
     127                return []
     128
     129        return ClusterIter(cluster_root)
     130
     131class ProcIter:
     132        def __init__(self, root):
     133                self.curr = None
     134                self.root = root
     135
     136        def __iter__(self):
     137                return self
     138
     139        def check(self):
     140                # check if this is the last value
     141                addr = int(self.curr)
     142                mask = 1 << ((8 * int(gdb.parse_and_eval('sizeof(void*)'))) - 1)
     143                if 0 != (mask & addr):
     144                        raise StopIteration
     145
     146        def __next__(self):
     147                cfa_t = get_cfa_types()
     148
      149                # Processors form an intrusive list with a marked terminator (see check below)
     150                # If we haven't seen the root yet, then the root is the first
     151                if not self.curr:
     152                        my_next = self.root
     153                        self.curr = my_next.cast(cfa_t.processor_ptr)
     154
      155                        # check if this is an empty list
     156                        self.check()
     157
     158                        return self.curr
     159
     160                # if we already saw the root, then go forward
     161                my_next = self.curr['__anonymous_object2225']['_X4nextPY13__tE_generic__1']
     162                self.curr = my_next.cast(cfa_t.processor_ptr)
     163
      164                # check if we reached the end
     165                self.check()
     166
     167                # otherwise return the next
     168                return self.curr
     169
     170def proc_list(cluster):
     171        """
     172        Return: for a given processor, return the active and idle processors, as 2 iterators
     173        """
     174        cfa_t = get_cfa_types()
     175        proclist = cluster['_X5procsS19__cluster_proc_list_1']
     176        idle = proclist['_X5idlesS5dlist_S9processorS5dlink_S9processor___1']['__anonymous_object2167']['_X4nextPY13__tE_generic__1']
     177        active = proclist['_X7activesS5dlist_S9processorS5dlink_S9processor___1']['__anonymous_object2167']['_X4nextPY13__tE_generic__1']
     178        return ProcIter(active.cast(cfa_t.processor_ptr)), ProcIter(idle.cast(cfa_t.processor_ptr))
    128179
    129180def all_processors():
    130         if not is_cforall():
    131                 return None
    132 
    133         cfa_t = get_cfa_types()
    134 
    135         # get processors from registration to the RWlock
    136         lock = get_sched_lock()
    137 
    138         #get number of elements
    139         count = lock['_X5readyVj_1']
    140 
    141         #find all the procs
    142         raw_procs = [lock['_X4dataPS21__scheduler_lock_id_t_1'][i]['_X6handleVPS16__processor_id_t_1'] for i in range(count)]
    143 
    144         # pre cast full procs
    145         procs = [p.cast(cfa_t.processor_ptr) for p in raw_procs if p['_X9full_procb_1']]
    146 
    147         # sort procs by clusters
    148         return sorted(procs, key=lambda p: p['_X4cltrPS7cluster_1'])
     181        procs = []
     182        for c in all_clusters():
     183                active, idle = proc_list(c)
     184                for p in active:
     185                        procs.append(p)
     186
     187                for p in idle:
     188                        procs.append(p)
     189
     190        print(procs)
     191        return procs
    149192
    150193def tls_for_pthread(pthrd):
     
    160203
    161204def tls_for_proc(proc):
    162         return tls_for_pthread(proc['_X13kernel_threadm_1'])
     205        return proc['_X10local_dataPS16KernelThreadData_1']
    163206
    164207def thread_for_pthread(pthrd):
     
    180223def lookup_cluster(name = None):
    181224        """
    182         Look up a cluster given its ID
     225        Look up one or more cluster given a name
    183226        @name: str
    184227        Return: gdb.Value
     
    187230                return None
    188231
    189         root = get_cluster_root()
    190         if root.address == 0x0:
     232        clusters = all_clusters()
     233        if not clusters:
    191234                return None
    192235
    193236        if not name:
    194                 return root
     237                return clusters
    195238
    196239        # look up the cluster(s) associated with the given name
    197         cluster = None
    198         curr = root
    199         while True:
    200                 if curr['_X4namePKc_1'].string() == name:
    201                         cluster = curr.address
    202                         break
    203                 curr = curr['_X4nodeS26__cluster____dbg_node_cltr_1']['_X4nextPS7cluster_1']
    204                 if curr == root or curr == 0x0:
    205                         break
    206 
    207         if not cluster:
     240        found = [c for c in clusters if c['_X4namePKc_1'].string() == name]
     241
     242        if not found:
    208243                print("Cannot find a cluster with the name: {}.".format(name))
    209244                return None
    210245
    211         return cluster
     246        return found
     247
    212248
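
The rewritten lookup_cluster replaces the manual pointer-chasing loop with a filter over all_clusters(), which also changes the return type from a single gdb.Value to a list. Callers therefore test for an empty result rather than comparing against None alone; a sketch of the adjusted calling convention, with a hypothetical cluster name:

    found = lookup_cluster('main')            # None, or a list of gdb.Value
    if not found:
        print('No matching cluster')
    else:
        for c in found:
            print(c['_X4namePKc_1'].string())
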
    213249def lookup_threads_by_cluster(cluster):
     
    294330                super(Processors, self).__init__('info processors', gdb.COMMAND_USER)
    295331
    296         def print_processor(self, processor):
     332        def print_processor(self, processor, in_stats):
    297333                should_stop = processor['_X12do_terminateVb_1']
    298334                if not should_stop:
    299                         midle = processor['_X6$linksS7$dlinks_S9processor__1']['_X4nextS9$mgd_link_Y13__tE_generic___1']['_X4elemPY13__tE_generic__1'] != 0x0
    300                         end   = processor['_X6$linksS7$dlinks_S9processor__1']['_X4nextS9$mgd_link_Y13__tE_generic___1']['_X10terminatorPv_1'] != 0x0
    301 
    302                         status = 'Idle' if midle or end else 'Active'
     335                        status = in_stats
    303336                else:
    304337                        stop_count  = processor['_X10terminatedS9semaphore_1']['_X5counti_1']
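
print_processor previously re-derived 'Idle' versus 'Active' from the intrusive link fields of the processor; after this change the caller, which already knows which list it is draining, passes the status string down. A reduced sketch of that shape, with stand-in values instead of gdb.Values:

    def print_processor(proc, status):
        # status ('Active' or 'Idle') is decided by the caller; only
        # terminating processors still compute their own state
        print('{:<8} {}'.format(status, proc))

    active_list, idle_list = ['p0', 'p1'], ['p2']    # stand-ins for gdb.Values
    for p in active_list:
        print_processor(p, 'Active')
    for p in idle_list:
        print_processor(p, 'Idle')
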
     
    336369                        return
    337370
    338                 procs = all_processors()
    339 
    340371                print('{:>20}  {:>11}  {:<7}  {}'.format('Processor', '', 'Pending', 'Object'))
    341372                print('{:>20}  {:>11}  {:<7}  {}'.format('Name', 'Status', 'Yield', 'Address'))
    342                 cl = None
    343                 for p in procs:
    344                         # if this is a different cluster print it
    345                         if cl != p['_X4cltrPS7cluster_1']:
    346                                 if cl:
    347                                         print()
    348                                 cl = p['_X4cltrPS7cluster_1']
    349                                 print('Cluster {}'.format(cl['_X4namePKc_1'].string()))
    350 
     373                for c in clusters:
     374                        print('Cluster {}'.format(c['_X4namePKc_1'].string()))
     375
     376                        active, idle = proc_list(c)
    351377                        # print the processor information
    352                         self.print_processor(p)
     378                        for p in active:
     379                                self.print_processor(p, 'Active')
     380
     381                        for p in idle:
     382                                self.print_processor(p, 'Idle')
     383
     384                        print()
    353385
    354386                print()
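
For context, the Processors class being modified here follows gdb's standard pattern for user-defined commands: subclass gdb.Command, register the command name in __init__, and implement invoke. A minimal self-contained skeleton, unrelated to the Cforall internals:

    import gdb

    class InfoExample(gdb.Command):
        """Skeleton user command; invoked as 'info example' inside gdb."""
        def __init__(self):
            super(InfoExample, self).__init__('info example', gdb.COMMAND_USER)

        def invoke(self, arg, from_tty):
            # arg is the raw argument string typed after the command name
            print('arg was: {!r}'.format(arg))

    InfoExample()                             # construction registers the command
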
     
    433465                        cluster = lookup_cluster(arg)
    434466                        if not cluster:
    435                                 print("Could not find cluster '{}'".format(arg))
     467                                print("No matching cluster")
    436468                                return
    437469