Changeset 50b29d9


Ignore:
Timestamp:
Aug 9, 2021, 4:21:44 PM (16 months ago)
Author:
Peter A. Buhr <pabuhr@…>
Branches:
enum, forall-pointer-decay, jacob/cs343-translation, master, new-ast-unique-expr, pthread-emulation, qualifiedEnum
Children:
d874f59
Parents:
4f89e7b (diff), 5438e41 (diff)
Note: this is a merge changeset, the changes displayed below correspond to the merge itself.
Use the (diff) links above to see all the changes relative to each parent.
Message:

Merge branch 'master' of plg.uwaterloo.ca:software/cfa/cfa-cc

Files:
11 edited

Legend:

Unmodified
Added
Removed
  • benchmark/readyQ/cycle.cfa

    r4f89e7b r50b29d9  
    8585                }
    8686
    87                 printf("Duration (ms)        : %'ld\n", (end - start)`dms);
     87                printf("Duration (ms)        : %'lf\n", (end - start)`dms);
    8888                printf("Number of processors : %'d\n", nprocs);
    8989                printf("Number of threads    : %'d\n", tthreads);
  • benchmark/readyQ/locality.cfa

    r4f89e7b r50b29d9  
    126126// ==================================================
    127127// Do some work by accessing 'cnt' cells in the array
    128 __attribute__((noinline)) void work(MyData & data, size_t cnt, uint64_t & state) {
    129         for (cnt) {
     128__attribute__((noinline)) void work(MyData & data, size_t cnt_, uint64_t & state) {
     129        for (cnt_) {
    130130                access(data, __xorshift64(state));
    131131        }
     
    200200
    201201        if( nspots == 0 ) { nspots = nthreads - nprocs; }
     202        if( nspots == 0 ) {
     203                fprintf(stderr, "--nspots must be set or --nthreads set to something bigger than --nprocs\n");
     204                exit(EXIT_FAILURE);
     205        }
    202206
    203207        Time start, end;
  • benchmark/readyQ/yield.cpp

    r4f89e7b r50b29d9  
    2929#include "libfibre/fibre.h"
    3030
    31 FibreBarrier * barrier;
     31FredBarrier * barrier;
    3232struct __attribute__((aligned(128))) counter_t {
    3333        int value = 0;
     
    6666
    6767                const char * arg = optarg ? optarg : "";
    68                 size_t len = 0;
    6968                char * end;
    7069                switch(opt) {
     
    111110
    112111                FibreInit();
    113                 barrier = new FibreBarrier(nthreads + 1);
     112                barrier = new FredBarrier(nthreads + 1);
    114113                {
    115114                        Context::CurrCluster().addWorkers(nprocs);
  • doc/theses/andrew_beach_MMath/performance.tex

    r4f89e7b r50b29d9  
    160160% Match All     &   3719462155 &   43294042 &   3223004977 &  1286054154 &   623887874 \\
    161161% Match None    &   4971630929 &   55311709 &   9481225467 &  1310251289 &   623752624 \\
     162%
     163% run-algol-04-a
     164% --------------
     165% Raise Empty   & 0.0 & 0.0 &  3250260945 & 0.0 & 0.0 \\
     166% Raise D'tor   & 0.0 & 0.0 & 29017675113 & N/A & N/A \\
     167% Raise Finally & 0.0 & 0.0 &         N/A & 0.0 & 0.0 \\
     168% Raise Other   & 0.0 & 0.0 & 24411823773 & 0.0 & 0.0 \\
     169% Cross Handler & 0.0 & 0.0 &      769334 & 0.0 & 0.0 \\
     170% Cross Finally & 0.0 & N/A &         N/A & 0.0 & 0.0 \\
     171% Match All     & 0.0 & 0.0 &  3254283504 & 0.0 & 0.0 \\
     172% Match None    & 0.0 & 0.0 &  9476060146 & 0.0 & 0.0 \\
     173
    162174\begin{tabular}{|l|c c c c c|}
    163175\hline
     
    196208% Match All     &   3189512499 &   39124453 &   2667795989 &  1525889031 &   733785613 \\
    197209% Match None    &   4094675477 &   48749857 &   7850618572 &  1566713577 &   733478963 \\
     210%
     211% run-plg7a-04-a
     212% --------------
     213% 0.0 are unfilled.
     214% Raise Empty   & 0.0 & 0.0 &  2770781479 & 0.0 & 0.0 \\
     215% Raise D'tor   & 0.0 & 0.0 & 23530084907 & N/A & N/A \\
     216% Raise Finally & 0.0 & 0.0 &         N/A & 0.0 & 0.0 \\
     217% Raise Other   & 0.0 & 0.0 & 23816827982 & 0.0 & 0.0 \\
     218% Cross Handler & 0.0 & 0.0 &     1422188 & 0.0 & 0.0 \\
     219% Cross Finally & 0.0 & N/A &         N/A & 0.0 & 0.0 \\
     220% Match All     & 0.0 & 0.0 &  2671989778 & 0.0 & 0.0 \\
     221% Match None    & 0.0 & 0.0 &  7829059869 & 0.0 & 0.0 \\
    198222
    199223% PLG7A (in seconds)
     
    202226              & \CFA (Terminate) & \CFA (Resume) & \Cpp & Java & Python \\
    203227\hline
    204 % Raise Empty   & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 \\
    205 % Raise D'tor   & 0.0 & 0.0 & 0.0 & N/A & N/A \\
    206 % Raise Finally & 0.0 & 0.0 & N/A & 0.0 & 0.0 \\
    207 % Raise Other   & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 \\
    208 % Cross Handler & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 \\
    209 % Cross Finally & 0.0 & N/A & N/A & 0.0 & 0.0 \\
    210 % Match All     & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 \\
    211 % Match None    & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 \\
    212228Raise Empty   & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 \\
    213229Raise D'tor   & 0.0 & 0.0 & 0.0 & N/A & N/A \\
     
    231247
    232248This means that while \CFA does not actually keep up with Python in every
    233 case it is no worse than roughly half the speed of \Cpp. This is good
     249case it is usually no worse than roughly half the speed of \Cpp. This is good
    234250enough for the prototyping purposes of the project.
    235251
    236 One difference not shown is that optimizations in \CFA is very fragile.
    237 The \CFA compiler uses gcc as part of its complation process and the version
    238 of gcc could change the speed of some of the benchmarks by 10 times or more.
    239 Similar changes to g++ for the \Cpp benchmarks had no significant changes.
    240 Because of the connection between gcc and g++; this suggests it is not the
    241 optimizations that are changing but how the optimizer is detecting if the
    242 optimizations can be applied. So the optimizations are always applied in
    243 g++, but only newer versions of gcc can detect that they can be applied in
    244 the more complex \CFA code.
     252The test case where \CFA falls short is Raise Other, the case where the
     253stack is unwound including a bunch of non-matching handlers.
     254This slowdown seems to come from missing optimizations;
     255the results above came from gcc/g++ 10 (gcc as \CFA backend or g++ for \Cpp)
     256but the results change if they are run in gcc/g++ 9 instead.
     257Importantly, there is a huge slowdown in \Cpp's results that brings
     258\CFA's performance back into that roughly half-speed area. However many other
     259\CFA benchmarks increase their run-time by a similar amount falling far
     260behind their \Cpp counter-parts.
     261
     262This suggests that the performance issue in Raise Other is just an
     263optimization not being applied. Later versions of gcc may be able to
     264optimize this case further, at least down to the half of \Cpp mark.
     265A \CFA compiler that directly produced assembly could do even better as it
     266would not have to work across some of \CFA's current abstractions, like
     267the try terminate function.
    245268
    246269Resumption exception handling is also incredibly fast. Often an order of
  • libcfa/prelude/bootloader.cf

    r4f89e7b r50b29d9  
    33char ** cfa_args_argv;
    44char ** cfa_args_envp;
     5int cfa_main_returned = 0;
    56
    67int main(int argc, char* argv[], char* envp[]) {
     
    89        cfa_args_argv = argv;
    910        cfa_args_envp = envp;
    10         return invoke_main(argc, argv, envp);
     11        int ret = invoke_main(argc, argv, envp);
     12        cfa_main_returned = 1;
     13        return ret;
    1114}
  • libcfa/src/concurrency/invoke.h

    r4f89e7b r50b29d9  
    170170                bool corctx_flag;
    171171
     172                int last_cpu;
     173
    172174                //SKULLDUGGERY errno is not save in the thread data structure because returnToKernel appears to be the only function to require saving and restoring it
    173175
  • libcfa/src/concurrency/kernel.cfa

    r4f89e7b r50b29d9  
    394394        __builtin_prefetch( thrd_dst->context.SP );
    395395
     396        int curr = __kernel_getcpu();
     397        if(thrd_dst->last_cpu != curr) {
     398                int64_t l = thrd_dst->last_cpu;
     399                int64_t c = curr;
     400                int64_t v = (l << 32) | c;
     401                __push_stat( __tls_stats(), v, false, "Processor", this );
     402        }
     403
     404        thrd_dst->last_cpu = curr;
     405
    396406        __cfadbg_print_safe(runtime_core, "Kernel : core %p running thread %p (%s)\n", this, thrd_dst, thrd_dst->self_cor.name);
    397407
     
    470480                                #if !defined(__CFA_NO_STATISTICS__)
    471481                                        __tls_stats()->ready.threads.threads++;
    472                                         __push_stat( __tls_stats(), __tls_stats()->ready.threads.threads, false, "Processor", this );
    473482                                #endif
    474483                                // This is case 2, the racy case, someone tried to run this thread before it finished blocking
     
    488497        #if !defined(__CFA_NO_STATISTICS__)
    489498                __tls_stats()->ready.threads.threads--;
    490                 __push_stat( __tls_stats(), __tls_stats()->ready.threads.threads, false, "Processor", this );
    491499        #endif
    492500
     
    570578                                __tls_stats()->ready.threads.extunpark++;
    571579                        }
    572                         __push_stat( __tls_stats(), __tls_stats()->ready.threads.threads, false, "Processor", kernelTLS().this_processor );
    573580                }
    574581                else {
    575582                        __atomic_fetch_add(&cl->stats->ready.threads.threads, 1, __ATOMIC_RELAXED);
    576583                        __atomic_fetch_add(&cl->stats->ready.threads.extunpark, 1, __ATOMIC_RELAXED);
    577                         __push_stat( cl->stats, cl->stats->ready.threads.threads, true, "Cluster", cl );
    578584                }
    579585        #endif
  • libcfa/src/concurrency/kernel.hfa

    r4f89e7b r50b29d9  
    6767                unsigned target;
    6868                unsigned last;
     69                unsigned cnt;
    6970                unsigned long long int cutoff;
    7071        } rdq;
  • libcfa/src/concurrency/kernel/startup.cfa

    r4f89e7b r50b29d9  
    478478        state = Start;
    479479        self_cor{ info };
     480        last_cpu = __kernel_getcpu();
    480481        curr_cor = &self_cor;
    481482        curr_cluster = mainCluster;
  • libcfa/src/concurrency/ready_queue.cfa

    r4f89e7b r50b29d9  
    345345                        /* paranoid */ verify(lanes.count < 65536); // The following code assumes max 65536 cores.
    346346                        /* paranoid */ verify(map.count < 65536); // The following code assumes max 65536 cores.
    347                         uint64_t chaos = __tls_rand();
    348                         uint64_t high_chaos = (chaos >> 32);
    349                         uint64_t  mid_chaos = (chaos >> 16) & 0xffff;
    350                         uint64_t  low_chaos = chaos & 0xffff;
    351 
    352                         unsigned me = map.self;
    353                         unsigned cpu_chaos = map.start + (mid_chaos % map.count);
    354                         bool global = cpu_chaos == me;
    355 
    356                         if(global) {
    357                                 proc->rdq.target = high_chaos % lanes.count;
     347
     348                        if(0 == (__tls_rand() % 10_000)) {
     349                                proc->rdq.target = __tls_rand() % lanes.count;
    358350                        } else {
    359                                 proc->rdq.target = (cpu_chaos * READYQ_SHARD_FACTOR) + (low_chaos % READYQ_SHARD_FACTOR);
     351                                unsigned cpu_chaos = map.start + (__tls_rand() % map.count);
     352                                proc->rdq.target = (cpu_chaos * READYQ_SHARD_FACTOR) + (__tls_rand() % READYQ_SHARD_FACTOR);
    360353                                /* paranoid */ verify(proc->rdq.target >= (map.start * READYQ_SHARD_FACTOR));
    361354                                /* paranoid */ verify(proc->rdq.target <  ((map.start + map.count) * READYQ_SHARD_FACTOR));
  • libcfa/src/concurrency/thread.cfa

    r4f89e7b r50b29d9  
    3434        preempted = __NO_PREEMPTION;
    3535        corctx_flag = false;
     36        last_cpu = __kernel_getcpu();
    3637        curr_cor = &self_cor;
    3738        self_mon.owner = &this;
Note: See TracChangeset for help on using the changeset viewer.