Changes in / [50b29d9:4f89e7b]


Ignore:
Files:
11 edited

Legend:

Unmodified
Added
Removed
  • benchmark/readyQ/cycle.cfa

    r50b29d9 r4f89e7b  
    8585                }
    8686
    87                 printf("Duration (ms)        : %'lf\n", (end - start)`dms);
     87                printf("Duration (ms)        : %'ld\n", (end - start)`dms);
    8888                printf("Number of processors : %'d\n", nprocs);
    8989                printf("Number of threads    : %'d\n", tthreads);
  • benchmark/readyQ/locality.cfa

    r50b29d9 r4f89e7b  
    126126// ==================================================
    127127// Do some work by accessing 'cnt' cells in the array
    128 __attribute__((noinline)) void work(MyData & data, size_t cnt_, uint64_t & state) {
    129         for (cnt_) {
     128__attribute__((noinline)) void work(MyData & data, size_t cnt, uint64_t & state) {
     129        for (cnt) {
    130130                access(data, __xorshift64(state));
    131131        }
     
    200200
    201201        if( nspots == 0 ) { nspots = nthreads - nprocs; }
    202         if( nspots == 0 ) {
    203                 fprintf(stderr, "--nspots must be set or --nthreads set to something bigger than --nprocs\n");
    204                 exit(EXIT_FAILURE);
    205         }
    206202
    207203        Time start, end;
  • benchmark/readyQ/yield.cpp

    r50b29d9 r4f89e7b  
    2929#include "libfibre/fibre.h"
    3030
    31 FredBarrier * barrier;
     31FibreBarrier * barrier;
    3232struct __attribute__((aligned(128))) counter_t {
    3333        int value = 0;
     
    6666
    6767                const char * arg = optarg ? optarg : "";
     68                size_t len = 0;
    6869                char * end;
    6970                switch(opt) {
     
    110111
    111112                FibreInit();
    112                 barrier = new FredBarrier(nthreads + 1);
     113                barrier = new FibreBarrier(nthreads + 1);
    113114                {
    114115                        Context::CurrCluster().addWorkers(nprocs);
  • doc/theses/andrew_beach_MMath/performance.tex

    r50b29d9 r4f89e7b  
    160160% Match All     &   3719462155 &   43294042 &   3223004977 &  1286054154 &   623887874 \\
    161161% Match None    &   4971630929 &   55311709 &   9481225467 &  1310251289 &   623752624 \\
    162 %
    163 % run-algol-04-a
    164 % --------------
    165 % Raise Empty   & 0.0 & 0.0 &  3250260945 & 0.0 & 0.0 \\
    166 % Raise D'tor   & 0.0 & 0.0 & 29017675113 & N/A & N/A \\
    167 % Raise Finally & 0.0 & 0.0 &         N/A & 0.0 & 0.0 \\
    168 % Raise Other   & 0.0 & 0.0 & 24411823773 & 0.0 & 0.0 \\
    169 % Cross Handler & 0.0 & 0.0 &      769334 & 0.0 & 0.0 \\
    170 % Cross Finally & 0.0 & N/A &         N/A & 0.0 & 0.0 \\
    171 % Match All     & 0.0 & 0.0 &  3254283504 & 0.0 & 0.0 \\
    172 % Match None    & 0.0 & 0.0 &  9476060146 & 0.0 & 0.0 \\
    173 
    174162\begin{tabular}{|l|c c c c c|}
    175163\hline
     
    208196% Match All     &   3189512499 &   39124453 &   2667795989 &  1525889031 &   733785613 \\
    209197% Match None    &   4094675477 &   48749857 &   7850618572 &  1566713577 &   733478963 \\
    210 %
    211 % run-plg7a-04-a
    212 % --------------
    213 % 0.0 are unfilled.
    214 % Raise Empty   & 0.0 & 0.0 &  2770781479 & 0.0 & 0.0 \\
    215 % Raise D'tor   & 0.0 & 0.0 & 23530084907 & N/A & N/A \\
    216 % Raise Finally & 0.0 & 0.0 &         N/A & 0.0 & 0.0 \\
    217 % Raise Other   & 0.0 & 0.0 & 23816827982 & 0.0 & 0.0 \\
    218 % Cross Handler & 0.0 & 0.0 &     1422188 & 0.0 & 0.0 \\
    219 % Cross Finally & 0.0 & N/A &         N/A & 0.0 & 0.0 \\
    220 % Match All     & 0.0 & 0.0 &  2671989778 & 0.0 & 0.0 \\
    221 % Match None    & 0.0 & 0.0 &  7829059869 & 0.0 & 0.0 \\
    222198
    223199% PLG7A (in seconds)
     
    226202              & \CFA (Terminate) & \CFA (Resume) & \Cpp & Java & Python \\
    227203\hline
     204% Raise Empty   & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 \\
     205% Raise D'tor   & 0.0 & 0.0 & 0.0 & N/A & N/A \\
     206% Raise Finally & 0.0 & 0.0 & N/A & 0.0 & 0.0 \\
     207% Raise Other   & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 \\
     208% Cross Handler & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 \\
     209% Cross Finally & 0.0 & N/A & N/A & 0.0 & 0.0 \\
     210% Match All     & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 \\
     211% Match None    & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 \\
    228212Raise Empty   & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 \\
    229213Raise D'tor   & 0.0 & 0.0 & 0.0 & N/A & N/A \\
     
    247231
    248232This means that while \CFA does not actually keep up with Python in every
    249 case it is usually no worse than roughly half the speed of \Cpp. This is good
     233case it is no worse than roughly half the speed of \Cpp. This is good
    250234enough for the prototyping purposes of the project.
    251235
    252 The test case where \CFA falls short is Raise Other, the case where the
    253 stack is unwound including a bunch of non-matching handlers.
    254 This slowdown seems to come from missing optimizations,
    255 the results above came from gcc/g++ 10 (gcc as \CFA backend or g++ for \Cpp)
    256 but the results change if they are run in gcc/g++ 9 instead.
    257 Importantly, there is a huge slowdown in \Cpp's results bringing that brings
    258 \CFA's performace back in that roughly half speed area. However many other
    259 \CFA benchmarks increase their run-time by a similar amount falling far
    260 behind their \Cpp counter-parts.
    261 
    262 This suggests that the performance issue in Raise Other is just an
    263 optimization not being applied. Later versions of gcc may be able to
    264 optimize this case further, at least down to the half of \Cpp mark.
    265 A \CFA compiler that directly produced assembly could do even better as it
    266 would not have to work across some of \CFA's current abstractions, like
    267 the try terminate function.
     236One difference not shown is that optimizations in \CFA are very fragile.
     237The \CFA compiler uses gcc as part of its compilation process and the version
     238of gcc could change the speed of some of the benchmarks by 10 times or more.
     239Similar changes to g++ for the \Cpp benchmarks had no significant changes.
     240Because of the connection between gcc and g++, this suggests it is not the
     241optimizations that are changing but how the optimizer is detecting if the
     242optimizations can be applied. So the optimizations are always applied in
     243g++, but only newer versions of gcc can detect that they can be applied in
     244the more complex \CFA code.
    268245
    269246Resumption exception handling is also incredibly fast. Often an order of
  • libcfa/prelude/bootloader.cf

    r50b29d9 r4f89e7b  
    33char ** cfa_args_argv;
    44char ** cfa_args_envp;
    5 int cfa_main_returned = 0;
    65
    76int main(int argc, char* argv[], char* envp[]) {
     
    98        cfa_args_argv = argv;
    109        cfa_args_envp = envp;
    11         int ret = invoke_main(argc, argv, envp);
    12         cfa_main_returned = 1;
    13         return ret;
     10        return invoke_main(argc, argv, envp);
    1411}
  • libcfa/src/concurrency/invoke.h

    r50b29d9 r4f89e7b  
    170170                bool corctx_flag;
    171171
    172                 int last_cpu;
    173 
    174172                //SKULLDUGGERY errno is not save in the thread data structure because returnToKernel appears to be the only function to require saving and restoring it
    175173
  • libcfa/src/concurrency/kernel.cfa

    r50b29d9 r4f89e7b  
    394394        __builtin_prefetch( thrd_dst->context.SP );
    395395
    396         int curr = __kernel_getcpu();
    397         if(thrd_dst->last_cpu != curr) {
    398                 int64_t l = thrd_dst->last_cpu;
    399                 int64_t c = curr;
    400                 int64_t v = (l << 32) | c;
    401                 __push_stat( __tls_stats(), v, false, "Processor", this );
    402         }
    403 
    404         thrd_dst->last_cpu = curr;
    405 
    406396        __cfadbg_print_safe(runtime_core, "Kernel : core %p running thread %p (%s)\n", this, thrd_dst, thrd_dst->self_cor.name);
    407397
     
    480470                                #if !defined(__CFA_NO_STATISTICS__)
    481471                                        __tls_stats()->ready.threads.threads++;
     472                                        __push_stat( __tls_stats(), __tls_stats()->ready.threads.threads, false, "Processor", this );
    482473                                #endif
    483474                                // This is case 2, the racy case, someone tried to run this thread before it finished blocking
     
    497488        #if !defined(__CFA_NO_STATISTICS__)
    498489                __tls_stats()->ready.threads.threads--;
     490                __push_stat( __tls_stats(), __tls_stats()->ready.threads.threads, false, "Processor", this );
    499491        #endif
    500492
     
    578570                                __tls_stats()->ready.threads.extunpark++;
    579571                        }
     572                        __push_stat( __tls_stats(), __tls_stats()->ready.threads.threads, false, "Processor", kernelTLS().this_processor );
    580573                }
    581574                else {
    582575                        __atomic_fetch_add(&cl->stats->ready.threads.threads, 1, __ATOMIC_RELAXED);
    583576                        __atomic_fetch_add(&cl->stats->ready.threads.extunpark, 1, __ATOMIC_RELAXED);
     577                        __push_stat( cl->stats, cl->stats->ready.threads.threads, true, "Cluster", cl );
    584578                }
    585579        #endif
  • libcfa/src/concurrency/kernel.hfa

    r50b29d9 r4f89e7b  
    6767                unsigned target;
    6868                unsigned last;
    69                 unsigned cnt;
    7069                unsigned long long int cutoff;
    7170        } rdq;
  • libcfa/src/concurrency/kernel/startup.cfa

    r50b29d9 r4f89e7b  
    478478        state = Start;
    479479        self_cor{ info };
    480         last_cpu = __kernel_getcpu();
    481480        curr_cor = &self_cor;
    482481        curr_cluster = mainCluster;
  • libcfa/src/concurrency/ready_queue.cfa

    r50b29d9 r4f89e7b  
    345345                        /* paranoid */ verify(lanes.count < 65536); // The following code assumes max 65536 cores.
    346346                        /* paranoid */ verify(map.count < 65536); // The following code assumes max 65536 cores.
    347 
    348                         if(0 == (__tls_rand() % 10_000)) {
    349                                 proc->rdq.target = __tls_rand() % lanes.count;
     347                        uint64_t chaos = __tls_rand();
     348                        uint64_t high_chaos = (chaos >> 32);
     349                        uint64_t  mid_chaos = (chaos >> 16) & 0xffff;
     350                        uint64_t  low_chaos = chaos & 0xffff;
     351
     352                        unsigned me = map.self;
     353                        unsigned cpu_chaos = map.start + (mid_chaos % map.count);
     354                        bool global = cpu_chaos == me;
     355
     356                        if(global) {
     357                                proc->rdq.target = high_chaos % lanes.count;
    350358                        } else {
    351                                 unsigned cpu_chaos = map.start + (__tls_rand() % map.count);
    352                                 proc->rdq.target = (cpu_chaos * READYQ_SHARD_FACTOR) + (__tls_rand() % READYQ_SHARD_FACTOR);
     359                                proc->rdq.target = (cpu_chaos * READYQ_SHARD_FACTOR) + (low_chaos % READYQ_SHARD_FACTOR);
    353360                                /* paranoid */ verify(proc->rdq.target >= (map.start * READYQ_SHARD_FACTOR));
    354361                                /* paranoid */ verify(proc->rdq.target <  ((map.start + map.count) * READYQ_SHARD_FACTOR));
  • libcfa/src/concurrency/thread.cfa

    r50b29d9 r4f89e7b  
    3434        preempted = __NO_PREEMPTION;
    3535        corctx_flag = false;
    36         last_cpu = __kernel_getcpu();
    3736        curr_cor = &self_cor;
    3837        self_mon.owner = &this;
Note: See TracChangeset for help on using the changeset viewer.