Aug 9, 2021, 4:21:44 PM (16 months ago)
enum, forall-pointer-decay, jacob/cs343-translation, master, new-ast-unique-expr, pthread-emulation, qualifiedEnum
d874f59
4f89e7b (diff), 5438e41 (diff)
Merge branch 'master' of plg.uwaterloo.ca:software/cfa/cfa-cc

 r4f89e7b } printf("Duration (ms)        : %'ld\n", (end - start)dms); printf("Duration (ms)        : %'lf\n", (end - start)dms); printf("Number of processors : %'d\n", nprocs); printf("Number of threads    : %'d\n", tthreads);

 r4f89e7b // ================================================== // Do some work by accessing 'cnt' cells in the array __attribute__((noinline)) void work(MyData & data, size_t cnt, uint64_t & state) { for (cnt) { __attribute__((noinline)) void work(MyData & data, size_t cnt_, uint64_t & state) { for (cnt_) { access(data, __xorshift64(state)); } if( nspots == 0 ) { nspots = nthreads - nprocs; } if( nspots == 0 ) { fprintf(stderr, "--nspots must be set or --nthreads set to something bigger than --nprocs\n"); exit(EXIT_FAILURE); } Time start, end;

 r4f89e7b #include "libfibre/fibre.h" FibreBarrier * barrier; FredBarrier * barrier; struct __attribute__((aligned(128))) counter_t { int value = 0; const char * arg = optarg ? optarg : ""; size_t len = 0; char * end; switch(opt) { FibreInit(); barrier = new FibreBarrier(nthreads + 1); barrier = new FredBarrier(nthreads + 1); { Context::CurrCluster().addWorkers(nprocs);
• ## doc/theses/andrew_beach_MMath/performance.tex

 r4f89e7b % Match All     &   3719462155 &   43294042 &   3223004977 &  1286054154 &   623887874 \\ % Match None    &   4971630929 &   55311709 &   9481225467 &  1310251289 &   623752624 \\ % % run-algol-04-a % -------------- % Raise Empty   & 0.0 & 0.0 &  3250260945 & 0.0 & 0.0 \\ % Raise D'tor   & 0.0 & 0.0 & 29017675113 & N/A & N/A \\ % Raise Finally & 0.0 & 0.0 &         N/A & 0.0 & 0.0 \\ % Raise Other   & 0.0 & 0.0 & 24411823773 & 0.0 & 0.0 \\ % Cross Handler & 0.0 & 0.0 &      769334 & 0.0 & 0.0 \\ % Cross Finally & 0.0 & N/A &         N/A & 0.0 & 0.0 \\ % Match All     & 0.0 & 0.0 &  3254283504 & 0.0 & 0.0 \\ % Match None    & 0.0 & 0.0 &  9476060146 & 0.0 & 0.0 \\ \begin{tabular}{|l|c c c c c|} \hline % Match All     &   3189512499 &   39124453 &   2667795989 &  1525889031 &   733785613 \\ % Match None    &   4094675477 &   48749857 &   7850618572 &  1566713577 &   733478963 \\ % % run-plg7a-04-a % -------------- % 0.0 are unfilled. % Raise Empty   & 0.0 & 0.0 &  2770781479 & 0.0 & 0.0 \\ % Raise D'tor   & 0.0 & 0.0 & 23530084907 & N/A & N/A \\ % Raise Finally & 0.0 & 0.0 &         N/A & 0.0 & 0.0 \\ % Raise Other   & 0.0 & 0.0 & 23816827982 & 0.0 & 0.0 \\ % Cross Handler & 0.0 & 0.0 &     1422188 & 0.0 & 0.0 \\ % Cross Finally & 0.0 & N/A &         N/A & 0.0 & 0.0 \\ % Match All     & 0.0 & 0.0 &  2671989778 & 0.0 & 0.0 \\ % Match None    & 0.0 & 0.0 &  7829059869 & 0.0 & 0.0 \\ % PLG7A (in seconds) & \CFA (Terminate) & \CFA (Resume) & \Cpp & Java & Python \\ \hline % Raise Empty   & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 \\ % Raise D'tor   & 0.0 & 0.0 & 0.0 & N/A & N/A \\ % Raise Finally & 0.0 & 0.0 & N/A & 0.0 & 0.0 \\ % Raise Other   & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 \\ % Cross Handler & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 \\ % Cross Finally & 0.0 & N/A & N/A & 0.0 & 0.0 \\ % Match All     & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 \\ % Match None    & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 \\ Raise Empty   & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 \\ Raise D'tor   & 0.0 & 0.0 & 0.0 & N/A & N/A \\ This means 
that while \CFA does not actually keep up with Python in every case it is no worse than roughly half the speed of \Cpp. This is good case it is usually no worse than roughly half the speed of \Cpp. This is good enough for the prototyping purposes of the project. One difference not shown is that optimizations in \CFA are very fragile. The \CFA compiler uses gcc as part of its compilation process and the version of gcc could change the speed of some of the benchmarks by 10 times or more. Similar changes to g++ for the \Cpp benchmarks made no significant difference. Because of the connection between gcc and g++, this suggests it is not the optimizations that are changing but how the optimizer is detecting if the optimizations can be applied. So the optimizations are always applied in g++, but only newer versions of gcc can detect that they can be applied in the more complex \CFA code. The test case where \CFA falls short is Raise Other, the case where the stack is unwound including a bunch of non-matching handlers. This slowdown seems to come from missing optimizations; the results above came from gcc/g++ 10 (gcc as \CFA backend or g++ for \Cpp) but the results change if they are run in gcc/g++ 9 instead. Importantly, there is a huge slowdown in \Cpp's results that brings \CFA's performance back in that roughly half speed area. However, many other \CFA benchmarks increase their run-time by a similar amount, falling far behind their \Cpp counterparts. This suggests that the performance issue in Raise Other is just an optimization not being applied. Later versions of gcc may be able to optimize this case further, at least down to the half of \Cpp mark. A \CFA compiler that directly produced assembly could do even better as it would not have to work across some of \CFA's current abstractions, like the try terminate function. Resumption exception handling is also incredibly fast. Often an order of

 r4f89e7b char ** cfa_args_argv; char ** cfa_args_envp; int cfa_main_returned = 0; int main(int argc, char* argv[], char* envp[]) { cfa_args_argv = argv; cfa_args_envp = envp; return invoke_main(argc, argv, envp); int ret = invoke_main(argc, argv, envp); cfa_main_returned = 1; return ret; }
• ## libcfa/src/concurrency/invoke.h

 r4f89e7b bool corctx_flag; int last_cpu; //SKULLDUGGERY errno is not saved in the thread data structure because returnToKernel appears to be the only function to require saving and restoring it