Changes in / [50b29d9:4f89e7b]
- Files:
-
- 11 edited
Legend:
- Unmodified
- Added
- Removed
-
benchmark/readyQ/cycle.cfa
r50b29d9 r4f89e7b 85 85 } 86 86 87 printf("Duration (ms) : %'l f\n", (end - start)`dms);87 printf("Duration (ms) : %'ld\n", (end - start)`dms); 88 88 printf("Number of processors : %'d\n", nprocs); 89 89 printf("Number of threads : %'d\n", tthreads); -
benchmark/readyQ/locality.cfa
r50b29d9 r4f89e7b 126 126 // ================================================== 127 127 // Do some work by accessing 'cnt' cells in the array 128 __attribute__((noinline)) void work(MyData & data, size_t cnt _, uint64_t & state) {129 for (cnt _) {128 __attribute__((noinline)) void work(MyData & data, size_t cnt, uint64_t & state) { 129 for (cnt) { 130 130 access(data, __xorshift64(state)); 131 131 } … … 200 200 201 201 if( nspots == 0 ) { nspots = nthreads - nprocs; } 202 if( nspots == 0 ) {203 fprintf(stderr, "--nspots must be set or --nthreads set to something bigger than --nprocs\n");204 exit(EXIT_FAILURE);205 }206 202 207 203 Time start, end; -
benchmark/readyQ/yield.cpp
r50b29d9 r4f89e7b 29 29 #include "libfibre/fibre.h" 30 30 31 F redBarrier * barrier;31 FibreBarrier * barrier; 32 32 struct __attribute__((aligned(128))) counter_t { 33 33 int value = 0; … … 66 66 67 67 const char * arg = optarg ? optarg : ""; 68 size_t len = 0; 68 69 char * end; 69 70 switch(opt) { … … 110 111 111 112 FibreInit(); 112 barrier = new F redBarrier(nthreads + 1);113 barrier = new FibreBarrier(nthreads + 1); 113 114 { 114 115 Context::CurrCluster().addWorkers(nprocs); -
doc/theses/andrew_beach_MMath/performance.tex
r50b29d9 r4f89e7b 160 160 % Match All & 3719462155 & 43294042 & 3223004977 & 1286054154 & 623887874 \\ 161 161 % Match None & 4971630929 & 55311709 & 9481225467 & 1310251289 & 623752624 \\ 162 %163 % run-algol-04-a164 % --------------165 % Raise Empty & 0.0 & 0.0 & 3250260945 & 0.0 & 0.0 \\166 % Raise D'tor & 0.0 & 0.0 & 29017675113 & N/A & N/A \\167 % Raise Finally & 0.0 & 0.0 & N/A & 0.0 & 0.0 \\168 % Raise Other & 0.0 & 0.0 & 24411823773 & 0.0 & 0.0 \\169 % Cross Handler & 0.0 & 0.0 & 769334 & 0.0 & 0.0 \\170 % Cross Finally & 0.0 & N/A & N/A & 0.0 & 0.0 \\171 % Match All & 0.0 & 0.0 & 3254283504 & 0.0 & 0.0 \\172 % Match None & 0.0 & 0.0 & 9476060146 & 0.0 & 0.0 \\173 174 162 \begin{tabular}{|l|c c c c c|} 175 163 \hline … … 208 196 % Match All & 3189512499 & 39124453 & 2667795989 & 1525889031 & 733785613 \\ 209 197 % Match None & 4094675477 & 48749857 & 7850618572 & 1566713577 & 733478963 \\ 210 %211 % run-plg7a-04-a212 % --------------213 % 0.0 are unfilled.214 % Raise Empty & 0.0 & 0.0 & 2770781479 & 0.0 & 0.0 \\215 % Raise D'tor & 0.0 & 0.0 & 23530084907 & N/A & N/A \\216 % Raise Finally & 0.0 & 0.0 & N/A & 0.0 & 0.0 \\217 % Raise Other & 0.0 & 0.0 & 23816827982 & 0.0 & 0.0 \\218 % Cross Handler & 0.0 & 0.0 & 1422188 & 0.0 & 0.0 \\219 % Cross Finally & 0.0 & N/A & N/A & 0.0 & 0.0 \\220 % Match All & 0.0 & 0.0 & 2671989778 & 0.0 & 0.0 \\221 % Match None & 0.0 & 0.0 & 7829059869 & 0.0 & 0.0 \\222 198 223 199 % PLG7A (in seconds) … … 226 202 & \CFA (Terminate) & \CFA (Resume) & \Cpp & Java & Python \\ 227 203 \hline 204 % Raise Empty & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 \\ 205 % Raise D'tor & 0.0 & 0.0 & 0.0 & N/A & N/A \\ 206 % Raise Finally & 0.0 & 0.0 & N/A & 0.0 & 0.0 \\ 207 % Raise Other & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 \\ 208 % Cross Handler & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 \\ 209 % Cross Finally & 0.0 & N/A & N/A & 0.0 & 0.0 \\ 210 % Match All & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 \\ 211 % Match None & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 \\ 228 212 Raise Empty & 0.0 & 0.0 & 0.0 & 
0.0 & 0.0 \\ 229 213 Raise D'tor & 0.0 & 0.0 & 0.0 & N/A & N/A \\ … … 247 231 248 232 This means that while \CFA does not actually keep up with Python in every 249 case it is usually no worse than roughly half the speed of \Cpp. This is good 233 case it is no worse than roughly half the speed of \Cpp. This is good 250 234 enough for the prototyping purposes of the project. 251 235 252 The test case where \CFA falls short is Raise Other, the case where the 253 stack is unwound including a bunch of non-matching handlers. 254 This slowdown seems to come from missing optimizations, 255 the results above came from gcc/g++ 10 (gcc as \CFA backend or g++ for \Cpp) 256 but the results change if they are run in gcc/g++ 9 instead. 257 Importantly, there is a huge slowdown in \Cpp's results that brings 258 \CFA's performance back in that roughly half speed area. However many other 259 \CFA benchmarks increase their run-time by a similar amount falling far 260 behind their \Cpp counter-parts. 261 262 This suggests that the performance issue in Raise Other is just an 263 optimization not being applied. Later versions of gcc may be able to 264 optimize this case further, at least down to the half of \Cpp mark. 265 A \CFA compiler that directly produced assembly could do even better as it 266 would not have to work across some of \CFA's current abstractions, like 267 the try terminate function. 236 One difference not shown is that optimizations in \CFA are very fragile. 237 The \CFA compiler uses gcc as part of its compilation process and the version 238 of gcc could change the speed of some of the benchmarks by 10 times or more. 239 Similar changes to g++ for the \Cpp benchmarks had no significant changes. 240 Because of the connection between gcc and g++, this suggests it is not the 241 optimizations that are changing but how the optimizer is detecting if the 242 optimizations can be applied. 
So the optimizations are always applied in 243 g++, but only newer versions of gcc can detect that they can be applied in 244 the more complex \CFA code. 268 245 269 246 Resumption exception handling is also incredibly fast. Often an order of -
libcfa/prelude/bootloader.cf
r50b29d9 r4f89e7b 3 3 char ** cfa_args_argv; 4 4 char ** cfa_args_envp; 5 int cfa_main_returned = 0;6 5 7 6 int main(int argc, char* argv[], char* envp[]) { … … 9 8 cfa_args_argv = argv; 10 9 cfa_args_envp = envp; 11 int ret = invoke_main(argc, argv, envp); 12 cfa_main_returned = 1; 13 return ret; 10 return invoke_main(argc, argv, envp); 14 11 } -
libcfa/src/concurrency/invoke.h
r50b29d9 r4f89e7b 170 170 bool corctx_flag; 171 171 172 int last_cpu;173 174 172 //SKULLDUGGERY errno is not save in the thread data structure because returnToKernel appears to be the only function to require saving and restoring it 175 173 -
libcfa/src/concurrency/kernel.cfa
r50b29d9 r4f89e7b 394 394 __builtin_prefetch( thrd_dst->context.SP ); 395 395 396 int curr = __kernel_getcpu();397 if(thrd_dst->last_cpu != curr) {398 int64_t l = thrd_dst->last_cpu;399 int64_t c = curr;400 int64_t v = (l << 32) | c;401 __push_stat( __tls_stats(), v, false, "Processor", this );402 }403 404 thrd_dst->last_cpu = curr;405 406 396 __cfadbg_print_safe(runtime_core, "Kernel : core %p running thread %p (%s)\n", this, thrd_dst, thrd_dst->self_cor.name); 407 397 … … 480 470 #if !defined(__CFA_NO_STATISTICS__) 481 471 __tls_stats()->ready.threads.threads++; 472 __push_stat( __tls_stats(), __tls_stats()->ready.threads.threads, false, "Processor", this ); 482 473 #endif 483 474 // This is case 2, the racy case, someone tried to run this thread before it finished blocking … … 497 488 #if !defined(__CFA_NO_STATISTICS__) 498 489 __tls_stats()->ready.threads.threads--; 490 __push_stat( __tls_stats(), __tls_stats()->ready.threads.threads, false, "Processor", this ); 499 491 #endif 500 492 … … 578 570 __tls_stats()->ready.threads.extunpark++; 579 571 } 572 __push_stat( __tls_stats(), __tls_stats()->ready.threads.threads, false, "Processor", kernelTLS().this_processor ); 580 573 } 581 574 else { 582 575 __atomic_fetch_add(&cl->stats->ready.threads.threads, 1, __ATOMIC_RELAXED); 583 576 __atomic_fetch_add(&cl->stats->ready.threads.extunpark, 1, __ATOMIC_RELAXED); 577 __push_stat( cl->stats, cl->stats->ready.threads.threads, true, "Cluster", cl ); 584 578 } 585 579 #endif -
libcfa/src/concurrency/kernel.hfa
r50b29d9 r4f89e7b 67 67 unsigned target; 68 68 unsigned last; 69 unsigned cnt;70 69 unsigned long long int cutoff; 71 70 } rdq; -
libcfa/src/concurrency/kernel/startup.cfa
r50b29d9 r4f89e7b 478 478 state = Start; 479 479 self_cor{ info }; 480 last_cpu = __kernel_getcpu();481 480 curr_cor = &self_cor; 482 481 curr_cluster = mainCluster; -
libcfa/src/concurrency/ready_queue.cfa
r50b29d9 r4f89e7b 345 345 /* paranoid */ verify(lanes.count < 65536); // The following code assumes max 65536 cores. 346 346 /* paranoid */ verify(map.count < 65536); // The following code assumes max 65536 cores. 347 348 if(0 == (__tls_rand() % 10_000)) { 349 proc->rdq.target = __tls_rand() % lanes.count; 347 uint64_t chaos = __tls_rand(); 348 uint64_t high_chaos = (chaos >> 32); 349 uint64_t mid_chaos = (chaos >> 16) & 0xffff; 350 uint64_t low_chaos = chaos & 0xffff; 351 352 unsigned me = map.self; 353 unsigned cpu_chaos = map.start + (mid_chaos % map.count); 354 bool global = cpu_chaos == me; 355 356 if(global) { 357 proc->rdq.target = high_chaos % lanes.count; 350 358 } else { 351 unsigned cpu_chaos = map.start + (__tls_rand() % map.count); 352 proc->rdq.target = (cpu_chaos * READYQ_SHARD_FACTOR) + (__tls_rand() % READYQ_SHARD_FACTOR); 359 proc->rdq.target = (cpu_chaos * READYQ_SHARD_FACTOR) + (low_chaos % READYQ_SHARD_FACTOR); 353 360 /* paranoid */ verify(proc->rdq.target >= (map.start * READYQ_SHARD_FACTOR)); 354 361 /* paranoid */ verify(proc->rdq.target < ((map.start + map.count) * READYQ_SHARD_FACTOR)); -
libcfa/src/concurrency/thread.cfa
r50b29d9 r4f89e7b 34 34 preempted = __NO_PREEMPTION; 35 35 corctx_flag = false; 36 last_cpu = __kernel_getcpu();37 36 curr_cor = &self_cor; 38 37 self_mon.owner = &this;
Note: See TracChangeset
for help on using the changeset viewer.