Changeset 50b29d9
- Timestamp:
- Aug 9, 2021, 4:21:44 PM (3 years ago)
- Branches:
- ADT, ast-experimental, enum, forall-pointer-decay, jacob/cs343-translation, master, new-ast-unique-expr, pthread-emulation, qualifiedEnum
- Children:
- d874f59
- Parents:
- 4f89e7b (diff), 5438e41 (diff)
Note: this is a merge changeset, the changes displayed below correspond to the merge itself.
Use the (diff) links above to see all the changes relative to each parent.
- Files:
-
- 11 edited
Legend:
- Unmodified
- Added
- Removed
-
benchmark/readyQ/cycle.cfa
r4f89e7b r50b29d9 85 85 } 86 86 87 printf("Duration (ms) : %'l d\n", (end - start)`dms);87 printf("Duration (ms) : %'lf\n", (end - start)`dms); 88 88 printf("Number of processors : %'d\n", nprocs); 89 89 printf("Number of threads : %'d\n", tthreads); -
benchmark/readyQ/locality.cfa
r4f89e7b r50b29d9 126 126 // ================================================== 127 127 // Do some work by accessing 'cnt' cells in the array 128 __attribute__((noinline)) void work(MyData & data, size_t cnt , uint64_t & state) {129 for (cnt ) {128 __attribute__((noinline)) void work(MyData & data, size_t cnt_, uint64_t & state) { 129 for (cnt_) { 130 130 access(data, __xorshift64(state)); 131 131 } … … 200 200 201 201 if( nspots == 0 ) { nspots = nthreads - nprocs; } 202 if( nspots == 0 ) { 203 fprintf(stderr, "--nspots must be set or --nthreads set to something bigger than --nprocs\n"); 204 exit(EXIT_FAILURE); 205 } 202 206 203 207 Time start, end; -
benchmark/readyQ/yield.cpp
r4f89e7b r50b29d9 29 29 #include "libfibre/fibre.h" 30 30 31 F ibreBarrier * barrier;31 FredBarrier * barrier; 32 32 struct __attribute__((aligned(128))) counter_t { 33 33 int value = 0; … … 66 66 67 67 const char * arg = optarg ? optarg : ""; 68 size_t len = 0;69 68 char * end; 70 69 switch(opt) { … … 111 110 112 111 FibreInit(); 113 barrier = new F ibreBarrier(nthreads + 1);112 barrier = new FredBarrier(nthreads + 1); 114 113 { 115 114 Context::CurrCluster().addWorkers(nprocs); -
doc/theses/andrew_beach_MMath/performance.tex
r4f89e7b r50b29d9 160 160 % Match All & 3719462155 & 43294042 & 3223004977 & 1286054154 & 623887874 \\ 161 161 % Match None & 4971630929 & 55311709 & 9481225467 & 1310251289 & 623752624 \\ 162 % 163 % run-algol-04-a 164 % -------------- 165 % Raise Empty & 0.0 & 0.0 & 3250260945 & 0.0 & 0.0 \\ 166 % Raise D'tor & 0.0 & 0.0 & 29017675113 & N/A & N/A \\ 167 % Raise Finally & 0.0 & 0.0 & N/A & 0.0 & 0.0 \\ 168 % Raise Other & 0.0 & 0.0 & 24411823773 & 0.0 & 0.0 \\ 169 % Cross Handler & 0.0 & 0.0 & 769334 & 0.0 & 0.0 \\ 170 % Cross Finally & 0.0 & N/A & N/A & 0.0 & 0.0 \\ 171 % Match All & 0.0 & 0.0 & 3254283504 & 0.0 & 0.0 \\ 172 % Match None & 0.0 & 0.0 & 9476060146 & 0.0 & 0.0 \\ 173 162 174 \begin{tabular}{|l|c c c c c|} 163 175 \hline … … 196 208 % Match All & 3189512499 & 39124453 & 2667795989 & 1525889031 & 733785613 \\ 197 209 % Match None & 4094675477 & 48749857 & 7850618572 & 1566713577 & 733478963 \\ 210 % 211 % run-plg7a-04-a 212 % -------------- 213 % 0.0 are unfilled. 214 % Raise Empty & 0.0 & 0.0 & 2770781479 & 0.0 & 0.0 \\ 215 % Raise D'tor & 0.0 & 0.0 & 23530084907 & N/A & N/A \\ 216 % Raise Finally & 0.0 & 0.0 & N/A & 0.0 & 0.0 \\ 217 % Raise Other & 0.0 & 0.0 & 23816827982 & 0.0 & 0.0 \\ 218 % Cross Handler & 0.0 & 0.0 & 1422188 & 0.0 & 0.0 \\ 219 % Cross Finally & 0.0 & N/A & N/A & 0.0 & 0.0 \\ 220 % Match All & 0.0 & 0.0 & 2671989778 & 0.0 & 0.0 \\ 221 % Match None & 0.0 & 0.0 & 7829059869 & 0.0 & 0.0 \\ 198 222 199 223 % PLG7A (in seconds) … … 202 226 & \CFA (Terminate) & \CFA (Resume) & \Cpp & Java & Python \\ 203 227 \hline 204 % Raise Empty & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 \\205 % Raise D'tor & 0.0 & 0.0 & 0.0 & N/A & N/A \\206 % Raise Finally & 0.0 & 0.0 & N/A & 0.0 & 0.0 \\207 % Raise Other & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 \\208 % Cross Handler & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 \\209 % Cross Finally & 0.0 & N/A & N/A & 0.0 & 0.0 \\210 % Match All & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 \\211 % Match None & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 \\212 228 Raise Empty & 0.0 
& 0.0 & 0.0 & 0.0 & 0.0 \\ 213 229 Raise D'tor & 0.0 & 0.0 & 0.0 & N/A & N/A \\ … … 231 247 232 248 This means that while \CFA does not actually keep up with Python in every 233 case it is no worse than roughly half the speed of \Cpp. This is good249 case it is usually no worse than roughly half the speed of \Cpp. This is good 234 250 enough for the prototyping purposes of the project. 235 251 236 One difference not shown is that optimizations in \CFA is very fragile. 237 The \CFA compiler uses gcc as part of its complation process and the version 238 of gcc could change the speed of some of the benchmarks by 10 times or more. 239 Similar changes to g++ for the \Cpp benchmarks had no significant changes. 240 Because of the connection between gcc and g++; this suggests it is not the 241 optimizations that are changing but how the optimizer is detecting if the 242 optimizations can be applied. So the optimizations are always applied in 243 g++, but only newer versions of gcc can detect that they can be applied in 244 the more complex \CFA code. 252 The test case where \CFA falls short is Raise Other, the case where the 253 stack is unwound including a bunch of non-matching handlers. 254 This slowdown seems to come from missing optimizations, 255 the results above came from gcc/g++ 10 (gcc as \CFA backend or g++ for \Cpp) 256 but the results change if they are run in gcc/g++ 9 instead. 257 Importantly, there is a huge slowdown in \Cpp's results bringing that brings 258 \CFA's performace back in that roughly half speed area. However many other 259 \CFA benchmarks increase their run-time by a similar amount falling far 260 behind their \Cpp counter-parts. 261 262 This suggests that the performance issue in Raise Other is just an 263 optimization not being applied. Later versions of gcc may be able to 264 optimize this case further, at least down to the half of \Cpp mark. 
265 A \CFA compiler that directly produced assembly could do even better as it 266 would not have to work across some of \CFA's current abstractions, like 267 the try terminate function. 245 268 246 269 Resumption exception handling is also incredibly fast. Often an order of -
libcfa/prelude/bootloader.cf
r4f89e7b r50b29d9 3 3 char ** cfa_args_argv; 4 4 char ** cfa_args_envp; 5 int cfa_main_returned = 0; 5 6 6 7 int main(int argc, char* argv[], char* envp[]) { … … 8 9 cfa_args_argv = argv; 9 10 cfa_args_envp = envp; 10 return invoke_main(argc, argv, envp); 11 int ret = invoke_main(argc, argv, envp); 12 cfa_main_returned = 1; 13 return ret; 11 14 } -
libcfa/src/concurrency/invoke.h
r4f89e7b r50b29d9 170 170 bool corctx_flag; 171 171 172 int last_cpu; 173 172 174 //SKULLDUGGERY errno is not save in the thread data structure because returnToKernel appears to be the only function to require saving and restoring it 173 175 -
libcfa/src/concurrency/kernel.cfa
r4f89e7b r50b29d9 394 394 __builtin_prefetch( thrd_dst->context.SP ); 395 395 396 int curr = __kernel_getcpu(); 397 if(thrd_dst->last_cpu != curr) { 398 int64_t l = thrd_dst->last_cpu; 399 int64_t c = curr; 400 int64_t v = (l << 32) | c; 401 __push_stat( __tls_stats(), v, false, "Processor", this ); 402 } 403 404 thrd_dst->last_cpu = curr; 405 396 406 __cfadbg_print_safe(runtime_core, "Kernel : core %p running thread %p (%s)\n", this, thrd_dst, thrd_dst->self_cor.name); 397 407 … … 470 480 #if !defined(__CFA_NO_STATISTICS__) 471 481 __tls_stats()->ready.threads.threads++; 472 __push_stat( __tls_stats(), __tls_stats()->ready.threads.threads, false, "Processor", this );473 482 #endif 474 483 // This is case 2, the racy case, someone tried to run this thread before it finished blocking … … 488 497 #if !defined(__CFA_NO_STATISTICS__) 489 498 __tls_stats()->ready.threads.threads--; 490 __push_stat( __tls_stats(), __tls_stats()->ready.threads.threads, false, "Processor", this );491 499 #endif 492 500 … … 570 578 __tls_stats()->ready.threads.extunpark++; 571 579 } 572 __push_stat( __tls_stats(), __tls_stats()->ready.threads.threads, false, "Processor", kernelTLS().this_processor );573 580 } 574 581 else { 575 582 __atomic_fetch_add(&cl->stats->ready.threads.threads, 1, __ATOMIC_RELAXED); 576 583 __atomic_fetch_add(&cl->stats->ready.threads.extunpark, 1, __ATOMIC_RELAXED); 577 __push_stat( cl->stats, cl->stats->ready.threads.threads, true, "Cluster", cl );578 584 } 579 585 #endif -
libcfa/src/concurrency/kernel.hfa
r4f89e7b r50b29d9 67 67 unsigned target; 68 68 unsigned last; 69 unsigned cnt; 69 70 unsigned long long int cutoff; 70 71 } rdq; -
libcfa/src/concurrency/kernel/startup.cfa
r4f89e7b r50b29d9 478 478 state = Start; 479 479 self_cor{ info }; 480 last_cpu = __kernel_getcpu(); 480 481 curr_cor = &self_cor; 481 482 curr_cluster = mainCluster; -
libcfa/src/concurrency/ready_queue.cfa
r4f89e7b r50b29d9 345 345 /* paranoid */ verify(lanes.count < 65536); // The following code assumes max 65536 cores. 346 346 /* paranoid */ verify(map.count < 65536); // The following code assumes max 65536 cores. 347 uint64_t chaos = __tls_rand(); 348 uint64_t high_chaos = (chaos >> 32); 349 uint64_t mid_chaos = (chaos >> 16) & 0xffff; 350 uint64_t low_chaos = chaos & 0xffff; 351 352 unsigned me = map.self; 353 unsigned cpu_chaos = map.start + (mid_chaos % map.count); 354 bool global = cpu_chaos == me; 355 356 if(global) { 357 proc->rdq.target = high_chaos % lanes.count; 347 348 if(0 == (__tls_rand() % 10_000)) { 349 proc->rdq.target = __tls_rand() % lanes.count; 358 350 } else { 359 proc->rdq.target = (cpu_chaos * READYQ_SHARD_FACTOR) + (low_chaos % READYQ_SHARD_FACTOR); 351 unsigned cpu_chaos = map.start + (__tls_rand() % map.count); 352 proc->rdq.target = (cpu_chaos * READYQ_SHARD_FACTOR) + (__tls_rand() % READYQ_SHARD_FACTOR); 360 353 /* paranoid */ verify(proc->rdq.target >= (map.start * READYQ_SHARD_FACTOR)); 361 354 /* paranoid */ verify(proc->rdq.target < ((map.start + map.count) * READYQ_SHARD_FACTOR)); -
libcfa/src/concurrency/thread.cfa
r4f89e7b r50b29d9 34 34 preempted = __NO_PREEMPTION; 35 35 corctx_flag = false; 36 last_cpu = __kernel_getcpu(); 36 37 curr_cor = &self_cor; 37 38 self_mon.owner = &this;
Note: See TracChangeset
for help on using the changeset viewer.