Changeset 089d30c for libcfa/src
- Timestamp: Sep 23, 2021, 1:02:46 PM
- Branches: ADT, ast-experimental, enum, forall-pointer-decay, master, pthread-emulation, qualifiedEnum
- Children: fcd65ca
- Parents: bc4a433
- Location: libcfa/src/concurrency
- Files: 3 edited
Legend:
- Unmodified (no prefix)
- Added ("+")
- Removed ("-")
libcfa/src/concurrency/kernel.hfa (rbc4a433 → r089d30c)

@@ -151,6 +151,14 @@
 struct __attribute__((aligned(128))) __timestamp_t {
 	volatile unsigned long long tv;
+	volatile unsigned long long ma;
 };
 
-static inline void ?{}(__timestamp_t & this) { this.tv = 0; }
+// Aligned timestamps which are used by the relaxed ready queue
+struct __attribute__((aligned(128))) __help_cnts_t {
+	volatile unsigned long long src;
+	volatile unsigned long long dst;
+	volatile unsigned long long tri;
+};
+
+static inline void ?{}(__timestamp_t & this) { this.tv = 0; this.ma = 0; }
 static inline void ^?{}(__timestamp_t & this) {}
@@ -169,4 +177,7 @@
 	// Array of times
 	__timestamp_t * volatile tscs;
+
+	// Array of stats
+	__help_cnts_t * volatile help;
 
 	// Number of lanes (empty or not)
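The new `ma` field extends each per-lane timestamp with a moving average of the lane's age, and `__help_cnts_t` adds per-CPU helping counters (`src`, `dst`, `tri`). Both structs carry `aligned(128)`, so each record owns its own pair of 64-byte cache lines and processors updating different lanes never false-share. A minimal plain-C sketch (hypothetical names, GCC/Clang attribute syntax) illustrating that layout property:

    /* Sketch only: mirrors the changeset's per-lane record layout. */
    #include <stdio.h>

    struct __attribute__((aligned(128))) timestamp_t {
        volatile unsigned long long tv;  /* raw timestamp of last pop   */
        volatile unsigned long long ma;  /* moving average of lane age  */
    };

    int main(void) {
        struct timestamp_t tscs[4];
        /* aligned(128) pads sizeof up to 128, so adjacent entries land
         * on distinct pairs of cache lines. */
        printf("sizeof = %zu, alignof = %zu\n",
               sizeof tscs[0], _Alignof(struct timestamp_t));
        printf("stride = %td bytes\n", (char *)&tscs[1] - (char *)&tscs[0]);
        return 0;
    }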
libcfa/src/concurrency/ready_queue.cfa (rbc4a433 → r089d30c)

@@ -246,4 +246,11 @@
 // Cforall Ready Queue used for scheduling
 //=======================================================================
+unsigned long long moving_average(unsigned long long nval, unsigned long long oval) {
+	const unsigned long long tw = 16;
+	const unsigned long long nw = 4;
+	const unsigned long long ow = tw - nw;
+	return ((nw * nval) + (ow * oval)) / tw;
+}
+
 void ?{}(__ready_queue_t & this) with (this) {
 	#if defined(USE_CPU_WORK_STEALING)
@@ -251,12 +258,20 @@
 		lanes.data = alloc( lanes.count );
 		lanes.tscs = alloc( lanes.count );
+		lanes.help = alloc( cpu_info.hthrd_count );
 
 		for( idx; (size_t)lanes.count ) {
 			(lanes.data[idx]){};
 			lanes.tscs[idx].tv = rdtscl();
+			lanes.tscs[idx].ma = rdtscl();
+		}
+		for( idx; (size_t)cpu_info.hthrd_count ) {
+			lanes.help[idx].src = 0;
+			lanes.help[idx].dst = 0;
+			lanes.help[idx].tri = 0;
 		}
 	#else
 		lanes.data = 0p;
 		lanes.tscs = 0p;
+		lanes.help = 0p;
 		lanes.count = 0;
 	#endif
@@ -270,3 +285,4 @@
 	free(lanes.data);
 	free(lanes.tscs);
+	free(lanes.help);
 }
@@ -332,19 +348,18 @@
 	processor * const proc = kernelTLS().this_processor;
 	const int start = map.self * READYQ_SHARD_FACTOR;
+	const unsigned long long ctsc = rdtscl();
 
 	// Did we already have a help target
 	if(proc->rdq.target == -1u) {
-		// if We don't have a
-		unsigned long long min = ts(lanes.data[start]);
+		unsigned long long max = 0;
 		for(i; READYQ_SHARD_FACTOR) {
-			unsigned long long tsc = ts(lanes.data[start + i]);
-			if(tsc < min) min = tsc;
-		}
-		proc->rdq.cutoff = min;
-
+			unsigned long long tsc = moving_average(ctsc - ts(lanes.data[start + i]), lanes.tscs[start + i].ma);
+			if(tsc > max) max = tsc;
+		}
+		proc->rdq.cutoff = (max + 2 * max) / 2;
 		/* paranoid */ verify(lanes.count < 65536); // The following code assumes max 65536 cores.
 		/* paranoid */ verify(map.count < 65536); // The following code assumes max 65536 cores.
 
-		if(0 == (__tls_rand() % 10_000)) {
+		if(0 == (__tls_rand() % 100)) {
 			proc->rdq.target = __tls_rand() % lanes.count;
 		} else {
@@ -358,13 +373,20 @@
 	}
 	else {
-		const unsigned long long bias = 0; //2_500_000_000;
-		const unsigned long long cutoff = proc->rdq.cutoff > bias ? proc->rdq.cutoff - bias : proc->rdq.cutoff;
+		unsigned long long max = 0;
+		for(i; READYQ_SHARD_FACTOR) {
+			unsigned long long tsc = moving_average(ctsc - ts(lanes.data[start + i]), lanes.tscs[start + i].ma);
+			if(tsc > max) max = tsc;
+		}
+		const unsigned long long cutoff = (max + 2 * max) / 2;
 		{
 			unsigned target = proc->rdq.target;
 			proc->rdq.target = -1u;
-			if(lanes.tscs[target].tv < cutoff && ts(lanes.data[target]) < cutoff) {
+			lanes.help[target].tri++;
+			if(moving_average(ctsc - lanes.tscs[target].tv, lanes.tscs[target].ma) > cutoff) {
 				thread$ * t = try_pop(cltr, target __STATS(, __tls_stats()->ready.pop.help));
 				proc->rdq.last = target;
 				if(t) return t;
+				else proc->rdq.target = -1u;
 			}
+			else proc->rdq.target = -1u;
 		}
@@ -645,4 +667,5 @@
 	// Actually pop the list
 	struct thread$ * thrd;
+	unsigned long long tsc_before = ts(lane);
 	unsigned long long tsv;
 	[thrd, tsv] = pop(lane);
@@ -658,6 +681,8 @@
 	__STATS( stats.success++; )
 
-	#if defined(USE_WORK_STEALING)
+	#if defined(USE_WORK_STEALING) || defined(USE_CPU_WORK_STEALING)
+		unsigned long long now = rdtscl();
 		lanes.tscs[w].tv = tsv;
+		lanes.tscs[w].ma = moving_average(now > tsc_before ? now - tsc_before : 0, lanes.tscs[w].ma);
 	#endif
 
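The commit's `moving_average` is an exponentially weighted average in integer arithmetic: each new sample carries weight `nw/tw = 4/16` (25%) and the running average keeps the remaining 75%, so a single stale reading cannot swing the estimate. The helping cutoff `(max + 2 * max) / 2` then works out to `1.5 × max` over the local shards' averaged ages. A standalone C sketch of both calculations (the constant 1000-cycle sample stream is made up for illustration):

    /* Sketch: the weighting scheme from the changeset, driven by fake data. */
    #include <stdio.h>

    unsigned long long moving_average(unsigned long long nval, unsigned long long oval) {
        const unsigned long long tw = 16;      /* total weight             */
        const unsigned long long nw = 4;       /* weight of the new sample */
        const unsigned long long ow = tw - nw; /* weight of the old value  */
        return ((nw * nval) + (ow * oval)) / tw;
    }

    int main(void) {
        /* Feed a constant "lane age" of 1000 cycles into an average seeded
         * at 0; each step closes ~25% of the remaining gap. */
        unsigned long long ma = 0;
        for (int i = 1; i <= 8; i++) {
            ma = moving_average(1000, ma);
            printf("sample %d: ma = %llu\n", i, ma);
        }
        /* The cutoff is integer arithmetic for 1.5x the worst local lane:
         * (max + 2 * max) / 2 == 3 * max / 2. */
        unsigned long long max = ma;
        printf("cutoff = %llu (1.5 * %llu)\n", (max + 2 * max) / 2, max);
        return 0;
    }

A lane is only helped when its averaged age exceeds this cutoff, i.e. when it is noticeably staler than the worst of the processor's own shards.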
libcfa/src/concurrency/thread.cfa (rbc4a433 → r089d30c)

@@ -25,4 +25,6 @@
 #include "invoke.h"
 
+uint64_t thread_rand();
+
 //-----------------------------------------------------------------------------
 // Thread ctors and dtors
@@ -41,5 +43,5 @@
 	link.next = 0p;
 	link.ts = -1llu;
-	preferred = -1u;
+	preferred = thread_rand() % cl.ready_queue.lanes.count;
 	last_proc = 0p;
 	#if defined( __CFA_WITH_VERIFY__ )
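With this change a thread no longer starts with no preferred lane (`-1u`); it draws a random lane index at construction, spreading newly created threads across the ready-queue shards instead of funneling them through one default path. A rough C sketch of the effect, using `rand()` as a stand-in for `thread_rand()` and a made-up lane count:

    /* Sketch: random initial lane assignment spreads threads uniformly. */
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(void) {
        const unsigned lane_count = 8;  /* stand-in for cl.ready_queue.lanes.count */
        unsigned hist[8] = {0};
        for (int i = 0; i < 10000; i++) {
            unsigned preferred = (unsigned)((uint64_t)rand() % lane_count);
            hist[preferred]++;
        }
        for (unsigned l = 0; l < lane_count; l++)  /* roughly uniform spread */
            printf("lane %u: %u threads\n", l, hist[l]);
        return 0;
    }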