Changeset d672350 for libcfa/src/concurrency
- Timestamp: Mar 21, 2022, 1:44:06 PM (4 years ago)
- Branches: ADT, ast-experimental, enum, master, pthread-emulation, qualifiedEnum
- Children: a76202d
- Parents: ef3c383 (diff), dbe2533 (diff)
- Location: libcfa/src/concurrency
- Files: 2 added, 18 edited, 1 moved

Note: this is a merge changeset; the changes displayed below correspond to the merge itself. Use the (diff) links above to see all the changes relative to each parent.
Legend: in the diffs below, removed lines are prefixed with "-", added lines with "+", and unchanged context lines are unmarked.
libcfa/src/concurrency/coroutine.cfa
(ef3c383 → d672350)
     #include <unwind.h>

-    #include "kernel_private.hfa"
+    #include "kernel/private.hfa"
     #include "exception.hfa"
     #include "math.hfa"
libcfa/src/concurrency/io.cfa
(ef3c383 → d672350)
     #include "kernel.hfa"
     #include "kernel/fwd.hfa"
-    #include "kernel_private.hfa"
+    #include "kernel/private.hfa"
     #include "io/types.hfa"
...
     extern void __kernel_unpark( thread$ * thrd, unpark_hint );

-    bool __cfa_io_drain( processor * proc ) {
+    bool __cfa_io_drain( $io_context * ctx ) {
         /* paranoid */ verify( ! __preemption_enabled() );
         /* paranoid */ verify( ready_schedule_islocked() );
-        /* paranoid */ verify( proc );
-        /* paranoid */ verify( proc->io.ctx );
+        /* paranoid */ verify( ctx );

         // Drain the queue
-        $io_context * ctx = proc->io.ctx;
         unsigned head = *ctx->cq.head;
         unsigned tail = *ctx->cq.tail;
...
         if(count == 0) return false;

+        if(!__atomic_try_acquire(&ctx->cq.lock)) {
+            return false;
+        }
+
         for(i; count) {
             unsigned idx = (head + i) & mask;
...
         /* paranoid */ verify( ready_schedule_islocked() );
         /* paranoid */ verify( ! __preemption_enabled() );
+
+        __atomic_unlock(&ctx->cq.lock);

         return true;
...
         /* paranoid */ verify( ! __preemption_enabled() );

-        ctx.proc->io.pending = false;
+        __atomic_store_n(&ctx.proc->io.pending, false, __ATOMIC_RELAXED);
     }

     ready_schedule_lock();
-    bool ret = __cfa_io_drain( proc );
+    bool ret = __cfa_io_drain( &ctx );
     ready_schedule_unlock();
     return ret;
...
     //=============================================================================================
     // submission
-    static inline void __submit( struct $io_context * ctx, __u32 idxs[], __u32 have, bool lazy) {
+    static inline void __submit_only( struct $io_context * ctx, __u32 idxs[], __u32 have) {
         // We can proceed to the fast path
         // Get the right objects
...
         sq.to_submit += have;

-        ctx->proc->io.pending = true;
-        ctx->proc->io.dirty   = true;
+        __atomic_store_n(&ctx->proc->io.pending, true, __ATOMIC_RELAXED);
+        __atomic_store_n(&ctx->proc->io.dirty  , true, __ATOMIC_RELAXED);
+    }
+
+    static inline void __submit( struct $io_context * ctx, __u32 idxs[], __u32 have, bool lazy) {
+        __sub_ring_t & sq = ctx->sq;
+        __submit_only(ctx, idxs, have);
+
         if(sq.to_submit > 30) {
             __tls_stats()->io.flush.full++;
...
     // I/O Arbiter
     //=============================================================================================
-    static inline void block(__outstanding_io_queue & queue, __outstanding_io & item) {
+    static inline bool enqueue(__outstanding_io_queue & queue, __outstanding_io & item) {
+        bool was_empty;
+
         // Lock the list, it's not thread safe
         lock( queue.lock __cfaabi_dbg_ctx2 );
         {
+            was_empty = empty(queue.queue);
+
             // Add our request to the list
             add( queue.queue, item );
...
         unlock( queue.lock );

-        wait( item.sem );
+        return was_empty;
     }
...
         pa.want = want;

-        block(this.pending, (__outstanding_io&)pa);
+        enqueue(this.pending, (__outstanding_io&)pa);
+
+        wait( pa.sem );

         return pa.ctx;
...
         ei.lazy = lazy;

-        block(ctx->ext_sq, (__outstanding_io&)ei);
+        bool we = enqueue(ctx->ext_sq, (__outstanding_io&)ei);
+
+        __atomic_store_n(&ctx->proc->io.pending, true, __ATOMIC_SEQ_CST);
+
+        if( we ) {
+            sigval_t value = { PREEMPT_IO };
+            pthread_sigqueue(ctx->proc->kernel_thread, SIGUSR1, value);
+        }
+
+        wait( ei.sem );

         __cfadbg_print_safe(io, "Kernel I/O : %u submitted from arbiter\n", have);
...
         __external_io & ei = (__external_io&)drop( ctx.ext_sq.queue );

-        __submit(&ctx, ei.idxs, ei.have, ei.lazy);
+        __submit_only(&ctx, ei.idxs, ei.have);

         post( ei.sem );
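The most interesting part of this hunk is that __cfa_io_drain now takes the $io_context directly and guards the completion ring with a try-acquire lock, so a racing drainer gives up instead of blocking while the scheduler lock is held. Below is a minimal sketch of that shape in plain C11; cq_sketch and drain_sketch are hypothetical names, not part of libcfa.

#include <stdatomic.h>
#include <stdbool.h>

struct cq_sketch {
    atomic_flag lock;              // must start cleared (ATOMIC_FLAG_INIT)
    _Atomic unsigned head, tail;   // ring indices, tail advanced by the producer
};

static bool drain_sketch(struct cq_sketch * cq) {
    // Cheap emptiness check first: nothing to drain, nothing to lock.
    if (atomic_load_explicit(&cq->head, memory_order_relaxed) ==
        atomic_load_explicit(&cq->tail, memory_order_acquire)) return false;

    // Someone else is already draining this ring: let them finish.
    if (atomic_flag_test_and_set_explicit(&cq->lock, memory_order_acquire))
        return false;

    // Re-read under the lock, a previous drainer may have advanced head.
    unsigned head = atomic_load_explicit(&cq->head, memory_order_relaxed);
    unsigned tail = atomic_load_explicit(&cq->tail, memory_order_acquire);

    // ... process completion entries in [head, tail) here ...

    atomic_store_explicit(&cq->head, tail, memory_order_release);
    atomic_flag_clear_explicit(&cq->lock, memory_order_release);
    return head != tail;
}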
libcfa/src/concurrency/io/setup.cfa
(ef3c383 → d672350)

     #else
+    #pragma GCC diagnostic push
+    #pragma GCC diagnostic ignored "-Waddress-of-packed-member"
     #include <errno.h>
     #include <stdint.h>
...

     #include "bitmanip.hfa"
-    #include "kernel_private.hfa"
+    #include "fstream.hfa"
+    #include "kernel/private.hfa"
     #include "thread.hfa"
+    #pragma GCC diagnostic pop

     void ?{}(io_context_params & this) {
...
         this.ext_sq.empty = true;
         (this.ext_sq.queue){};
-        __io_uring_setup( this, cl.io.params, proc->idle_fd );
+        __io_uring_setup( this, cl.io.params, proc->idle_wctx.evfd );
         __cfadbg_print_safe(io_core, "Kernel I/O : Created ring for io_context %u (%p)\n", this.fd, &this);
     }
...
         __cfadbg_print_safe(io_core, "Kernel I/O : Destroyed ring for io_context %u\n", this.fd);
     }
-
-    extern void __disable_interrupts_hard();
-    extern void __enable_interrupts_hard();

     static void __io_uring_setup( $io_context & this, const io_context_params & params_in, int procfd ) {
...

         // completion queue
+        cq.lock = 0;
         cq.head = (volatile __u32 *)(((intptr_t)cq.ring_ptr) + params.cq_off.head);
         cq.tail = (volatile __u32 *)(((intptr_t)cq.ring_ptr) + params.cq_off.tail);
...
         __cfadbg_print_safe(io_core, "Kernel I/O : registering %d for completion with ring %d\n", procfd, fd);

-        __disable_interrupts_hard();
-
         int ret = syscall( __NR_io_uring_register, fd, IORING_REGISTER_EVENTFD, &procfd, 1);
         if (ret < 0) {
             abort("KERNEL ERROR: IO_URING EVENTFD REGISTER - %s\n", strerror(errno));
         }
-
-        __enable_interrupts_hard();

         __cfadbg_print_safe(io_core, "Kernel I/O : registered %d for completion with ring %d\n", procfd, fd);
...
         struct __sub_ring_t & sq = this.sq;
         struct __cmp_ring_t & cq = this.cq;
+        {
+            __u32 fhead = sq.free_ring.head;
+            __u32 ftail = sq.free_ring.tail;
+
+            __u32 total = *sq.num;
+            __u32 avail = ftail - fhead;
+
+            if(avail != total) abort | "Processor (" | (void*)this.proc | ") tearing down ring with" | (total - avail) | "entries allocated but not submitted, out of" | total;
+        }

         // unmap the submit queue entries
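For context, the eventfd the processor sleeps on is handed to the ring through IORING_REGISTER_EVENTFD, exactly as __io_uring_setup does above (now without the hard interrupt disable around the syscall). A stand-alone sketch of that registration using the raw syscall; register_ring_eventfd is a hypothetical helper, not a libcfa function. Once registered, the kernel bumps the eventfd whenever completions are posted, which is what lets an idle processor sleep on it.

#define _GNU_SOURCE
#include <sys/eventfd.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

static int register_ring_eventfd(int ring_fd) {
    int efd = eventfd(0, 0);
    if (efd < 0) return -1;
    if (syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_EVENTFD, &efd, 1) < 0)
        return -1;
    return efd;   // becomes readable whenever new completions arrive on ring_fd
}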
libcfa/src/concurrency/io/types.hfa
(ef3c383 → d672350)
     #include "bits/locks.hfa"
     #include "bits/queue.hfa"
+    #include "iofwd.hfa"
     #include "kernel/fwd.hfa"
...

     struct __cmp_ring_t {
+        volatile bool lock;
+
         // Head and tail of the ring
         volatile __u32 * head;
...
     // void __ioctx_prepare_block($io_context & ctx);
     #endif
-
-    //-----------------------------------------------------------------------
-    // IO user data
-    struct io_future_t {
-        future_t self;
-        __s32 result;
-    };
-
-    static inline {
-        thread$ * fulfil( io_future_t & this, __s32 result, bool do_unpark = true ) {
-            this.result = result;
-            return fulfil(this.self, do_unpark);
-        }
-
-        // Wait for the future to be fulfilled
-        bool wait     ( io_future_t & this ) { return wait     (this.self); }
-        void reset    ( io_future_t & this ) { return reset    (this.self); }
-        bool available( io_future_t & this ) { return available(this.self); }
-    }
libcfa/src/concurrency/iofwd.hfa
ref3c383 rd672350 19 19 extern "C" { 20 20 #include <asm/types.h> 21 #include <sys/stat.h> // needed for mode_t 21 22 #if CFA_HAVE_LINUX_IO_URING_H 22 23 #include <linux/io_uring.h> … … 24 25 } 25 26 #include "bits/defs.hfa" 27 #include "kernel/fwd.hfa" 26 28 #include "time.hfa" 27 29 … … 47 49 48 50 struct cluster; 49 struct io_future_t;50 51 struct $io_context; 51 52 … … 57 58 58 59 struct io_uring_sqe; 60 61 //----------------------------------------------------------------------- 62 // IO user data 63 struct io_future_t { 64 future_t self; 65 __s32 result; 66 }; 67 68 static inline { 69 thread$ * fulfil( io_future_t & this, __s32 result, bool do_unpark = true ) { 70 this.result = result; 71 return fulfil(this.self, do_unpark); 72 } 73 74 // Wait for the future to be fulfilled 75 bool wait ( io_future_t & this ) { return wait (this.self); } 76 void reset ( io_future_t & this ) { return reset (this.self); } 77 bool available( io_future_t & this ) { return available(this.self); } 78 } 59 79 60 80 //---------- … … 133 153 // Check if a function is blocks a only the user thread 134 154 bool has_user_level_blocking( fptr_t func ); 155 156 #if CFA_HAVE_LINUX_IO_URING_H 157 static inline void zero_sqe(struct io_uring_sqe * sqe) { 158 sqe->flags = 0; 159 sqe->ioprio = 0; 160 sqe->fd = 0; 161 sqe->off = 0; 162 sqe->addr = 0; 163 sqe->len = 0; 164 sqe->fsync_flags = 0; 165 sqe->__pad2[0] = 0; 166 sqe->__pad2[1] = 0; 167 sqe->__pad2[2] = 0; 168 sqe->fd = 0; 169 sqe->off = 0; 170 sqe->addr = 0; 171 sqe->len = 0; 172 } 173 #endif -
libcfa/src/concurrency/kernel.cfa
(ef3c383 → d672350)
     // #define __CFA_DEBUG_PRINT_RUNTIME_CORE__

+    #pragma GCC diagnostic push
+    #pragma GCC diagnostic ignored "-Waddress-of-packed-member"
+
     //C Includes
     #include <errno.h>
...
     #include <signal.h>
     #include <unistd.h>
+
     extern "C" {
         #include <sys/eventfd.h>
...

     //CFA Includes
-    #include "kernel_private.hfa"
+    #include "kernel/private.hfa"
     #include "preemption.hfa"
     #include "strstream.hfa"
...
     #define __CFA_INVOKE_PRIVATE__
     #include "invoke.h"
+    #pragma GCC diagnostic pop

     #if !defined(__CFA_NO_STATISTICS__)
...
     static void mark_awake(__cluster_proc_list & idles, processor & proc);

-    extern void __cfa_io_start( processor * );
-    extern bool __cfa_io_drain( processor * );
+    extern bool __cfa_io_drain( $io_context * );
     extern bool __cfa_io_flush( processor *, int min_comp );
-    extern void __cfa_io_stop ( processor * );
     static inline bool __maybe_io_drain( processor * );
...
     verify(this);

-    io_future_t future; // used for idle sleep when io_uring is present
-    future.self.ptr = 1p; // mark it as already fulfilled so we know if there is a pending request or not
-    eventfd_t idle_val;
-    iovec idle_iovec = { &idle_val, sizeof(idle_val) };
-
-    __cfa_io_start( this );
+    /* paranoid */ verify( this->idle_wctx.ftr != 0p );
+    /* paranoid */ verify( this->idle_wctx.rdbuf != 0p );
+
+    // used for idle sleep when io_uring is present
+    // mark it as already fulfilled so we know if there is a pending request or not
+    this->idle_wctx.ftr->self.ptr = 1p;
+    iovec idle_iovec = { this->idle_wctx.rdbuf, sizeof(eventfd_t) };

     __cfadbg_print_safe(runtime_core, "Kernel : core %p starting\n", this);
...
     }

-    idle_sleep( this, future, idle_iovec );
+    idle_sleep( this, *this->idle_wctx.ftr, idle_iovec );

     // We were woken up, remove self from idle
...
     if( __atomic_load_n(&this->do_terminate, __ATOMIC_SEQ_CST) ) break MAIN_LOOP;

-    if( this->io.pending && !this->io.dirty) {
+    if(__atomic_load_n(&this->io.pending, __ATOMIC_RELAXED) && !__atomic_load_n(&this->io.dirty, __ATOMIC_RELAXED)) {
         __IO_STATS__(true, io.flush.dirty++; )
         __cfa_io_flush( this, 0 );
...
         __cfadbg_print_safe(runtime_core, "Kernel : core %p stopping\n", this);
     }
-
-    for(int i = 0; !available(future); i++) {
-        if(i > 1000) __cfaabi_dbg_write( "ERROR: kernel has bin spinning on a flush after exit loop.\n", 60);
-        __cfa_io_flush( this, 1 );
-    }
-
-    __cfa_io_stop( this );

     post( this->terminated );
...

     int fd = 1;
-    if( __atomic_load_n(&fdp->fd, __ATOMIC_SEQ_CST) != 1 ) {
-        fd = __atomic_exchange_n(&fdp->fd, 1, __ATOMIC_RELAXED);
+    if( __atomic_load_n(&fdp->sem, __ATOMIC_SEQ_CST) != 1 ) {
+        fd = __atomic_exchange_n(&fdp->sem, 1, __ATOMIC_RELAXED);
     }

...
     __cfadbg_print_safe(runtime_core, "Kernel : waking Processor %p\n", this);

-    this->idle_wctx.fd = 1;
+    this->idle_wctx.sem = 1;

     eventfd_t val;
     val = 1;
-    eventfd_write( this->idle_fd, val );
+    eventfd_write( this->idle_wctx.evfd, val );

     /* paranoid */ verify( ! __preemption_enabled() );
...
     // Tell everyone we are ready to go do sleep
     for() {
-        int expected = this->idle_wctx.fd;
+        int expected = this->idle_wctx.sem;

         // Someone already told us to wake-up! No time for a nap.
...

         // Try to mark that we are going to sleep
-        if(__atomic_compare_exchange_n(&this->idle_wctx.fd, &expected, this->idle_fd, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST) ) {
+        if(__atomic_compare_exchange_n(&this->idle_wctx.sem, &expected, this->idle_wctx.evfd, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST) ) {
             // Every one agreed, taking a nap
             break;
...
     {
         eventfd_t val;
-        ssize_t ret = read( this->idle_fd, &val, sizeof(val) );
+        ssize_t ret = read( this->idle_wctx.evfd, &val, sizeof(val) );
         if(ret < 0) {
             switch((int)errno) {
...
         reset(future);

-        __kernel_read(this, future, iov, this->idle_fd );
+        __kernel_read(this, future, iov, this->idle_wctx.evfd );
     }
...
     __STATS__(true, ready.sleep.halts++; )

-    proc.idle_wctx.fd = 0;
+    proc.idle_wctx.sem = 0;

     /* paranoid */ verify( ! __preemption_enabled() );
...
         if(head == tail) return false;
         ready_schedule_lock();
-        ret = __cfa_io_drain( proc );
+        ret = __cfa_io_drain( ctx );
         ready_schedule_unlock();
     #endif
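The renamed idle_wctx fields implement a small wake-up protocol: sem is 0 by default, a waker sets it to 1, and a processor that commits to sleeping CASes its eventfd into it so a later waker knows a write is required. A much-reduced sketch of that protocol in plain C11 follows; wait_ctx, go_to_sleep and wake_one are hypothetical names, the real __wake_proc writes the eventfd unconditionally, and the real runtime can also read the eventfd through io_uring.

#include <stdatomic.h>
#include <sys/eventfd.h>

struct wait_ctx { _Atomic int sem; int evfd; };   // evfd >= 2, fds 0/1 are used as sentinels

static void go_to_sleep(struct wait_ctx * w) {
    for (;;) {
        int expected = atomic_load(&w->sem);
        if (expected == 1) { atomic_store(&w->sem, 0); return; }  // told to wake, skip the nap
        // Publish "I am going to sleep, wake me through evfd".
        if (atomic_compare_exchange_strong(&w->sem, &expected, w->evfd)) break;
    }
    eventfd_t val;
    eventfd_read(w->evfd, &val);      // blocks until some waker writes the eventfd
    atomic_store(&w->sem, 0);         // caller re-checks its queues after waking
}

static void wake_one(struct wait_ctx * w) {
    int prev = atomic_exchange(&w->sem, 1);
    if (prev > 1) eventfd_write(w->evfd, 1);   // it had committed to sleep, kick the eventfd
}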
libcfa/src/concurrency/kernel.hfa
(ef3c383 → d672350)
     extern struct cluster * mainCluster;

-    // Processor id, required for scheduling threads
-
     // Coroutine used py processors for the 2-step context switch
     coroutine processorCtx_t {
         struct processor * proc;
     };

     struct io_future_t;
+
+    // Information needed for idle sleep
     struct __fd_waitctx {
-        volatile int fd;
+        // semaphore/future like object
+        // values can be 0, 1 or some file descriptor.
+        // 0 - is the default state
+        // 1 - means the proc should wake-up immediately
+        // FD - means the proc is going asleep and should be woken by writing to the FD.
+        volatile int sem;
+
+        // The event FD that corresponds to this processor
+        int evfd;
+
+        // buffer into which the proc will read from evfd
+        // unused if not using io_uring for idle sleep
+        void * rdbuf;
+
+        // future use to track the read of the eventfd
+        // unused if not using io_uring for idle sleep
+        io_future_t * ftr;
     };

...
     struct {
         $io_context * ctx;
-        bool pending;
-        bool dirty;
+        unsigned id;
+        unsigned target;
+        volatile bool pending;
+        volatile bool dirty;
     } io;

...
     bool pending_preemption;

-    // Idle lock (kernel semaphore)
-    int idle_fd;
-
-    // Idle waitctx
+    // context for idle sleep
     struct __fd_waitctx idle_wctx;

...
     void ^?{}(__intrusive_lane_t & this);

-    // Aligned timestamps which are used by the relaxed ready queue
+    // Aligned timestamps which are used by the ready queue and io subsystem
     struct __attribute__((aligned(128))) __timestamp_t {
         volatile unsigned long long tv;
...
     };

+    static inline void  ?{}(__timestamp_t & this) { this.tv = 0; this.ma = 0; }
+    static inline void ^?{}(__timestamp_t &) {}
+
+
     struct __attribute__((aligned(16))) __cache_id_t {
         volatile unsigned id;
     };
-
-    // Aligned timestamps which are used by the relaxed ready queue
-    struct __attribute__((aligned(128))) __help_cnts_t {
-        volatile unsigned long long src;
-        volatile unsigned long long dst;
-        volatile unsigned long long tri;
-    };
-
-    static inline void  ?{}(__timestamp_t & this) { this.tv = 0; this.ma = 0; }
-    static inline void ^?{}(__timestamp_t &) {}
-
-    struct __attribute__((aligned(128))) __ready_queue_caches_t;
-    void  ?{}(__ready_queue_caches_t & this);
-    void ^?{}(__ready_queue_caches_t & this);
-
-    //TODO adjust cache size to ARCHITECTURE
-    // Structure holding the ready queue
-    struct __ready_queue_t {
-        // Data tracking the actual lanes
-        // On a seperate cacheline from the used struct since
-        // used can change on each push/pop but this data
-        // only changes on shrink/grow
-        struct {
-            // Arary of lanes
-            __intrusive_lane_t * volatile data;
-
-            // Array of times
-            __timestamp_t * volatile tscs;
-
-            __cache_id_t * volatile caches;
-
-            // Array of stats
-            __help_cnts_t * volatile help;
-
-            // Number of lanes (empty or not)
-            volatile size_t count;
-        } lanes;
-    };
-
-    void  ?{}(__ready_queue_t & this);
-    void ^?{}(__ready_queue_t & this);
-    #if !defined(__CFA_NO_STATISTICS__)
-        unsigned cnt(const __ready_queue_t & this, unsigned idx);
-    #endif

     // Idle Sleep
...
     // Cluster
     struct __attribute__((aligned(128))) cluster {
-        // Ready queue for threads
-        __ready_queue_t ready_queue;
+        struct {
+            struct {
+                // Arary of subqueues
+                __intrusive_lane_t * data;
+
+                // Time since subqueues were processed
+                __timestamp_t * tscs;
+
+                // Number of subqueue / timestamps
+                size_t count;
+            } readyQ;
+
+            struct {
+                // Array of $io_
+                $io_context ** data;
+
+                // Time since subqueues were processed
+                __timestamp_t * tscs;
+
+                // Number of I/O subqueues
+                size_t count;
+            } io;
+
+            // Cache each kernel thread belongs to
+            __cache_id_t * caches;
+        } sched;
+
+        // // Ready queue for threads
+        // __ready_queue_t ready_queue;

         // Name of the cluster
libcfa/src/concurrency/kernel/fwd.hfa
(ef3c383 → d672350)
     struct oneshot * want = expected == 0p ? 1p : 2p;
     if(__atomic_compare_exchange_n(&this.ptr, &expected, want, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
-        if( expected == 0p ) { /* paranoid */ verify( this.ptr == 1p); return 0p; }
+        if( expected == 0p ) { return 0p; }
         thread$ * ret = post( *expected, do_unpark );
         __atomic_store_n( &this.ptr, 1p, __ATOMIC_SEQ_CST);
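For context, a much-reduced C11 sketch of the oneshot idea behind the hunk above: one atomic word that is either empty, fulfilled, or a pointer to the single waiter. oneshot_sketch and struct waiter are hypothetical; the real future_t has an extra state (2p) and parks/unparks actual threads.

#include <stdatomic.h>
#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>

struct waiter;                                     // stand-in for thread$

struct oneshot_sketch { _Atomic uintptr_t ptr; };  // 0 = empty, 1 = fulfilled, else waiter*

// Fulfil the oneshot; returns the waiter to wake, or NULL if nobody was waiting.
static struct waiter * post_sketch(struct oneshot_sketch * o) {
    uintptr_t prev = atomic_exchange(&o->ptr, (uintptr_t)1);
    if (prev == 0 || prev == 1) return NULL;
    return (struct waiter *)prev;
}

// Waiter side: publish ourselves unless the result already arrived.
// Returns false if the oneshot was already fulfilled, so the caller need not block.
static bool register_waiter_sketch(struct oneshot_sketch * o, struct waiter * self) {
    uintptr_t expected = 0;
    return atomic_compare_exchange_strong(&o->ptr, &expected, (uintptr_t)self);
}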
libcfa/src/concurrency/kernel/private.hfa
(ef3c383 → d672350)
     // file "LICENCE" distributed with Cforall.
     //
-    // kernel_private.hfa --
+    // kernel/private.hfa --
     //
     // Author : Thierry Delisle
...

     #if !defined(__cforall_thread__)
-    #error kernel_private.hfa should only be included in libcfathread source
+    #error kernel/private.hfa should only be included in libcfathread source
     #endif

...
     #else
         #ifndef _GNU_SOURCE
-        #error kernel_private requires gnu_source
+        #error kernel/private requires gnu_source
         #endif
         #include <sched.h>
...

     extern bool __preemption_enabled();
+
+    enum {
+        PREEMPT_NORMAL    = 0,
+        PREEMPT_TERMINATE = 1,
+        PREEMPT_IO        = 2,
+    };

     static inline void __disable_interrupts_checked() {
...
     void ready_queue_shrink(struct cluster * cltr);

+    //-----------------------------------------------------------------------
+    // Decrease the width of the ready queue (number of lanes) by 4
+    void ready_queue_close(struct cluster * cltr);

     // Local Variables: //
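The PREEMPT_* codes move here so both the preemption machinery and the I/O arbiter can tag the SIGUSR1 they send. A sketch of how such a reason code travels with the signal: the sender attaches it with pthread_sigqueue and the SA_SIGINFO handler reads it back from si_value. install_handler and notify_io are hypothetical names; libcfa's real handler does considerably more.

#define _GNU_SOURCE
#include <signal.h>
#include <pthread.h>

enum { PREEMPT_NORMAL = 0, PREEMPT_TERMINATE = 1, PREEMPT_IO = 2 };

static void handler(int sig, siginfo_t * info, void * uctx) {
    (void)sig; (void)uctx;
    switch (info->si_value.sival_int) {
    case PREEMPT_NORMAL:    /* regular time-slice preemption */         break;
    case PREEMPT_IO:        /* stop spinning, external io was queued */ break;
    case PREEMPT_TERMINATE: /* the processor is shutting down */        break;
    }
}

static void install_handler(void) {
    struct sigaction act = { 0 };
    act.sa_sigaction = handler;
    act.sa_flags     = SA_SIGINFO;
    sigemptyset(&act.sa_mask);
    sigaction(SIGUSR1, &act, NULL);
}

static void notify_io(pthread_t kernel_thread) {
    union sigval v = { .sival_int = PREEMPT_IO };
    pthread_sigqueue(kernel_thread, SIGUSR1, v);
}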
libcfa/src/concurrency/kernel/startup.cfa
(ef3c383 → d672350)

     // C Includes
-    #include <errno.h>
+    #include <errno.h>              // errno
     #include <signal.h>
-    #include <string.h>
-    #include <unistd.h>
+    #include <string.h>             // strerror
+    #include <unistd.h>             // sysconf

     extern "C" {
-        #include <limits.h>
-        #include <unistd.h>
-        #include <sys/eventfd.h>
-        #include <sys/mman.h>
-        #include <sys/resource.h>
+        #include <limits.h>         // PTHREAD_STACK_MIN
+        #include <unistd.h>         // syscall
+        #include <sys/eventfd.h>    // eventfd
+        #include <sys/mman.h>       // mprotect
+        #include <sys/resource.h>   // getrlimit
     }

     // CFA Includes
-    #include "kernel_private.hfa"
-    #include "startup.hfa"          // STARTUP_PRIORITY_XXX
+    #include "kernel/private.hfa"
+    #include "iofwd.hfa"
+    #include "startup.hfa"          // STARTUP_PRIORITY_XXX
     #include "limits.hfa"
     #include "math.hfa"
...
     extern void __kernel_alarm_startup(void);
     extern void __kernel_alarm_shutdown(void);
+    extern void __cfa_io_start( processor * );
+    extern void __cfa_io_stop ( processor * );

     //-----------------------------------------------------------------------------
...
     KERNEL_STORAGE(__stack_t,            mainThreadCtx);
     KERNEL_STORAGE(__scheduler_RWLock_t, __scheduler_lock);
+    KERNEL_STORAGE(eventfd_t,            mainIdleEventFd);
+    KERNEL_STORAGE(io_future_t,          mainIdleFuture);
     #if !defined(__CFA_NO_STATISTICS__)
         KERNEL_STORAGE(__stats_t, mainProcStats);
...
     (*mainProcessor){};

+    mainProcessor->idle_wctx.rdbuf = &storage_mainIdleEventFd;
+    mainProcessor->idle_wctx.ftr   = (io_future_t*)&storage_mainIdleFuture;
+    /* paranoid */ verify( sizeof(storage_mainIdleEventFd) == sizeof(eventfd_t) );
+
     register_tls( mainProcessor );
+    __cfa_io_start( mainProcessor );

     // Start by initializing the main thread
...
     mainProcessor->local_data = 0p;

+    __cfa_io_stop( mainProcessor );
     unregister_tls( mainProcessor );

...
     register_tls( proc );

+    __cfa_io_start( proc );
+
+    // used for idle sleep when io_uring is present
+    io_future_t future;
+    eventfd_t idle_buf;
+    proc->idle_wctx.ftr   = &future;
+    proc->idle_wctx.rdbuf = &idle_buf;
+
+
     // SKULLDUGGERY: We want to create a context for the processor coroutine
     // which is needed for the 2-step context switch. However, there is no reason
...
     // Main routine of the core returned, the core is now fully terminated
     __cfadbg_print_safe(runtime_core, "Kernel : core %p main ended (%p)\n", proc, &proc->runner);
+
+    __cfa_io_stop( proc );

     #if !defined(__CFA_NO_STATISTICS__)
...
     this.rdq.its = 0;
     this.rdq.itr = 0;
-    this.rdq.id  = MAX;
+    this.rdq.id  = 0;
     this.rdq.target = MAX;
     this.rdq.last = MAX;
...
     this.local_data = 0p;

-    this.idle_fd = eventfd(0, 0);
-    if (idle_fd < 0) {
+    idle_wctx.evfd = eventfd(0, 0);
+    if (idle_wctx.evfd < 0) {
         abort("KERNEL ERROR: PROCESSOR EVENTFD - %s\n", strerror(errno));
     }

-    this.idle_wctx.fd = 0;
+    idle_wctx.sem = 0;

     // I'm assuming these two are reserved for standard input and output
     // so I'm using them as sentinels with idle_wctx.
-    /* paranoid */ verify( this.idle_fd != 0 );
-    /* paranoid */ verify( this.idle_fd != 1 );
+    /* paranoid */ verify( idle_wctx.evfd != 0 );
+    /* paranoid */ verify( idle_wctx.evfd != 1 );

     #if !defined(__CFA_NO_STATISTICS__)
...
     // Not a ctor, it just preps the destruction but should not destroy members
     static void deinit(processor & this) {
-        close(this.idle_fd);
+        close(this.idle_wctx.evfd);
     }

...
     this.name = name;
     this.preemption_rate = preemption_rate;
-    ready_queue{};
+    this.sched.readyQ.data = 0p;
+    this.sched.readyQ.tscs = 0p;
+    this.sched.readyQ.count = 0;
+    this.sched.io.tscs = 0p;
+    this.sched.caches = 0p;

     #if !defined(__CFA_NO_STATISTICS__)
...
     // Unlock the RWlock
     ready_mutate_unlock( last_size );
+
+    ready_queue_close( &this );
+    /* paranoid */ verify( this.sched.readyQ.data == 0p );
+    /* paranoid */ verify( this.sched.readyQ.tscs == 0p );
+    /* paranoid */ verify( this.sched.readyQ.count == 0 );
+    /* paranoid */ verify( this.sched.io.tscs == 0p );
+    /* paranoid */ verify( this.sched.caches == 0p );
+
     enable_interrupts( false ); // Don't poll, could be in main cluster
+

     #if !defined(__CFA_NO_STATISTICS__)
...
     check( pthread_attr_init( &attr ), "pthread_attr_init" ); // initialize attribute

-    size_t stacksize = DEFAULT_STACK_SIZE;
+    size_t stacksize = max( PTHREAD_STACK_MIN, DEFAULT_STACK_SIZE );

     void * stack;
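The last hunk above clamps the pthread stack request to at least PTHREAD_STACK_MIN. A small sketch of the same fix in plain C; DEFAULT_STACK_SIZE here is an assumed illustration value, not the libcfa constant, and init_processor_attr is a hypothetical helper.

#include <limits.h>
#include <pthread.h>

#define DEFAULT_STACK_SIZE (64 * 1024)   // assumption for the sketch only

static int init_processor_attr(pthread_attr_t * attr) {
    if (pthread_attr_init(attr)) return -1;
    size_t stacksize = DEFAULT_STACK_SIZE;
    if (stacksize < PTHREAD_STACK_MIN) stacksize = PTHREAD_STACK_MIN;
    return pthread_attr_setstacksize(attr, stacksize);
}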
libcfa/src/concurrency/locks.cfa
(ef3c383 → d672350)

     #include "locks.hfa"
-    #include "kernel_private.hfa"
+    #include "kernel/private.hfa"

     #include <kernel.hfa>
libcfa/src/concurrency/locks.hfa
(ef3c383 → d672350)
     }

-    static inline bool lock(linear_backoff_then_block_lock & this) with(this) {
+    static inline void lock(linear_backoff_then_block_lock & this) with(this) {
         // if owner just return
-        if (active_thread() == owner) return true;
+        if (active_thread() == owner) return;
         size_t compare_val = 0;
         int spin = spin_start;
...
         for( ;; ) {
             compare_val = 0;
-            if (internal_try_lock(this, compare_val)) return true;
+            if (internal_try_lock(this, compare_val)) return;
             if (2 == compare_val) break;
             for (int i = 0; i < spin; i++) Pause();
...
         }

-        if(2 != compare_val && try_lock_contention(this)) return true;
+        if(2 != compare_val && try_lock_contention(this)) return;
         // block until signalled
-        while (block(this)) if(try_lock_contention(this)) return true;
-
-        // this should never be reached as block(this) always returns true
-        return false;
+        while (block(this)) if(try_lock_contention(this)) return;
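The signature change above (bool to void) reflects that lock() always succeeds eventually: spin with a growing budget, then block. A compact C sketch of that overall shape; lbb_sketch is hypothetical and sched_yield() stands in for parking the thread, which is what the real lock does.

#include <stdatomic.h>
#include <sched.h>

struct lbb_sketch { _Atomic int held; };   // 0 = free, 1 = held

static void lbb_lock(struct lbb_sketch * l) {
    int spin = 4;
    const int spin_end = 64 * 1024;
    while (spin < spin_end) {
        int expected = 0;
        if (atomic_compare_exchange_strong(&l->held, &expected, 1)) return;
        for (int i = 0; i < spin; i++)
            __asm__ volatile ("" ::: "memory");    // placeholder for Pause()
        spin += spin;                              // grow the spin budget between attempts
    }
    for (;;) {                                     // past the budget: "block"
        int expected = 0;
        if (atomic_compare_exchange_strong(&l->held, &expected, 1)) return;
        sched_yield();
    }
}

static void lbb_unlock(struct lbb_sketch * l) { atomic_store(&l->held, 0); }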
libcfa/src/concurrency/monitor.cfa
(ef3c383 → d672350)
     #include <inttypes.h>

-    #include "kernel_private.hfa"
+    #include "kernel/private.hfa"

     #include "bits/algorithm.hfa"
libcfa/src/concurrency/mutex.cfa
(ef3c383 → d672350)
     #include "mutex.hfa"

-    #include "kernel_private.hfa"
+    #include "kernel/private.hfa"

     //-----------------------------------------------------------------------------
libcfa/src/concurrency/mutex_stmt.hfa
(ef3c383 → d672350)
     };

+
+    struct __mutex_stmt_lock_guard {
+        void ** lockarr;
+        __lock_size_t count;
+    };
+
+    static inline void ?{}( __mutex_stmt_lock_guard & this, void * lockarr [], __lock_size_t count ) {
+        this.lockarr = lockarr;
+        this.count = count;
+
+        // Sort locks based on address
+        __libcfa_small_sort(this.lockarr, count);
+
+        // acquire locks in order
+        // for ( size_t i = 0; i < count; i++ ) {
+        //     lock(*this.lockarr[i]);
+        // }
+    }
+
+    static inline void ^?{}( __mutex_stmt_lock_guard & this ) with(this) {
+        // for ( size_t i = count; i > 0; i-- ) {
+        //     unlock(*lockarr[i - 1]);
+        // }
+    }
+
     forall(L & | is_lock(L)) {
-
-        struct __mutex_stmt_lock_guard {
-            L ** lockarr;
-            __lock_size_t count;
-        };
-
-        static inline void ?{}( __mutex_stmt_lock_guard(L) & this, L * lockarr [], __lock_size_t count ) {
-            this.lockarr = lockarr;
-            this.count = count;
-
-            // Sort locks based on address
-            __libcfa_small_sort(this.lockarr, count);
-
-            // acquire locks in order
-            for ( size_t i = 0; i < count; i++ ) {
-                lock(*this.lockarr[i]);
-            }
-        }
-
-        static inline void ^?{}( __mutex_stmt_lock_guard(L) & this ) with(this) {
-            for ( size_t i = count; i > 0; i-- ) {
-                unlock(*lockarr[i - 1]);
-            }
-        }

         struct scoped_lock {
...
         }

-        static inline L * __get_ptr( L & this ) {
+        static inline void * __get_mutexstmt_lock_ptr( L & this ) {
             return &this;
         }

-        static inline L __get_type( L & this );
+        static inline L __get_mutexstmt_lock_type( L & this );

-        static inline L __get_type( L * this );
+        static inline L __get_mutexstmt_lock_type( L * this );
     }
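The guard above keeps the sort of the locks by address even though the acquire loop is now commented out; taking any set of locks in one global order is the classic way to rule out deadlock between overlapping mutex statements. A plain C sketch of that idea with pthread mutexes as the stand-in lock type (lock_all and unlock_all are hypothetical helpers):

#include <pthread.h>
#include <stdint.h>
#include <stdlib.h>

static int by_address(const void * a, const void * b) {
    uintptr_t pa = (uintptr_t)*(pthread_mutex_t * const *)a;
    uintptr_t pb = (uintptr_t)*(pthread_mutex_t * const *)b;
    return (pa > pb) - (pa < pb);
}

static void lock_all(pthread_mutex_t * locks[], size_t count) {
    qsort(locks, count, sizeof(locks[0]), by_address);   // global order: by address
    for (size_t i = 0; i < count; i++) pthread_mutex_lock(locks[i]);
}

static void unlock_all(pthread_mutex_t * locks[], size_t count) {
    for (size_t i = count; i > 0; i--) pthread_mutex_unlock(locks[i - 1]);
}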
libcfa/src/concurrency/preemption.cfa
(ef3c383 → d672350)
     // Created On       : Mon Jun 5 14:20:42 2017
     // Last Modified By : Peter A. Buhr
-    // Last Modified On : Fri Nov  6 07:42:13 2020
-    // Update Count     : 54
+    // Last Modified On : Thu Feb 17 11:18:57 2022
+    // Update Count     : 59
     //

...
     #include "bits/debug.hfa"
     #include "bits/signal.hfa"
-    #include "kernel_private.hfa"
+    #include "kernel/private.hfa"


...
     }

-    enum {
-        PREEMPT_NORMAL    = 0,
-        PREEMPT_TERMINATE = 1,
-    };
-
     //=============================================================================================
     // Kernel Preemption logic
...
     //----------
     // special case for preemption since used often
-    bool __preemption_enabled() {
+    __attribute__((optimize("no-reorder-blocks"))) bool __preemption_enabled() {
         // create a assembler label before
         // marked as clobber all to avoid movement
...
     choose(sfp->si_value.sival_int) {
         case PREEMPT_NORMAL   : ;// Normal case, nothing to do here
+        case PREEMPT_IO       : ;// I/O asked to stop spinning, nothing to do here
         case PREEMPT_TERMINATE: verify( __atomic_load_n( &__cfaabi_tls.this_processor->do_terminate, __ATOMIC_SEQ_CST ) );
         default:
libcfa/src/concurrency/ready_queue.cfa
ref3c383 rd672350 20 20 21 21 22 // #define USE_RELAXED_FIFO23 // #define USE_WORK_STEALING24 // #define USE_CPU_WORK_STEALING25 22 #define USE_AWARE_STEALING 26 23 27 24 #include "bits/defs.hfa" 28 25 #include "device/cpu.hfa" 29 #include "kernel _private.hfa"30 31 #include "stdlib.hfa" 26 #include "kernel/cluster.hfa" 27 #include "kernel/private.hfa" 28 32 29 #include "limits.hfa" 33 #include "math.hfa" 34 35 #include <errno.h> 36 #include <unistd.h> 37 38 extern "C" { 39 #include <sys/syscall.h> // __NR_xxx 40 } 30 31 // #include <errno.h> 32 // #include <unistd.h> 41 33 42 34 #include "ready_subqueue.hfa" … … 50 42 #endif 51 43 52 // No overriden function, no environment variable, no define53 // fall back to a magic number54 #ifndef __CFA_MAX_PROCESSORS__55 #define __CFA_MAX_PROCESSORS__ 102456 #endif57 58 #if defined(USE_AWARE_STEALING)59 #define READYQ_SHARD_FACTOR 260 #define SEQUENTIAL_SHARD 261 #elif defined(USE_CPU_WORK_STEALING)62 #define READYQ_SHARD_FACTOR 263 #elif defined(USE_RELAXED_FIFO)64 #define BIAS 465 #define READYQ_SHARD_FACTOR 466 #define SEQUENTIAL_SHARD 167 #elif defined(USE_WORK_STEALING)68 #define READYQ_SHARD_FACTOR 269 #define SEQUENTIAL_SHARD 270 #else71 #error no scheduling strategy selected72 #endif73 74 44 static inline struct thread$ * try_pop(struct cluster * cltr, unsigned w __STATS(, __stats_readyQ_pop_t & stats)); 75 45 static inline struct thread$ * try_pop(struct cluster * cltr, unsigned i, unsigned j __STATS(, __stats_readyQ_pop_t & stats)); 76 46 static inline struct thread$ * search(struct cluster * cltr); 77 static inline [unsigned, bool] idx_from_r(unsigned r, unsigned preferred);78 79 80 // returns the maximum number of processors the RWLock support81 __attribute__((weak)) unsigned __max_processors() {82 const char * max_cores_s = getenv("CFA_MAX_PROCESSORS");83 if(!max_cores_s) {84 __cfadbg_print_nolock(ready_queue, "No CFA_MAX_PROCESSORS in ENV\n");85 return __CFA_MAX_PROCESSORS__;86 }87 88 char * endptr = 0p;89 long int max_cores_l = strtol(max_cores_s, &endptr, 10);90 if(max_cores_l < 1 || max_cores_l > 65535) {91 __cfadbg_print_nolock(ready_queue, "CFA_MAX_PROCESSORS out of range : %ld\n", max_cores_l);92 return __CFA_MAX_PROCESSORS__;93 }94 if('\0' != *endptr) {95 __cfadbg_print_nolock(ready_queue, "CFA_MAX_PROCESSORS not a decimal number : %s\n", max_cores_s);96 return __CFA_MAX_PROCESSORS__;97 }98 99 return max_cores_l;100 }101 102 #if defined(CFA_HAVE_LINUX_LIBRSEQ)103 // No forward declaration needed104 #define __kernel_rseq_register rseq_register_current_thread105 #define __kernel_rseq_unregister rseq_unregister_current_thread106 #elif defined(CFA_HAVE_LINUX_RSEQ_H)107 static void __kernel_raw_rseq_register (void);108 static void __kernel_raw_rseq_unregister(void);109 110 #define __kernel_rseq_register __kernel_raw_rseq_register111 #define __kernel_rseq_unregister __kernel_raw_rseq_unregister112 #else113 // No forward declaration needed114 // No initialization needed115 static inline void noop(void) {}116 117 #define __kernel_rseq_register noop118 #define __kernel_rseq_unregister noop119 #endif120 121 //=======================================================================122 // Cluster wide reader-writer lock123 //=======================================================================124 void ?{}(__scheduler_RWLock_t & this) {125 this.max = __max_processors();126 this.alloc = 0;127 this.ready = 0;128 this.data = alloc(this.max);129 this.write_lock = false;130 131 /*paranoid*/ verify(__atomic_is_lock_free(sizeof(this.alloc), 
&this.alloc));132 /*paranoid*/ verify(__atomic_is_lock_free(sizeof(this.ready), &this.ready));133 134 }135 void ^?{}(__scheduler_RWLock_t & this) {136 free(this.data);137 }138 139 140 //=======================================================================141 // Lock-Free registering/unregistering of threads142 unsigned register_proc_id( void ) with(*__scheduler_lock) {143 __kernel_rseq_register();144 145 bool * handle = (bool *)&kernelTLS().sched_lock;146 147 // Step - 1 : check if there is already space in the data148 uint_fast32_t s = ready;149 150 // Check among all the ready151 for(uint_fast32_t i = 0; i < s; i++) {152 bool * volatile * cell = (bool * volatile *)&data[i]; // Cforall is bugged and the double volatiles causes problems153 /* paranoid */ verify( handle != *cell );154 155 bool * null = 0p; // Re-write every loop since compare thrashes it156 if( __atomic_load_n(cell, (int)__ATOMIC_RELAXED) == null157 && __atomic_compare_exchange_n( cell, &null, handle, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {158 /* paranoid */ verify(i < ready);159 /* paranoid */ verify( (kernelTLS().sched_id = i, true) );160 return i;161 }162 }163 164 if(max <= alloc) abort("Trying to create more than %ud processors", __scheduler_lock->max);165 166 // Step - 2 : F&A to get a new spot in the array.167 uint_fast32_t n = __atomic_fetch_add(&alloc, 1, __ATOMIC_SEQ_CST);168 if(max <= n) abort("Trying to create more than %ud processors", __scheduler_lock->max);169 170 // Step - 3 : Mark space as used and then publish it.171 data[n] = handle;172 while() {173 unsigned copy = n;174 if( __atomic_load_n(&ready, __ATOMIC_RELAXED) == n175 && __atomic_compare_exchange_n(&ready, ©, n + 1, true, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))176 break;177 Pause();178 }179 180 // Return new spot.181 /* paranoid */ verify(n < ready);182 /* paranoid */ verify( (kernelTLS().sched_id = n, true) );183 return n;184 }185 186 void unregister_proc_id( unsigned id ) with(*__scheduler_lock) {187 /* paranoid */ verify(id < ready);188 /* paranoid */ verify(id == kernelTLS().sched_id);189 /* paranoid */ verify(data[id] == &kernelTLS().sched_lock);190 191 bool * volatile * cell = (bool * volatile *)&data[id]; // Cforall is bugged and the double volatiles causes problems192 193 __atomic_store_n(cell, 0p, __ATOMIC_RELEASE);194 195 __kernel_rseq_unregister();196 }197 198 //-----------------------------------------------------------------------199 // Writer side : acquire when changing the ready queue, e.g. adding more200 // queues or removing them.201 uint_fast32_t ready_mutate_lock( void ) with(*__scheduler_lock) {202 /* paranoid */ verify( ! __preemption_enabled() );203 204 // Step 1 : lock global lock205 // It is needed to avoid processors that register mid Critical-Section206 // to simply lock their own lock and enter.207 __atomic_acquire( &write_lock );208 209 // Make sure we won't deadlock ourself210 // Checking before acquiring the writer lock isn't safe211 // because someone else could have locked us.212 /* paranoid */ verify( ! kernelTLS().sched_lock );213 214 // Step 2 : lock per-proc lock215 // Processors that are currently being registered aren't counted216 // but can't be in read_lock or in the critical section.217 // All other processors are counted218 uint_fast32_t s = ready;219 for(uint_fast32_t i = 0; i < s; i++) {220 volatile bool * llock = data[i];221 if(llock) __atomic_acquire( llock );222 }223 224 /* paranoid */ verify( ! 
__preemption_enabled() );225 return s;226 }227 228 void ready_mutate_unlock( uint_fast32_t last_s ) with(*__scheduler_lock) {229 /* paranoid */ verify( ! __preemption_enabled() );230 231 // Step 1 : release local locks232 // This must be done while the global lock is held to avoid233 // threads that where created mid critical section234 // to race to lock their local locks and have the writer235 // immidiately unlock them236 // Alternative solution : return s in write_lock and pass it to write_unlock237 for(uint_fast32_t i = 0; i < last_s; i++) {238 volatile bool * llock = data[i];239 if(llock) __atomic_store_n(llock, (bool)false, __ATOMIC_RELEASE);240 }241 242 // Step 2 : release global lock243 /*paranoid*/ assert(true == write_lock);244 __atomic_store_n(&write_lock, (bool)false, __ATOMIC_RELEASE);245 246 /* paranoid */ verify( ! __preemption_enabled() );247 }248 249 //=======================================================================250 // caches handling251 252 struct __attribute__((aligned(128))) __ready_queue_caches_t {253 // Count States:254 // - 0 : No one is looking after this cache255 // - 1 : No one is looking after this cache, BUT it's not empty256 // - 2+ : At least one processor is looking after this cache257 volatile unsigned count;258 };259 260 void ?{}(__ready_queue_caches_t & this) { this.count = 0; }261 void ^?{}(__ready_queue_caches_t & this) {}262 263 static inline void depart(__ready_queue_caches_t & cache) {264 /* paranoid */ verify( cache.count > 1);265 __atomic_fetch_add(&cache.count, -1, __ATOMIC_SEQ_CST);266 /* paranoid */ verify( cache.count != 0);267 /* paranoid */ verify( cache.count < 65536 ); // This verify assumes no cluster will have more than 65000 kernel threads mapped to a single cache, which could be correct but is super weird.268 }269 270 static inline void arrive(__ready_queue_caches_t & cache) {271 // for() {272 // unsigned expected = cache.count;273 // unsigned desired = 0 == expected ? 2 : expected + 1;274 // }275 }276 47 277 48 //======================================================================= 278 49 // Cforall Ready Queue used for scheduling 279 50 //======================================================================= 280 unsigned long long moving_average(unsigned long long currtsc, unsigned long long instsc, unsigned long long old_avg) { 281 /* paranoid */ verifyf( currtsc < 45000000000000000, "Suspiciously large current time: %'llu (%llx)\n", currtsc, currtsc ); 282 /* paranoid */ verifyf( instsc < 45000000000000000, "Suspiciously large insert time: %'llu (%llx)\n", instsc, instsc ); 283 /* paranoid */ verifyf( old_avg < 15000000000000, "Suspiciously large previous average: %'llu (%llx)\n", old_avg, old_avg ); 284 285 const unsigned long long new_val = currtsc > instsc ? 
currtsc - instsc : 0; 286 const unsigned long long total_weight = 16; 287 const unsigned long long new_weight = 4; 288 const unsigned long long old_weight = total_weight - new_weight; 289 const unsigned long long ret = ((new_weight * new_val) + (old_weight * old_avg)) / total_weight; 290 return ret; 291 } 292 293 void ?{}(__ready_queue_t & this) with (this) { 294 #if defined(USE_CPU_WORK_STEALING) 295 lanes.count = cpu_info.hthrd_count * READYQ_SHARD_FACTOR; 296 lanes.data = alloc( lanes.count ); 297 lanes.tscs = alloc( lanes.count ); 298 lanes.help = alloc( cpu_info.hthrd_count ); 299 300 for( idx; (size_t)lanes.count ) { 301 (lanes.data[idx]){}; 302 lanes.tscs[idx].tv = rdtscl(); 303 lanes.tscs[idx].ma = rdtscl(); 304 } 305 for( idx; (size_t)cpu_info.hthrd_count ) { 306 lanes.help[idx].src = 0; 307 lanes.help[idx].dst = 0; 308 lanes.help[idx].tri = 0; 309 } 310 #else 311 lanes.data = 0p; 312 lanes.tscs = 0p; 313 lanes.caches = 0p; 314 lanes.help = 0p; 315 lanes.count = 0; 316 #endif 317 } 318 319 void ^?{}(__ready_queue_t & this) with (this) { 320 #if !defined(USE_CPU_WORK_STEALING) 321 verify( SEQUENTIAL_SHARD == lanes.count ); 322 #endif 323 324 free(lanes.data); 325 free(lanes.tscs); 326 free(lanes.caches); 327 free(lanes.help); 328 } 329 330 //----------------------------------------------------------------------- 331 #if defined(USE_AWARE_STEALING) 332 __attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, unpark_hint hint) with (cltr->ready_queue) { 333 processor * const proc = kernelTLS().this_processor; 334 const bool external = (!proc) || (cltr != proc->cltr); 335 const bool remote = hint == UNPARK_REMOTE; 336 337 unsigned i; 338 if( external || remote ) { 339 // Figure out where thread was last time and make sure it's valid 340 /* paranoid */ verify(thrd->preferred >= 0); 341 if(thrd->preferred * READYQ_SHARD_FACTOR < lanes.count) { 342 /* paranoid */ verify(thrd->preferred * READYQ_SHARD_FACTOR < lanes.count); 343 unsigned start = thrd->preferred * READYQ_SHARD_FACTOR; 344 do { 345 unsigned r = __tls_rand(); 346 i = start + (r % READYQ_SHARD_FACTOR); 347 /* paranoid */ verify( i < lanes.count ); 348 // If we can't lock it retry 349 } while( !__atomic_try_acquire( &lanes.data[i].lock ) ); 350 } else { 351 do { 352 i = __tls_rand() % lanes.count; 353 } while( !__atomic_try_acquire( &lanes.data[i].lock ) ); 354 } 51 // void ?{}(__ready_queue_t & this) with (this) { 52 // lanes.data = 0p; 53 // lanes.tscs = 0p; 54 // lanes.caches = 0p; 55 // lanes.count = 0; 56 // } 57 58 // void ^?{}(__ready_queue_t & this) with (this) { 59 // free(lanes.data); 60 // free(lanes.tscs); 61 // free(lanes.caches); 62 // } 63 64 //----------------------------------------------------------------------- 65 __attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, unpark_hint hint) with (cltr->sched) { 66 processor * const proc = kernelTLS().this_processor; 67 const bool external = (!proc) || (cltr != proc->cltr); 68 const bool remote = hint == UNPARK_REMOTE; 69 const size_t lanes_count = readyQ.count; 70 71 /* paranoid */ verify( __shard_factor.readyq > 0 ); 72 /* paranoid */ verify( lanes_count > 0 ); 73 74 unsigned i; 75 if( external || remote ) { 76 // Figure out where thread was last time and make sure it's valid 77 /* paranoid */ verify(thrd->preferred >= 0); 78 unsigned start = thrd->preferred * __shard_factor.readyq; 79 if(start < lanes_count) { 80 do { 81 unsigned r = __tls_rand(); 82 i = start + (r % __shard_factor.readyq); 83 /* paranoid */ verify( i < 
lanes_count ); 84 // If we can't lock it retry 85 } while( !__atomic_try_acquire( &readyQ.data[i].lock ) ); 355 86 } else { 356 87 do { 357 unsigned r = proc->rdq.its++; 358 i = proc->rdq.id + (r % READYQ_SHARD_FACTOR); 359 /* paranoid */ verify( i < lanes.count ); 360 // If we can't lock it retry 361 } while( !__atomic_try_acquire( &lanes.data[i].lock ) ); 362 } 363 364 // Actually push it 365 push(lanes.data[i], thrd); 366 367 // Unlock and return 368 __atomic_unlock( &lanes.data[i].lock ); 369 370 #if !defined(__CFA_NO_STATISTICS__) 371 if(unlikely(external || remote)) __atomic_fetch_add(&cltr->stats->ready.push.extrn.success, 1, __ATOMIC_RELAXED); 372 else __tls_stats()->ready.push.local.success++; 373 #endif 374 } 375 376 static inline unsigned long long calc_cutoff(const unsigned long long ctsc, const processor * proc, __ready_queue_t & rdq) { 377 unsigned start = proc->rdq.id; 378 unsigned long long max = 0; 379 for(i; READYQ_SHARD_FACTOR) { 380 unsigned long long ptsc = ts(rdq.lanes.data[start + i]); 381 if(ptsc != -1ull) { 382 /* paranoid */ verify( start + i < rdq.lanes.count ); 383 unsigned long long tsc = moving_average(ctsc, ptsc, rdq.lanes.tscs[start + i].ma); 384 if(tsc > max) max = tsc; 385 } 386 } 387 return (max + 2 * max) / 2; 388 } 389 390 __attribute__((hot)) struct thread$ * pop_fast(struct cluster * cltr) with (cltr->ready_queue) { 391 /* paranoid */ verify( lanes.count > 0 ); 392 /* paranoid */ verify( kernelTLS().this_processor ); 393 /* paranoid */ verify( kernelTLS().this_processor->rdq.id < lanes.count ); 394 395 processor * const proc = kernelTLS().this_processor; 396 unsigned this = proc->rdq.id; 397 /* paranoid */ verify( this < lanes.count ); 398 __cfadbg_print_safe(ready_queue, "Kernel : pop from %u\n", this); 399 400 // Figure out the current cpu and make sure it is valid 401 const int cpu = __kernel_getcpu(); 402 /* paranoid */ verify(cpu >= 0); 403 /* paranoid */ verify(cpu < cpu_info.hthrd_count); 404 unsigned this_cache = cpu_info.llc_map[cpu].cache; 405 406 // Super important: don't write the same value over and over again 407 // We want to maximise our chances that his particular values stays in cache 408 if(lanes.caches[this / READYQ_SHARD_FACTOR].id != this_cache) 409 __atomic_store_n(&lanes.caches[this / READYQ_SHARD_FACTOR].id, this_cache, __ATOMIC_RELAXED); 410 411 const unsigned long long ctsc = rdtscl(); 412 413 if(proc->rdq.target == MAX) { 414 uint64_t chaos = __tls_rand(); 415 unsigned ext = chaos & 0xff; 416 unsigned other = (chaos >> 8) % (lanes.count); 417 418 if(ext < 3 || __atomic_load_n(&lanes.caches[other / READYQ_SHARD_FACTOR].id, __ATOMIC_RELAXED) == this_cache) { 419 proc->rdq.target = other; 420 } 421 } 422 else { 423 const unsigned target = proc->rdq.target; 424 __cfadbg_print_safe(ready_queue, "Kernel : %u considering helping %u, tcsc %llu\n", this, target, lanes.tscs[target].tv); 425 /* paranoid */ verify( lanes.tscs[target].tv != MAX ); 426 if(target < lanes.count) { 427 const unsigned long long cutoff = calc_cutoff(ctsc, proc, cltr->ready_queue); 428 const unsigned long long age = moving_average(ctsc, lanes.tscs[target].tv, lanes.tscs[target].ma); 429 __cfadbg_print_safe(ready_queue, "Kernel : Help attempt on %u from %u, age %'llu vs cutoff %'llu, %s\n", target, this, age, cutoff, age > cutoff ? 
"yes" : "no"); 430 if(age > cutoff) { 431 thread$ * t = try_pop(cltr, target __STATS(, __tls_stats()->ready.pop.help)); 432 if(t) return t; 433 } 434 } 435 proc->rdq.target = MAX; 436 } 437 438 for(READYQ_SHARD_FACTOR) { 439 unsigned i = this + (proc->rdq.itr++ % READYQ_SHARD_FACTOR); 440 if(thread$ * t = try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.local))) return t; 441 } 442 443 // All lanes where empty return 0p 444 return 0p; 445 446 } 447 __attribute__((hot)) struct thread$ * pop_slow(struct cluster * cltr) with (cltr->ready_queue) { 448 unsigned i = __tls_rand() % lanes.count; 449 return try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.steal)); 450 } 451 __attribute__((hot)) struct thread$ * pop_search(struct cluster * cltr) { 452 return search(cltr); 453 } 454 #endif 455 #if defined(USE_CPU_WORK_STEALING) 456 __attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, unpark_hint hint) with (cltr->ready_queue) { 457 __cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr); 458 459 processor * const proc = kernelTLS().this_processor; 460 const bool external = (!proc) || (cltr != proc->cltr); 461 462 // Figure out the current cpu and make sure it is valid 463 const int cpu = __kernel_getcpu(); 464 /* paranoid */ verify(cpu >= 0); 465 /* paranoid */ verify(cpu < cpu_info.hthrd_count); 466 /* paranoid */ verify(cpu * READYQ_SHARD_FACTOR < lanes.count); 467 468 // Figure out where thread was last time and make sure it's 469 /* paranoid */ verify(thrd->preferred >= 0); 470 /* paranoid */ verify(thrd->preferred < cpu_info.hthrd_count); 471 /* paranoid */ verify(thrd->preferred * READYQ_SHARD_FACTOR < lanes.count); 472 const int prf = thrd->preferred * READYQ_SHARD_FACTOR; 473 474 const cpu_map_entry_t & map; 475 choose(hint) { 476 case UNPARK_LOCAL : &map = &cpu_info.llc_map[cpu]; 477 case UNPARK_REMOTE: &map = &cpu_info.llc_map[prf]; 478 } 479 /* paranoid */ verify(map.start * READYQ_SHARD_FACTOR < lanes.count); 480 /* paranoid */ verify(map.self * READYQ_SHARD_FACTOR < lanes.count); 481 /* paranoid */ verifyf((map.start + map.count) * READYQ_SHARD_FACTOR <= lanes.count, "have %zu lanes but map can go up to %u", lanes.count, (map.start + map.count) * READYQ_SHARD_FACTOR); 482 483 const int start = map.self * READYQ_SHARD_FACTOR; 484 unsigned i; 88 i = __tls_rand() % lanes_count; 89 } while( !__atomic_try_acquire( &readyQ.data[i].lock ) ); 90 } 91 } else { 485 92 do { 486 unsigned r; 487 if(unlikely(external)) { r = __tls_rand(); } 488 else { r = proc->rdq.its++; } 489 choose(hint) { 490 case UNPARK_LOCAL : i = start + (r % READYQ_SHARD_FACTOR); 491 case UNPARK_REMOTE: i = prf + (r % READYQ_SHARD_FACTOR); 492 } 93 unsigned r = proc->rdq.its++; 94 i = proc->rdq.id + (r % __shard_factor.readyq); 95 /* paranoid */ verify( i < lanes_count ); 493 96 // If we can't lock it retry 494 } while( !__atomic_try_acquire( &lanes.data[i].lock ) ); 495 496 // Actually push it 497 push(lanes.data[i], thrd); 498 499 // Unlock and return 500 __atomic_unlock( &lanes.data[i].lock ); 501 502 #if !defined(__CFA_NO_STATISTICS__) 503 if(unlikely(external)) __atomic_fetch_add(&cltr->stats->ready.push.extrn.success, 1, __ATOMIC_RELAXED); 504 else __tls_stats()->ready.push.local.success++; 505 #endif 506 507 __cfadbg_print_safe(ready_queue, "Kernel : Pushed %p on cluster %p (idx: %u, mask %llu, first %d)\n", thrd, cltr, i, used.mask[0], lane_first); 508 509 } 510 511 // Pop from the ready queue from a given cluster 512 __attribute__((hot)) thread$ * pop_fast(struct 
cluster * cltr) with (cltr->ready_queue) { 513 /* paranoid */ verify( lanes.count > 0 ); 514 /* paranoid */ verify( kernelTLS().this_processor ); 515 516 processor * const proc = kernelTLS().this_processor; 517 const int cpu = __kernel_getcpu(); 518 /* paranoid */ verify(cpu >= 0); 519 /* paranoid */ verify(cpu < cpu_info.hthrd_count); 520 /* paranoid */ verify(cpu * READYQ_SHARD_FACTOR < lanes.count); 521 522 const cpu_map_entry_t & map = cpu_info.llc_map[cpu]; 523 /* paranoid */ verify(map.start * READYQ_SHARD_FACTOR < lanes.count); 524 /* paranoid */ verify(map.self * READYQ_SHARD_FACTOR < lanes.count); 525 /* paranoid */ verifyf((map.start + map.count) * READYQ_SHARD_FACTOR <= lanes.count, "have %zu lanes but map can go up to %u", lanes.count, (map.start + map.count) * READYQ_SHARD_FACTOR); 526 527 const int start = map.self * READYQ_SHARD_FACTOR; 528 const unsigned long long ctsc = rdtscl(); 529 530 // Did we already have a help target 531 if(proc->rdq.target == MAX) { 532 unsigned long long max = 0; 533 for(i; READYQ_SHARD_FACTOR) { 534 unsigned long long tsc = moving_average(ctsc, ts(lanes.data[start + i]), lanes.tscs[start + i].ma); 535 if(tsc > max) max = tsc; 536 } 537 // proc->rdq.cutoff = (max + 2 * max) / 2; 538 /* paranoid */ verify(lanes.count < 65536); // The following code assumes max 65536 cores. 539 /* paranoid */ verify(map.count < 65536); // The following code assumes max 65536 cores. 540 541 if(0 == (__tls_rand() % 100)) { 542 proc->rdq.target = __tls_rand() % lanes.count; 543 } else { 544 unsigned cpu_chaos = map.start + (__tls_rand() % map.count); 545 proc->rdq.target = (cpu_chaos * READYQ_SHARD_FACTOR) + (__tls_rand() % READYQ_SHARD_FACTOR); 546 /* paranoid */ verify(proc->rdq.target >= (map.start * READYQ_SHARD_FACTOR)); 547 /* paranoid */ verify(proc->rdq.target < ((map.start + map.count) * READYQ_SHARD_FACTOR)); 548 } 549 550 /* paranoid */ verify(proc->rdq.target != MAX); 551 } 552 else { 553 unsigned long long max = 0; 554 for(i; READYQ_SHARD_FACTOR) { 555 unsigned long long tsc = moving_average(ctsc, ts(lanes.data[start + i]), lanes.tscs[start + i].ma); 556 if(tsc > max) max = tsc; 557 } 558 const unsigned long long cutoff = (max + 2 * max) / 2; 559 { 560 unsigned target = proc->rdq.target; 561 proc->rdq.target = MAX; 562 lanes.help[target / READYQ_SHARD_FACTOR].tri++; 563 if(moving_average(ctsc, lanes.tscs[target].tv, lanes.tscs[target].ma) > cutoff) { 564 thread$ * t = try_pop(cltr, target __STATS(, __tls_stats()->ready.pop.help)); 565 proc->rdq.last = target; 566 if(t) return t; 567 } 568 proc->rdq.target = MAX; 569 } 570 571 unsigned last = proc->rdq.last; 572 if(last != MAX && moving_average(ctsc, lanes.tscs[last].tv, lanes.tscs[last].ma) > cutoff) { 573 thread$ * t = try_pop(cltr, last __STATS(, __tls_stats()->ready.pop.help)); 574 if(t) return t; 575 } 576 else { 577 proc->rdq.last = MAX; 578 } 579 } 580 581 for(READYQ_SHARD_FACTOR) { 582 unsigned i = start + (proc->rdq.itr++ % READYQ_SHARD_FACTOR); 583 if(thread$ * t = try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.local))) return t; 584 } 585 586 // All lanes where empty return 0p 587 return 0p; 588 } 589 590 __attribute__((hot)) struct thread$ * pop_slow(struct cluster * cltr) with (cltr->ready_queue) { 591 processor * const proc = kernelTLS().this_processor; 592 unsigned last = proc->rdq.last; 593 if(last != MAX) { 594 struct thread$ * t = try_pop(cltr, last __STATS(, __tls_stats()->ready.pop.steal)); 595 if(t) return t; 596 proc->rdq.last = MAX; 597 } 598 599 unsigned i = __tls_rand() % 
lanes.count; 600 return try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.steal)); 601 } 602 __attribute__((hot)) struct thread$ * pop_search(struct cluster * cltr) { 603 return search(cltr); 604 } 605 #endif 606 #if defined(USE_RELAXED_FIFO) 607 //----------------------------------------------------------------------- 608 // get index from random number with or without bias towards queues 609 static inline [unsigned, bool] idx_from_r(unsigned r, unsigned preferred) { 610 unsigned i; 611 bool local; 612 unsigned rlow = r % BIAS; 613 unsigned rhigh = r / BIAS; 614 if((0 != rlow) && preferred >= 0) { 615 // (BIAS - 1) out of BIAS chances 616 // Use perferred queues 617 i = preferred + (rhigh % READYQ_SHARD_FACTOR); 618 local = true; 619 } 620 else { 621 // 1 out of BIAS chances 622 // Use all queues 623 i = rhigh; 624 local = false; 625 } 626 return [i, local]; 627 } 628 629 __attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, unpark_hint hint) with (cltr->ready_queue) { 630 __cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr); 631 632 const bool external = (hint != UNPARK_LOCAL) || (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr); 633 /* paranoid */ verify(external || kernelTLS().this_processor->rdq.id < lanes.count ); 634 635 bool local; 636 int preferred = external ? -1 : kernelTLS().this_processor->rdq.id; 637 638 // Try to pick a lane and lock it 639 unsigned i; 640 do { 641 // Pick the index of a lane 642 unsigned r = __tls_rand_fwd(); 643 [i, local] = idx_from_r(r, preferred); 644 645 i %= __atomic_load_n( &lanes.count, __ATOMIC_RELAXED ); 646 647 #if !defined(__CFA_NO_STATISTICS__) 648 if(unlikely(external)) __atomic_fetch_add(&cltr->stats->ready.push.extrn.attempt, 1, __ATOMIC_RELAXED); 649 else if(local) __tls_stats()->ready.push.local.attempt++; 650 else __tls_stats()->ready.push.share.attempt++; 651 #endif 652 653 // If we can't lock it retry 654 } while( !__atomic_try_acquire( &lanes.data[i].lock ) ); 655 656 // Actually push it 657 push(lanes.data[i], thrd); 658 659 // Unlock and return 660 __atomic_unlock( &lanes.data[i].lock ); 661 662 // Mark the current index in the tls rng instance as having an item 663 __tls_rand_advance_bck(); 664 665 __cfadbg_print_safe(ready_queue, "Kernel : Pushed %p on cluster %p (idx: %u, mask %llu, first %d)\n", thrd, cltr, i, used.mask[0], lane_first); 666 667 // Update statistics 668 #if !defined(__CFA_NO_STATISTICS__) 669 if(unlikely(external)) __atomic_fetch_add(&cltr->stats->ready.push.extrn.success, 1, __ATOMIC_RELAXED); 670 else if(local) __tls_stats()->ready.push.local.success++; 671 else __tls_stats()->ready.push.share.success++; 672 #endif 673 } 674 675 // Pop from the ready queue from a given cluster 676 __attribute__((hot)) thread$ * pop_fast(struct cluster * cltr) with (cltr->ready_queue) { 677 /* paranoid */ verify( lanes.count > 0 ); 678 /* paranoid */ verify( kernelTLS().this_processor ); 679 /* paranoid */ verify( kernelTLS().this_processor->rdq.id < lanes.count ); 680 681 unsigned count = __atomic_load_n( &lanes.count, __ATOMIC_RELAXED ); 682 int preferred = kernelTLS().this_processor->rdq.id; 683 684 685 // As long as the list is not empty, try finding a lane that isn't empty and pop from it 686 for(25) { 687 // Pick two lists at random 688 unsigned ri = __tls_rand_bck(); 689 unsigned rj = __tls_rand_bck(); 690 691 unsigned i, j; 692 __attribute__((unused)) bool locali, localj; 693 [i, locali] = idx_from_r(ri, preferred); 694 [j, localj] = 
idx_from_r(rj, preferred); 695 696 i %= count; 697 j %= count; 698 699 // try popping from the 2 picked lists 700 struct thread$ * thrd = try_pop(cltr, i, j __STATS(, *(locali || localj ? &__tls_stats()->ready.pop.local : &__tls_stats()->ready.pop.help))); 701 if(thrd) { 702 return thrd; 703 } 704 } 705 706 // All lanes where empty return 0p 707 return 0p; 708 } 709 710 __attribute__((hot)) struct thread$ * pop_slow(struct cluster * cltr) { return pop_fast(cltr); } 711 __attribute__((hot)) struct thread$ * pop_search(struct cluster * cltr) { 712 return search(cltr); 713 } 714 #endif 715 #if defined(USE_WORK_STEALING) 716 __attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, unpark_hint hint) with (cltr->ready_queue) { 717 __cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr); 718 719 // #define USE_PREFERRED 720 #if !defined(USE_PREFERRED) 721 const bool external = (hint != UNPARK_LOCAL) || (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr); 722 /* paranoid */ verify(external || kernelTLS().this_processor->rdq.id < lanes.count ); 723 #else 724 unsigned preferred = thrd->preferred; 725 const bool external = (hint != UNPARK_LOCAL) || (!kernelTLS().this_processor) || preferred == MAX || thrd->curr_cluster != cltr; 726 /* paranoid */ verifyf(external || preferred < lanes.count, "Invalid preferred queue %u for %u lanes", preferred, lanes.count ); 727 728 unsigned r = preferred % READYQ_SHARD_FACTOR; 729 const unsigned start = preferred - r; 730 #endif 731 732 // Try to pick a lane and lock it 733 unsigned i; 734 do { 735 #if !defined(__CFA_NO_STATISTICS__) 736 if(unlikely(external)) __atomic_fetch_add(&cltr->stats->ready.push.extrn.attempt, 1, __ATOMIC_RELAXED); 737 else __tls_stats()->ready.push.local.attempt++; 738 #endif 739 740 if(unlikely(external)) { 741 i = __tls_rand() % lanes.count; 742 } 743 else { 744 #if !defined(USE_PREFERRED) 745 processor * proc = kernelTLS().this_processor; 746 unsigned r = proc->rdq.its++; 747 i = proc->rdq.id + (r % READYQ_SHARD_FACTOR); 748 #else 749 i = start + (r++ % READYQ_SHARD_FACTOR); 750 #endif 751 } 752 // If we can't lock it retry 753 } while( !__atomic_try_acquire( &lanes.data[i].lock ) ); 754 755 // Actually push it 756 push(lanes.data[i], thrd); 757 758 // Unlock and return 759 __atomic_unlock( &lanes.data[i].lock ); 760 761 #if !defined(__CFA_NO_STATISTICS__) 762 if(unlikely(external)) __atomic_fetch_add(&cltr->stats->ready.push.extrn.success, 1, __ATOMIC_RELAXED); 763 else __tls_stats()->ready.push.local.success++; 764 #endif 765 766 __cfadbg_print_safe(ready_queue, "Kernel : Pushed %p on cluster %p (idx: %u, mask %llu, first %d)\n", thrd, cltr, i, used.mask[0], lane_first); 767 } 768 769 // Pop from the ready queue from a given cluster 770 __attribute__((hot)) thread$ * pop_fast(struct cluster * cltr) with (cltr->ready_queue) { 771 /* paranoid */ verify( lanes.count > 0 ); 772 /* paranoid */ verify( kernelTLS().this_processor ); 773 /* paranoid */ verify( kernelTLS().this_processor->rdq.id < lanes.count ); 774 775 processor * proc = kernelTLS().this_processor; 776 777 if(proc->rdq.target == MAX) { 778 unsigned long long min = ts(lanes.data[proc->rdq.id]); 779 for(int i = 0; i < READYQ_SHARD_FACTOR; i++) { 780 unsigned long long tsc = ts(lanes.data[proc->rdq.id + i]); 781 if(tsc < min) min = tsc; 782 } 783 proc->rdq.cutoff = min; 784 proc->rdq.target = __tls_rand() % lanes.count; 785 } 786 else { 787 unsigned target = proc->rdq.target; 788 proc->rdq.target = MAX; 789 
const unsigned long long bias = 0; //2_500_000_000; 790 const unsigned long long cutoff = proc->rdq.cutoff > bias ? proc->rdq.cutoff - bias : proc->rdq.cutoff; 791 if(lanes.tscs[target].tv < cutoff && ts(lanes.data[target]) < cutoff) { 97 } while( !__atomic_try_acquire( &readyQ.data[i].lock ) ); 98 } 99 100 // Actually push it 101 push(readyQ.data[i], thrd); 102 103 // Unlock and return 104 __atomic_unlock( &readyQ.data[i].lock ); 105 106 #if !defined(__CFA_NO_STATISTICS__) 107 if(unlikely(external || remote)) __atomic_fetch_add(&cltr->stats->ready.push.extrn.success, 1, __ATOMIC_RELAXED); 108 else __tls_stats()->ready.push.local.success++; 109 #endif 110 } 111 112 __attribute__((hot)) struct thread$ * pop_fast(struct cluster * cltr) with (cltr->sched) { 113 const size_t lanes_count = readyQ.count; 114 115 /* paranoid */ verify( __shard_factor.readyq > 0 ); 116 /* paranoid */ verify( lanes_count > 0 ); 117 /* paranoid */ verify( kernelTLS().this_processor ); 118 /* paranoid */ verify( kernelTLS().this_processor->rdq.id < lanes_count ); 119 120 processor * const proc = kernelTLS().this_processor; 121 unsigned this = proc->rdq.id; 122 /* paranoid */ verify( this < lanes_count ); 123 __cfadbg_print_safe(ready_queue, "Kernel : pop from %u\n", this); 124 125 // Figure out the current cache is 126 const unsigned this_cache = cache_id(cltr, this / __shard_factor.readyq); 127 const unsigned long long ctsc = rdtscl(); 128 129 if(proc->rdq.target == MAX) { 130 uint64_t chaos = __tls_rand(); 131 unsigned ext = chaos & 0xff; 132 unsigned other = (chaos >> 8) % (lanes_count); 133 134 if(ext < 3 || __atomic_load_n(&caches[other / __shard_factor.readyq].id, __ATOMIC_RELAXED) == this_cache) { 135 proc->rdq.target = other; 136 } 137 } 138 else { 139 const unsigned target = proc->rdq.target; 140 __cfadbg_print_safe(ready_queue, "Kernel : %u considering helping %u, tcsc %llu\n", this, target, readyQ.tscs[target].tv); 141 /* paranoid */ verify( readyQ.tscs[target].tv != MAX ); 142 if(target < lanes_count) { 143 const unsigned long long cutoff = calc_cutoff(ctsc, proc, lanes_count, cltr->sched.readyQ.data, cltr->sched.readyQ.tscs, __shard_factor.readyq); 144 const unsigned long long age = moving_average(ctsc, readyQ.tscs[target].tv, readyQ.tscs[target].ma); 145 __cfadbg_print_safe(ready_queue, "Kernel : Help attempt on %u from %u, age %'llu vs cutoff %'llu, %s\n", target, this, age, cutoff, age > cutoff ? 
"yes" : "no"); 146 if(age > cutoff) { 792 147 thread$ * t = try_pop(cltr, target __STATS(, __tls_stats()->ready.pop.help)); 793 148 if(t) return t; 794 149 } 795 150 } 796 797 for(READYQ_SHARD_FACTOR) { 798 unsigned i = proc->rdq.id + (proc->rdq.itr++ % READYQ_SHARD_FACTOR); 799 if(thread$ * t = try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.local))) return t; 800 } 801 return 0p; 802 } 803 804 __attribute__((hot)) struct thread$ * pop_slow(struct cluster * cltr) with (cltr->ready_queue) { 805 unsigned i = __tls_rand() % lanes.count; 806 return try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.steal)); 807 } 808 809 __attribute__((hot)) struct thread$ * pop_search(struct cluster * cltr) with (cltr->ready_queue) { 810 return search(cltr); 811 } 812 #endif 151 proc->rdq.target = MAX; 152 } 153 154 for(__shard_factor.readyq) { 155 unsigned i = this + (proc->rdq.itr++ % __shard_factor.readyq); 156 if(thread$ * t = try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.local))) return t; 157 } 158 159 // All lanes where empty return 0p 160 return 0p; 161 162 } 163 __attribute__((hot)) struct thread$ * pop_slow(struct cluster * cltr) { 164 unsigned i = __tls_rand() % (cltr->sched.readyQ.count); 165 return try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.steal)); 166 } 167 __attribute__((hot)) struct thread$ * pop_search(struct cluster * cltr) { 168 return search(cltr); 169 } 813 170 814 171 //======================================================================= … … 820 177 //----------------------------------------------------------------------- 821 178 // try to pop from a lane given by index w 822 static inline struct thread$ * try_pop(struct cluster * cltr, unsigned w __STATS(, __stats_readyQ_pop_t & stats)) with (cltr-> ready_queue) {823 /* paranoid */ verify( w < lanes.count );179 static inline struct thread$ * try_pop(struct cluster * cltr, unsigned w __STATS(, __stats_readyQ_pop_t & stats)) with (cltr->sched) { 180 /* paranoid */ verify( w < readyQ.count ); 824 181 __STATS( stats.attempt++; ) 825 182 826 183 // Get relevant elements locally 827 __intrusive_lane_t & lane = lanes.data[w];184 __intrusive_lane_t & lane = readyQ.data[w]; 828 185 829 186 // If list looks empty retry … … 845 202 // Actually pop the list 846 203 struct thread$ * thrd; 847 #if defined(USE_AWARE_STEALING) || defined(USE_WORK_STEALING) || defined(USE_CPU_WORK_STEALING) 848 unsigned long long tsc_before = ts(lane); 849 #endif 204 unsigned long long tsc_before = ts(lane); 850 205 unsigned long long tsv; 851 206 [thrd, tsv] = pop(lane); … … 861 216 __STATS( stats.success++; ) 862 217 863 #if defined(USE_AWARE_STEALING) || defined(USE_WORK_STEALING) || defined(USE_CPU_WORK_STEALING) 864 if (tsv != MAX) { 865 unsigned long long now = rdtscl(); 866 unsigned long long pma = __atomic_load_n(&lanes.tscs[w].ma, __ATOMIC_RELAXED); 867 __atomic_store_n(&lanes.tscs[w].tv, tsv, __ATOMIC_RELAXED); 868 __atomic_store_n(&lanes.tscs[w].ma, moving_average(now, tsc_before, pma), __ATOMIC_RELAXED); 869 } 870 #endif 871 872 #if defined(USE_AWARE_STEALING) || defined(USE_CPU_WORK_STEALING) 873 thrd->preferred = w / READYQ_SHARD_FACTOR; 874 #else 875 thrd->preferred = w; 876 #endif 218 if (tsv != MAX) { 219 unsigned long long now = rdtscl(); 220 unsigned long long pma = __atomic_load_n(&readyQ.tscs[w].ma, __ATOMIC_RELAXED); 221 __atomic_store_n(&readyQ.tscs[w].tv, tsv, __ATOMIC_RELAXED); 222 __atomic_store_n(&readyQ.tscs[w].ma, moving_average(now, tsc_before, pma), __ATOMIC_RELAXED); 223 } 224 225 thrd->preferred = w / 
__shard_factor.readyq; 877 226 878 227 // return the popped thread … … 883 232 // try to pop from any lanes making sure you don't miss any threads push 884 233 // before the start of the function 885 static inline struct thread$ * search(struct cluster * cltr) with (cltr->ready_queue) { 886 /* paranoid */ verify( lanes.count > 0 ); 887 unsigned count = __atomic_load_n( &lanes.count, __ATOMIC_RELAXED ); 234 static inline struct thread$ * search(struct cluster * cltr) { 235 const size_t lanes_count = cltr->sched.readyQ.count; 236 /* paranoid */ verify( lanes_count > 0 ); 237 unsigned count = __atomic_load_n( &lanes_count, __ATOMIC_RELAXED ); 888 238 unsigned offset = __tls_rand(); 889 239 for(i; count) { … … 902 252 // get preferred ready for new thread 903 253 unsigned ready_queue_new_preferred() { 904 unsigned pref = 0;254 unsigned pref = MAX; 905 255 if(struct thread$ * thrd = publicTLS_get( this_thread )) { 906 256 pref = thrd->preferred; 907 257 } 908 else {909 #if defined(USE_CPU_WORK_STEALING)910 pref = __kernel_getcpu();911 #endif912 }913 914 #if defined(USE_CPU_WORK_STEALING)915 /* paranoid */ verify(pref >= 0);916 /* paranoid */ verify(pref < cpu_info.hthrd_count);917 #endif918 258 919 259 return pref; … … 921 261 922 262 //----------------------------------------------------------------------- 923 // Check that all the intrusive queues in the data structure are still consistent924 static void check( __ready_queue_t & q ) with (q) {925 #if defined(__CFA_WITH_VERIFY__)926 {927 for( idx ; lanes.count ) {928 __intrusive_lane_t & sl = lanes.data[idx];929 assert(!lanes.data[idx].lock);930 931 if(is_empty(sl)) {932 assert( sl.anchor.next == 0p );933 assert( sl.anchor.ts == -1llu );934 assert( mock_head(sl) == sl.prev );935 } else {936 assert( sl.anchor.next != 0p );937 assert( sl.anchor.ts != -1llu );938 assert( mock_head(sl) != sl.prev );939 }940 }941 }942 #endif943 }944 945 //-----------------------------------------------------------------------946 263 // Given 2 indexes, pick the list with the oldest push an try to pop from it 947 static inline struct thread$ * try_pop(struct cluster * cltr, unsigned i, unsigned j __STATS(, __stats_readyQ_pop_t & stats)) with (cltr-> ready_queue) {264 static inline struct thread$ * try_pop(struct cluster * cltr, unsigned i, unsigned j __STATS(, __stats_readyQ_pop_t & stats)) with (cltr->sched) { 948 265 // Pick the bet list 949 266 int w = i; 950 if( __builtin_expect(!is_empty( lanes.data[j]), true) ) {951 w = (ts( lanes.data[i]) < ts(lanes.data[j])) ? i : j;267 if( __builtin_expect(!is_empty(readyQ.data[j]), true) ) { 268 w = (ts(readyQ.data[i]) < ts(readyQ.data[j])) ? 
i : j; 952 269 } 953 270 954 271 return try_pop(cltr, w __STATS(, stats)); 955 272 } 956 957 // Call this function of the intrusive list was moved using memcpy958 // fixes the list so that the pointers back to anchors aren't left dangling959 static inline void fix(__intrusive_lane_t & ll) {960 if(is_empty(ll)) {961 verify(ll.anchor.next == 0p);962 ll.prev = mock_head(ll);963 }964 }965 966 static void assign_list(unsigned & value, dlist(processor) & list, unsigned count) {967 processor * it = &list`first;968 for(unsigned i = 0; i < count; i++) {969 /* paranoid */ verifyf( it, "Unexpected null iterator, at index %u of %u\n", i, count);970 it->rdq.id = value;971 it->rdq.target = MAX;972 value += READYQ_SHARD_FACTOR;973 it = &(*it)`next;974 }975 }976 977 static void reassign_cltr_id(struct cluster * cltr) {978 unsigned preferred = 0;979 assign_list(preferred, cltr->procs.actives, cltr->procs.total - cltr->procs.idle);980 assign_list(preferred, cltr->procs.idles , cltr->procs.idle );981 }982 983 static void fix_times( struct cluster * cltr ) with( cltr->ready_queue ) {984 #if defined(USE_AWARE_STEALING) || defined(USE_WORK_STEALING)985 lanes.tscs = alloc(lanes.count, lanes.tscs`realloc);986 for(i; lanes.count) {987 lanes.tscs[i].tv = rdtscl();988 lanes.tscs[i].ma = 0;989 }990 #endif991 }992 993 #if defined(USE_CPU_WORK_STEALING)994 // ready_queue size is fixed in this case995 void ready_queue_grow(struct cluster * cltr) {}996 void ready_queue_shrink(struct cluster * cltr) {}997 #else998 // Grow the ready queue999 void ready_queue_grow(struct cluster * cltr) {1000 size_t ncount;1001 int target = cltr->procs.total;1002 1003 /* paranoid */ verify( ready_mutate_islocked() );1004 __cfadbg_print_safe(ready_queue, "Kernel : Growing ready queue\n");1005 1006 // Make sure that everything is consistent1007 /* paranoid */ check( cltr->ready_queue );1008 1009 // grow the ready queue1010 with( cltr->ready_queue ) {1011 // Find new count1012 // Make sure we always have atleast 1 list1013 if(target >= 2) {1014 ncount = target * READYQ_SHARD_FACTOR;1015 } else {1016 ncount = SEQUENTIAL_SHARD;1017 }1018 1019 // Allocate new array (uses realloc and memcpies the data)1020 lanes.data = alloc( ncount, lanes.data`realloc );1021 1022 // Fix the moved data1023 for( idx; (size_t)lanes.count ) {1024 fix(lanes.data[idx]);1025 }1026 1027 // Construct new data1028 for( idx; (size_t)lanes.count ~ ncount) {1029 (lanes.data[idx]){};1030 }1031 1032 // Update original1033 lanes.count = ncount;1034 1035 lanes.caches = alloc( target, lanes.caches`realloc );1036 }1037 1038 fix_times(cltr);1039 1040 reassign_cltr_id(cltr);1041 1042 // Make sure that everything is consistent1043 /* paranoid */ check( cltr->ready_queue );1044 1045 __cfadbg_print_safe(ready_queue, "Kernel : Growing ready queue done\n");1046 1047 /* paranoid */ verify( ready_mutate_islocked() );1048 }1049 1050 // Shrink the ready queue1051 void ready_queue_shrink(struct cluster * cltr) {1052 /* paranoid */ verify( ready_mutate_islocked() );1053 __cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue\n");1054 1055 // Make sure that everything is consistent1056 /* paranoid */ check( cltr->ready_queue );1057 1058 int target = cltr->procs.total;1059 1060 with( cltr->ready_queue ) {1061 // Remember old count1062 size_t ocount = lanes.count;1063 1064 // Find new count1065 // Make sure we always have atleast 1 list1066 lanes.count = target >= 2 ? 
target * READYQ_SHARD_FACTOR: SEQUENTIAL_SHARD;1067 /* paranoid */ verify( ocount >= lanes.count );1068 /* paranoid */ verify( lanes.count == target * READYQ_SHARD_FACTOR || target < 2 );1069 1070 // for printing count the number of displaced threads1071 #if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_READY_QUEUE__)1072 __attribute__((unused)) size_t displaced = 0;1073 #endif1074 1075 // redistribute old data1076 for( idx; (size_t)lanes.count ~ ocount) {1077 // Lock is not strictly needed but makes checking invariants much easier1078 __attribute__((unused)) bool locked = __atomic_try_acquire(&lanes.data[idx].lock);1079 verify(locked);1080 1081 // As long as we can pop from this lane to push the threads somewhere else in the queue1082 while(!is_empty(lanes.data[idx])) {1083 struct thread$ * thrd;1084 unsigned long long _;1085 [thrd, _] = pop(lanes.data[idx]);1086 1087 push(cltr, thrd, true);1088 1089 // for printing count the number of displaced threads1090 #if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_READY_QUEUE__)1091 displaced++;1092 #endif1093 }1094 1095 // Unlock the lane1096 __atomic_unlock(&lanes.data[idx].lock);1097 1098 // TODO print the queue statistics here1099 1100 ^(lanes.data[idx]){};1101 }1102 1103 __cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue displaced %zu threads\n", displaced);1104 1105 // Allocate new array (uses realloc and memcpies the data)1106 lanes.data = alloc( lanes.count, lanes.data`realloc );1107 1108 // Fix the moved data1109 for( idx; (size_t)lanes.count ) {1110 fix(lanes.data[idx]);1111 }1112 1113 lanes.caches = alloc( target, lanes.caches`realloc );1114 }1115 1116 fix_times(cltr);1117 1118 1119 reassign_cltr_id(cltr);1120 1121 // Make sure that everything is consistent1122 /* paranoid */ check( cltr->ready_queue );1123 1124 __cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue done\n");1125 /* paranoid */ verify( ready_mutate_islocked() );1126 }1127 #endif1128 1129 #if !defined(__CFA_NO_STATISTICS__)1130 unsigned cnt(const __ready_queue_t & this, unsigned idx) {1131 /* paranoid */ verify(this.lanes.count > idx);1132 return this.lanes.data[idx].cnt;1133 }1134 #endif1135 1136 1137 #if defined(CFA_HAVE_LINUX_LIBRSEQ)1138 // No definition needed1139 #elif defined(CFA_HAVE_LINUX_RSEQ_H)1140 1141 #if defined( __x86_64 ) || defined( __i386 )1142 #define RSEQ_SIG 0x530530531143 #elif defined( __ARM_ARCH )1144 #ifdef __ARMEB__1145 #define RSEQ_SIG 0xf3def5e7 /* udf #24035 ; 0x5de3 (ARMv6+) */1146 #else1147 #define RSEQ_SIG 0xe7f5def3 /* udf #24035 ; 0x5de3 */1148 #endif1149 #endif1150 1151 extern void __disable_interrupts_hard();1152 extern void __enable_interrupts_hard();1153 1154 static void __kernel_raw_rseq_register (void) {1155 /* paranoid */ verify( __cfaabi_rseq.cpu_id == RSEQ_CPU_ID_UNINITIALIZED );1156 1157 // int ret = syscall(__NR_rseq, &__cfaabi_rseq, sizeof(struct rseq), 0, (sigset_t *)0p, _NSIG / 8);1158 int ret = syscall(__NR_rseq, &__cfaabi_rseq, sizeof(struct rseq), 0, RSEQ_SIG);1159 if(ret != 0) {1160 int e = errno;1161 switch(e) {1162 case EINVAL: abort("KERNEL ERROR: rseq register invalid argument");1163 case ENOSYS: abort("KERNEL ERROR: rseq register no supported");1164 case EFAULT: abort("KERNEL ERROR: rseq register with invalid argument");1165 case EBUSY : abort("KERNEL ERROR: rseq register already registered");1166 case EPERM : abort("KERNEL ERROR: rseq register sig argument on unregistration does not match the signature received on registration");1167 default: abort("KERNEL ERROR: 
rseq register unexpected return %d", e);1168 }1169 }1170 }1171 1172 static void __kernel_raw_rseq_unregister(void) {1173 /* paranoid */ verify( __cfaabi_rseq.cpu_id >= 0 );1174 1175 // int ret = syscall(__NR_rseq, &__cfaabi_rseq, sizeof(struct rseq), RSEQ_FLAG_UNREGISTER, (sigset_t *)0p, _NSIG / 8);1176 int ret = syscall(__NR_rseq, &__cfaabi_rseq, sizeof(struct rseq), RSEQ_FLAG_UNREGISTER, RSEQ_SIG);1177 if(ret != 0) {1178 int e = errno;1179 switch(e) {1180 case EINVAL: abort("KERNEL ERROR: rseq unregister invalid argument");1181 case ENOSYS: abort("KERNEL ERROR: rseq unregister no supported");1182 case EFAULT: abort("KERNEL ERROR: rseq unregister with invalid argument");1183 case EBUSY : abort("KERNEL ERROR: rseq unregister already registered");1184 case EPERM : abort("KERNEL ERROR: rseq unregister sig argument on unregistration does not match the signature received on registration");1185 default: abort("KERNEL ERROR: rseq unregisteunexpected return %d", e);1186 }1187 }1188 }1189 #else1190 // No definition needed1191 #endif -
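The relaxed-FIFO push shown above splits a single random draw into two parts: the low bits decide whether to stay on the processor's preferred shards ((BIAS - 1) out of BIAS chances) or to spread over all lanes, and the high bits pick the index within the chosen range. The sketch below restates that split in plain C; BIAS and READYQ_SHARD_FACTOR are given illustrative values here, the real constants are defined elsewhere in the library.

#include <stdbool.h>

/* Illustrative constants; the real values come from the library's headers. */
#define BIAS                16
#define READYQ_SHARD_FACTOR  2

/* Split one random draw r into a bias decision and a lane index.
 * Returns the chosen index (before the caller's final "% lane count")
 * and sets *local to true when the preferred shards were used. */
static unsigned idx_from_r(unsigned r, int preferred, bool *local) {
    unsigned rlow  = r % BIAS;   /* decides local vs. global           */
    unsigned rhigh = r / BIAS;   /* independent bits used as the index */

    if (rlow != 0 && preferred >= 0) {
        /* (BIAS - 1) out of BIAS chances: stay on the preferred shards. */
        *local = true;
        return (unsigned)preferred + (rhigh % READYQ_SHARD_FACTOR);
    }
    /* 1 out of BIAS chances (or no preference): spread over all lanes. */
    *local = false;
    return rhigh;
}

The caller reduces the returned index modulo the current lane count and then try-locks that lane, re-rolling the random number on every failed attempt, which is why the push loop in the diff draws a fresh value inside the do-while.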
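Both the relaxed-FIFO pop and the two-index try_pop helper near the end of the file rely on the "power of two choices": sample two lanes, compare the timestamp of their oldest element, and pop from the lane whose head has waited longer. Below is a minimal, self-contained sketch of that decision with a simplified one-slot lane standing in for the intrusive thread list; the types and helpers are illustrative, not the library's.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define EMPTY UINT64_MAX   /* timestamp value used for an empty lane */

/* Simplified lane: one slot plus the enqueue timestamp of that slot. */
typedef struct {
    atomic_bool lock;
    uint64_t    ts;      /* timestamp of the oldest element, EMPTY if none */
    void       *item;
} lane_t;

static bool try_acquire(atomic_bool *l) {
    return !atomic_exchange_explicit(l, true, memory_order_acquire);
}
static void release(atomic_bool *l) {
    atomic_store_explicit(l, false, memory_order_release);
}

/* "Power of two choices": of the two sampled lanes, pop from the one
 * whose oldest element has been waiting the longest. */
static void *try_pop_two(lane_t lanes[], unsigned i, unsigned j) {
    unsigned w = i;
    if (lanes[j].ts != EMPTY)                      /* j is not empty        */
        w = (lanes[i].ts < lanes[j].ts) ? i : j;   /* older head wins       */

    if (lanes[w].ts == EMPTY) return NULL;         /* looked empty, give up */
    if (!try_acquire(&lanes[w].lock)) return NULL; /* contended, give up    */

    void *thrd = lanes[w].item;                    /* pop under the lock    */
    lanes[w].item = NULL;
    lanes[w].ts   = EMPTY;
    release(&lanes[w].lock);
    return thrd;
}

As in the diff, failure is cheap by design: an empty or contended lane simply returns nothing and the caller retries with two new random indices.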
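The work-stealing pop keeps two numbers per lane: tv, the timestamp of the current head, and ma, a moving average of observed waiting times; a processor only helps a randomly drawn remote lane when that lane's estimated age exceeds a cutoff computed from its own shards. The exact moving_average and calc_cutoff formulas are defined elsewhere in the library, so the sketch below substitutes a simple exponentially weighted average and takes the oldest local age as the cutoff; it illustrates the shape of the test, not the library's arithmetic.

#include <stdbool.h>
#include <stdint.h>

typedef struct {
    uint64_t tv;   /* timestamp of the lane's current head element */
    uint64_t ma;   /* moving average of observed waiting times     */
} lane_age_t;

/* Assumed stand-in for moving_average(): blend the instantaneous age
 * of the head with the historical average. */
static uint64_t est_age(uint64_t now, uint64_t tv, uint64_t ma) {
    uint64_t inst = (now > tv) ? now - tv : 0;
    return (inst + 3 * ma) / 4;            /* simple EWMA, weight 1/4 */
}

/* Assumed stand-in for calc_cutoff(): the largest estimated age among
 * the processor's own shards. Helping a remote lane is only worthwhile
 * if it looks older than anything this processor already owns. */
static uint64_t local_cutoff(uint64_t now, const lane_age_t lanes[],
                             unsigned first, unsigned shard_factor) {
    uint64_t cutoff = 0;
    for (unsigned i = 0; i < shard_factor; i++) {
        uint64_t a = est_age(now, lanes[first + i].tv, lanes[first + i].ma);
        if (a > cutoff) cutoff = a;
    }
    return cutoff;
}

/* Decision made once per random "target" draw. */
static bool should_help(uint64_t now, const lane_age_t lanes[],
                        unsigned my_first, unsigned shard_factor,
                        unsigned target) {
    uint64_t cutoff = local_cutoff(now, lanes, my_first, shard_factor);
    uint64_t age    = est_age(now, lanes[target].tv, lanes[target].ma);
    return age > cutoff;
}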
libcfa/src/concurrency/ready_subqueue.hfa
ref3c383 rd672350 25 25 ); 26 26 return rhead; 27 }28 29 // Ctor30 void ?{}( __intrusive_lane_t & this ) {31 this.lock = false;32 this.prev = mock_head(this);33 this.anchor.next = 0p;34 this.anchor.ts = -1llu;35 #if !defined(__CFA_NO_STATISTICS__)36 this.cnt = 0;37 #endif38 39 // We add a boat-load of assertions here because the anchor code is very fragile40 /* paranoid */ _Static_assert( offsetof( thread$, link ) == offsetof(__intrusive_lane_t, anchor) );41 /* paranoid */ verify( offsetof( thread$, link ) == offsetof(__intrusive_lane_t, anchor) );42 /* paranoid */ verify( ((uintptr_t)( mock_head(this) ) + offsetof( thread$, link )) == (uintptr_t)(&this.anchor) );43 /* paranoid */ verify( &mock_head(this)->link.next == &this.anchor.next );44 /* paranoid */ verify( &mock_head(this)->link.ts == &this.anchor.ts );45 /* paranoid */ verify( mock_head(this)->link.next == 0p );46 /* paranoid */ verify( mock_head(this)->link.ts == -1llu );47 /* paranoid */ verify( mock_head(this) == this.prev );48 /* paranoid */ verify( __alignof__(__intrusive_lane_t) == 128 );49 /* paranoid */ verify( __alignof__(this) == 128 );50 /* paranoid */ verifyf( ((intptr_t)(&this) % 128) == 0, "Expected address to be aligned %p %% 128 == %zd", &this, ((intptr_t)(&this) % 128) );51 }52 53 // Dtor is trivial54 void ^?{}( __intrusive_lane_t & this ) {55 // Make sure the list is empty56 /* paranoid */ verify( this.anchor.next == 0p );57 /* paranoid */ verify( this.anchor.ts == -1llu );58 /* paranoid */ verify( mock_head(this) == this.prev );59 27 } 60 28 -
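The constructor and destructor removed from this header document the "mock head" trick used by the subqueue: each lane embeds only a bare (next, ts) anchor, and a fake thread pointer is synthesized by subtracting offsetof(thread, link) from the anchor's address, so the push/pop code can treat the anchor exactly like any other node's link field. A hedged C sketch of the same idea with simplified, illustrative types:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Simplified versions of the intrusive link and of a thread node. */
typedef struct {
    void    *next;
    uint64_t ts;
} link_t;

typedef struct thread {
    link_t link;   /* intrusive hook; its offset is what mock_head() relies on */
    int    id;     /* stand-in for the rest of the descriptor                  */
} thread_t;

/* A lane stores a tail pointer and a bare anchor instead of a full
 * dummy thread, saving the size of a whole descriptor per lane. */
typedef struct {
    thread_t *prev;     /* last node pushed, i.e. the list tail   */
    link_t    anchor;   /* plays the role of the head node's link */
} lane_t;

/* Synthesize a thread pointer whose 'link' field aliases the anchor. */
static thread_t *mock_head(lane_t *lane) {
    return (thread_t *)((uintptr_t)&lane->anchor - offsetof(thread_t, link));
}

static void lane_init(lane_t *lane) {
    lane->anchor.next = NULL;
    lane->anchor.ts   = UINT64_MAX;      /* empty marker               */
    lane->prev        = mock_head(lane); /* tail initially = fake head */

    /* The trick only works if the anchor really overlays 'link'. */
    assert(&mock_head(lane)->link.next == &lane->anchor.next);
    assert(&mock_head(lane)->link.ts   == &lane->anchor.ts);
}

The assertions mirror the paranoid checks in the removed constructor: they pin down the layout so that a later refactoring of the thread or lane types cannot silently break the aliasing.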
libcfa/src/concurrency/thread.cfa
ref3c383 rd672350 19 19 #include "thread.hfa" 20 20 21 #include "kernel _private.hfa"21 #include "kernel/private.hfa" 22 22 #include "exception.hfa" 23 23