//
// Cforall Version 1.0.0 Copyright (C) 2019 University of Waterloo
//
// The contents of this file are covered under the licence agreement in the
// file "LICENCE" distributed with Cforall.
//
// ready_queue.cfa --
//
// Author           : Thierry Delisle
// Created On       : Mon Nov dd 16:29:18 2019
// Last Modified By :
// Last Modified On :
// Update Count     :
//

#define __cforall_thread__
#define _GNU_SOURCE

// #define __CFA_DEBUG_PRINT_READY_QUEUE__


// #define USE_RELAXED_FIFO
// #define USE_WORK_STEALING
// #define USE_CPU_WORK_STEALING
#define USE_AWARE_STEALING

#include "bits/defs.hfa"
#include "device/cpu.hfa"
#include "kernel_private.hfa"

#include "stdlib.hfa"
#include "limits.hfa"
#include "math.hfa"

#include <errno.h>
#include <unistd.h>

extern "C" {
	#include <sys/syscall.h>  // __NR_xxx
}

#include "ready_subqueue.hfa"

static const size_t cache_line_size = 64;

#if !defined(__CFA_NO_STATISTICS__)
	#define __STATS(...) __VA_ARGS__
#else
	#define __STATS(...)
#endif

// No overridden function, no environment variable, no define
// fall back to a magic number
#ifndef __CFA_MAX_PROCESSORS__
	#define __CFA_MAX_PROCESSORS__ 1024
#endif

#if   defined(USE_AWARE_STEALING)
	#define READYQ_SHARD_FACTOR 2
	#define SEQUENTIAL_SHARD 2
#elif defined(USE_CPU_WORK_STEALING)
	#define READYQ_SHARD_FACTOR 2
#elif defined(USE_RELAXED_FIFO)
	#define BIAS 4
	#define READYQ_SHARD_FACTOR 4
	#define SEQUENTIAL_SHARD 1
#elif defined(USE_WORK_STEALING)
	#define READYQ_SHARD_FACTOR 2
	#define SEQUENTIAL_SHARD 2
#else
	#error no scheduling strategy selected
#endif
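// Under each strategy the cluster's ready-queue is sharded into multiple lanes:
// READYQ_SHARD_FACTOR lanes per processor (per hardware thread for USE_CPU_WORK_STEALING),
// and SEQUENTIAL_SHARD lanes while a cluster has fewer than two processors
// (see the ready-queue constructor and ready_queue_grow below).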

static inline struct thread$ * try_pop(struct cluster * cltr, unsigned w __STATS(, __stats_readyQ_pop_t & stats));
static inline struct thread$ * try_pop(struct cluster * cltr, unsigned i, unsigned j __STATS(, __stats_readyQ_pop_t & stats));
static inline struct thread$ * search(struct cluster * cltr);
static inline [unsigned, bool] idx_from_r(unsigned r, unsigned preferred);


// returns the maximum number of processors the RWLock supports
__attribute__((weak)) unsigned __max_processors() {
	const char * max_cores_s = getenv("CFA_MAX_PROCESSORS");
	if(!max_cores_s) {
		__cfadbg_print_nolock(ready_queue, "No CFA_MAX_PROCESSORS in ENV\n");
		return __CFA_MAX_PROCESSORS__;
	}

	char * endptr = 0p;
	long int max_cores_l = strtol(max_cores_s, &endptr, 10);
	if(max_cores_l < 1 || max_cores_l > 65535) {
		__cfadbg_print_nolock(ready_queue, "CFA_MAX_PROCESSORS out of range : %ld\n", max_cores_l);
		return __CFA_MAX_PROCESSORS__;
	}
	if('\0' != *endptr) {
		__cfadbg_print_nolock(ready_queue, "CFA_MAX_PROCESSORS not a decimal number : %s\n", max_cores_s);
		return __CFA_MAX_PROCESSORS__;
	}

	return max_cores_l;
}

#if   defined(CFA_HAVE_LINUX_LIBRSEQ)
	// No forward declaration needed
	#define __kernel_rseq_register rseq_register_current_thread
	#define __kernel_rseq_unregister rseq_unregister_current_thread
#elif defined(CFA_HAVE_LINUX_RSEQ_H)
	static void __kernel_raw_rseq_register  (void);
	static void __kernel_raw_rseq_unregister(void);

	#define __kernel_rseq_register __kernel_raw_rseq_register
	#define __kernel_rseq_unregister __kernel_raw_rseq_unregister
#else
	// No forward declaration needed
	// No initialization needed
	static inline void noop(void) {}

	#define __kernel_rseq_register noop
	#define __kernel_rseq_unregister noop
#endif

//=======================================================================
// Cluster wide reader-writer lock
//=======================================================================
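// Readers are processors: each registered processor owns one slot of 'data', which points at
// its thread-local 'sched_lock' flag.  A reader only sets its own flag; the writer acquires
// 'write_lock' and then every registered per-processor flag, so writing excludes all readers.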
void  ?{}(__scheduler_RWLock_t & this) {
	this.max   = __max_processors();
	this.alloc = 0;
	this.ready = 0;
	this.data  = alloc(this.max);
	this.write_lock  = false;

	/*paranoid*/ verify(__atomic_is_lock_free(sizeof(this.alloc), &this.alloc));
	/*paranoid*/ verify(__atomic_is_lock_free(sizeof(this.ready), &this.ready));

}
void ^?{}(__scheduler_RWLock_t & this) {
	free(this.data);
}


//=======================================================================
// Lock-Free registering/unregistering of threads
unsigned register_proc_id( void ) with(*__scheduler_lock) {
	__kernel_rseq_register();

	bool * handle = (bool *)&kernelTLS().sched_lock;

	// Step - 1 : check if there is already space in the data
	uint_fast32_t s = ready;

	// Check among all the ready
	for(uint_fast32_t i = 0; i < s; i++) {
		bool * volatile * cell = (bool * volatile *)&data[i]; // Cforall is bugged and the double volatiles cause problems
		/* paranoid */ verify( handle != *cell );

		bool * null = 0p; // Re-write every loop since compare thrashes it
		if( __atomic_load_n(cell, (int)__ATOMIC_RELAXED) == null
			&& __atomic_compare_exchange_n( cell, &null, handle, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
			/* paranoid */ verify(i < ready);
			/* paranoid */ verify( (kernelTLS().sched_id = i, true) );
			return i;
		}
	}

	if(max <= alloc) abort("Trying to create more than %ud processors", __scheduler_lock->max);

	// Step - 2 : F&A to get a new spot in the array.
	uint_fast32_t n = __atomic_fetch_add(&alloc, 1, __ATOMIC_SEQ_CST);
	if(max <= n) abort("Trying to create more than %ud processors", __scheduler_lock->max);

	// Step - 3 : Mark space as used and then publish it.
	data[n] = handle;
	while() {
		unsigned copy = n;
		if( __atomic_load_n(&ready, __ATOMIC_RELAXED) == n
			&& __atomic_compare_exchange_n(&ready, &copy, n + 1, true, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
			break;
		Pause();
	}

	// Return new spot.
	/* paranoid */ verify(n < ready);
	/* paranoid */ verify( (kernelTLS().sched_id = n, true) );
	return n;
}

void unregister_proc_id( unsigned id ) with(*__scheduler_lock) {
	/* paranoid */ verify(id < ready);
	/* paranoid */ verify(id == kernelTLS().sched_id);
	/* paranoid */ verify(data[id] == &kernelTLS().sched_lock);

	bool * volatile * cell = (bool * volatile *)&data[id]; // Cforall is bugged and the double volatiles cause problems

	__atomic_store_n(cell, 0p, __ATOMIC_RELEASE);

	__kernel_rseq_unregister();
}

//-----------------------------------------------------------------------
// Writer side : acquire when changing the ready queue, e.g. adding more
//  queues or removing them.
uint_fast32_t ready_mutate_lock( void ) with(*__scheduler_lock) {
	/* paranoid */ verify( ! __preemption_enabled() );

	// Step 1 : lock global lock
	// It is needed to avoid processors that register mid Critical-Section
	//   to simply lock their own lock and enter.
	__atomic_acquire( &write_lock );

	// Make sure we won't deadlock ourselves
	// Checking before acquiring the writer lock isn't safe
	// because someone else could have locked us.
	/* paranoid */ verify( ! kernelTLS().sched_lock );

	// Step 2 : lock per-proc lock
	// Processors that are currently being registered aren't counted
	//   but can't be in read_lock or in the critical section.
	// All other processors are counted
	uint_fast32_t s = ready;
	for(uint_fast32_t i = 0; i < s; i++) {
		volatile bool * llock = data[i];
		if(llock) __atomic_acquire( llock );
	}

	/* paranoid */ verify( ! __preemption_enabled() );
	return s;
}

void ready_mutate_unlock( uint_fast32_t last_s ) with(*__scheduler_lock) {
	/* paranoid */ verify( ! __preemption_enabled() );

	// Step 1 : release local locks
	// This must be done while the global lock is held to avoid
	//   threads that were created mid critical section
	//   to race to lock their local locks and have the writer
	//   immediately unlock them
	// Alternative solution : return s in write_lock and pass it to write_unlock
	for(uint_fast32_t i = 0; i < last_s; i++) {
		volatile bool * llock = data[i];
		if(llock) __atomic_store_n(llock, (bool)false, __ATOMIC_RELEASE);
	}

	// Step 2 : release global lock
	/*paranoid*/ assert(true == write_lock);
	__atomic_store_n(&write_lock, (bool)false, __ATOMIC_RELEASE);

	/* paranoid */ verify( ! __preemption_enabled() );
}

//=======================================================================
// caches handling

struct __attribute__((aligned(128))) __ready_queue_caches_t {
	// Count States:
	// - 0  : No one is looking after this cache
	// - 1  : No one is looking after this cache, BUT it's not empty
	// - 2+ : At least one processor is looking after this cache
	volatile unsigned count;
};

void  ?{}(__ready_queue_caches_t & this) { this.count = 0; }
void ^?{}(__ready_queue_caches_t & this) {}

static inline void depart(__ready_queue_caches_t & cache) {
	/* paranoid */ verify( cache.count > 1);
	__atomic_fetch_add(&cache.count, -1, __ATOMIC_SEQ_CST);
	/* paranoid */ verify( cache.count != 0);
	/* paranoid */ verify( cache.count < 65536 ); // This verify assumes no cluster will have more than 65000 kernel threads mapped to a single cache, which could be correct but is super weird.
}

static inline void arrive(__ready_queue_caches_t & cache) {
	// for() {
	//      unsigned expected = cache.count;
	//      unsigned desired  = 0 == expected ? 2 : expected + 1;
	// }
}
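// Note: arrive() is currently a stub; the commented-out loop above sketches the intended
// update (first arrival moves the count from 0 to 2, later arrivals add 1), matching the
// count states documented in __ready_queue_caches_t.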

//=======================================================================
// Cforall Ready Queue used for scheduling
//=======================================================================
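// moving_average() computes an exponential moving average of lane "age" in timestamp ticks:
// the new sample (currtsc - instsc) gets weight 4/16 and the previous average 12/16.
// For example, old_avg = 800 and a new sample of 1600 give (4*1600 + 12*800)/16 = 1000.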
unsigned long long moving_average(unsigned long long currtsc, unsigned long long instsc, unsigned long long old_avg) {
	/* paranoid */ verifyf( currtsc < 45000000000000000, "Suspiciously large current time: %'llu (%llx)\n", currtsc, currtsc );
	/* paranoid */ verifyf( instsc  < 45000000000000000, "Suspiciously large insert time: %'llu (%llx)\n", instsc, instsc );
	/* paranoid */ verifyf( old_avg < 15000000000000, "Suspiciously large previous average: %'llu (%llx)\n", old_avg, old_avg );

	const unsigned long long new_val = currtsc > instsc ? currtsc - instsc : 0;
	const unsigned long long total_weight = 16;
	const unsigned long long new_weight   = 4;
	const unsigned long long old_weight = total_weight - new_weight;
	const unsigned long long ret = ((new_weight * new_val) + (old_weight * old_avg)) / total_weight;
	return ret;
}

void ?{}(__ready_queue_t & this) with (this) {
	#if defined(USE_CPU_WORK_STEALING)
		lanes.count = cpu_info.hthrd_count * READYQ_SHARD_FACTOR;
		lanes.data = alloc( lanes.count );
		lanes.tscs = alloc( lanes.count );
		lanes.help = alloc( cpu_info.hthrd_count );

		for( idx; (size_t)lanes.count ) {
			(lanes.data[idx]){};
			lanes.tscs[idx].tv = rdtscl();
			lanes.tscs[idx].ma = rdtscl();
		}
		for( idx; (size_t)cpu_info.hthrd_count ) {
			lanes.help[idx].src = 0;
			lanes.help[idx].dst = 0;
			lanes.help[idx].tri = 0;
		}
	#else
		lanes.data   = 0p;
		lanes.tscs   = 0p;
		lanes.caches = 0p;
		lanes.help   = 0p;
		lanes.count  = 0;
	#endif
}

void ^?{}(__ready_queue_t & this) with (this) {
	#if !defined(USE_CPU_WORK_STEALING)
		verify( SEQUENTIAL_SHARD == lanes.count );
	#endif

	free(lanes.data);
	free(lanes.tscs);
	free(lanes.caches);
	free(lanes.help);
}

//-----------------------------------------------------------------------
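// Scheduling strategy implementations.  Each strategy provides push(), pop_fast(), pop_slow()
// and pop_search() for a cluster; only the strategy selected by the #defines at the top of
// this file is compiled in.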
#if defined(USE_AWARE_STEALING)
	__attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, unpark_hint hint) with (cltr->ready_queue) {
		processor * const proc = kernelTLS().this_processor;
		const bool external = (!proc) || (cltr != proc->cltr);
		const bool remote   = hint == UNPARK_REMOTE;

		unsigned i;
		if( external || remote ) {
			// Figure out where thread was last time and make sure it's valid
			/* paranoid */ verify(thrd->preferred >= 0);
			if(thrd->preferred * READYQ_SHARD_FACTOR < lanes.count) {
				/* paranoid */ verify(thrd->preferred * READYQ_SHARD_FACTOR < lanes.count);
				unsigned start = thrd->preferred * READYQ_SHARD_FACTOR;
				do {
					unsigned r = __tls_rand();
					i = start + (r % READYQ_SHARD_FACTOR);
					/* paranoid */ verify( i < lanes.count );
					// If we can't lock it retry
				} while( !__atomic_try_acquire( &lanes.data[i].lock ) );
			} else {
				do {
					i = __tls_rand() % lanes.count;
				} while( !__atomic_try_acquire( &lanes.data[i].lock ) );
			}
		} else {
			do {
				unsigned r = proc->rdq.its++;
				i = proc->rdq.id + (r % READYQ_SHARD_FACTOR);
				/* paranoid */ verify( i < lanes.count );
				// If we can't lock it retry
			} while( !__atomic_try_acquire( &lanes.data[i].lock ) );
		}

		// Actually push it
		push(lanes.data[i], thrd);

		// Unlock and return
		__atomic_unlock( &lanes.data[i].lock );

		#if !defined(__CFA_NO_STATISTICS__)
			if(unlikely(external || remote)) __atomic_fetch_add(&cltr->stats->ready.push.extrn.success, 1, __ATOMIC_RELAXED);
			else __tls_stats()->ready.push.local.success++;
		#endif
	}

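	// calc_cutoff() returns 1.5x the largest moving-average age among this processor's own
	// READYQ_SHARD_FACTOR lanes ((max + 2 * max) / 2); pop_fast only helps another lane whose
	// age exceeds this cutoff.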
	static inline unsigned long long calc_cutoff(const unsigned long long ctsc, const processor * proc, __ready_queue_t & rdq) {
		unsigned start = proc->rdq.id;
		unsigned long long max = 0;
		for(i; READYQ_SHARD_FACTOR) {
			unsigned long long ptsc = ts(rdq.lanes.data[start + i]);
			if(ptsc != -1ull) {
				/* paranoid */ verify( start + i < rdq.lanes.count );
				unsigned long long tsc = moving_average(ctsc, ptsc, rdq.lanes.tscs[start + i].ma);
				if(tsc > max) max = tsc;
			}
		}
		return (max + 2 * max) / 2;
	}

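	// pop_fast() first refreshes the cache id published for this processor's lanes, then either
	// picks a new help target (a random lane, accepted roughly 3 times out of 256 unconditionally,
	// otherwise only when that lane sits behind the same last-level cache) or, if a target is
	// already set, helps it when its age exceeds the cutoff.  Finally it tries its own lanes.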
	__attribute__((hot)) struct thread$ * pop_fast(struct cluster * cltr) with (cltr->ready_queue) {
		/* paranoid */ verify( lanes.count > 0 );
		/* paranoid */ verify( kernelTLS().this_processor );
		/* paranoid */ verify( kernelTLS().this_processor->rdq.id < lanes.count );

		processor * const proc = kernelTLS().this_processor;
		unsigned this = proc->rdq.id;
		/* paranoid */ verify( this < lanes.count );
		__cfadbg_print_safe(ready_queue, "Kernel : pop from %u\n", this);

		// Figure out the current cpu and make sure it is valid
		const int cpu = __kernel_getcpu();
		/* paranoid */ verify(cpu >= 0);
		/* paranoid */ verify(cpu < cpu_info.hthrd_count);
		unsigned this_cache = cpu_info.llc_map[cpu].cache;

		// Super important: don't write the same value over and over again
		// We want to maximise our chances that this particular value stays in cache
		if(lanes.caches[this / READYQ_SHARD_FACTOR].id != this_cache)
			__atomic_store_n(&lanes.caches[this / READYQ_SHARD_FACTOR].id, this_cache, __ATOMIC_RELAXED);

		const unsigned long long ctsc = rdtscl();

		if(proc->rdq.target == MAX) {
			uint64_t chaos = __tls_rand();
			unsigned ext = chaos & 0xff;
			unsigned other  = (chaos >> 8) % (lanes.count);

			if(ext < 3 || __atomic_load_n(&lanes.caches[other / READYQ_SHARD_FACTOR].id, __ATOMIC_RELAXED) == this_cache) {
				proc->rdq.target = other;
			}
		}
		else {
			const unsigned target = proc->rdq.target;
			__cfadbg_print_safe(ready_queue, "Kernel : %u considering helping %u, tcsc %llu\n", this, target, lanes.tscs[target].tv);
			/* paranoid */ verify( lanes.tscs[target].tv != MAX );
			if(target < lanes.count) {
				const unsigned long long cutoff = calc_cutoff(ctsc, proc, cltr->ready_queue);
				const unsigned long long age = moving_average(ctsc, lanes.tscs[target].tv, lanes.tscs[target].ma);
				__cfadbg_print_safe(ready_queue, "Kernel : Help attempt on %u from %u, age %'llu vs cutoff %'llu, %s\n", target, this, age, cutoff, age > cutoff ? "yes" : "no");
				if(age > cutoff) {
					thread$ * t = try_pop(cltr, target __STATS(, __tls_stats()->ready.pop.help));
					if(t) return t;
				}
			}
			proc->rdq.target = MAX;
		}

		for(READYQ_SHARD_FACTOR) {
			unsigned i = this + (proc->rdq.itr++ % READYQ_SHARD_FACTOR);
			if(thread$ * t = try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.local))) return t;
		}

		// All lanes were empty, return 0p
		return 0p;

	}
	__attribute__((hot)) struct thread$ * pop_slow(struct cluster * cltr) with (cltr->ready_queue) {
		unsigned i = __tls_rand() % lanes.count;
		return try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.steal));
	}
	__attribute__((hot)) struct thread$ * pop_search(struct cluster * cltr) {
		return search(cltr);
	}
#endif
#if defined(USE_CPU_WORK_STEALING)
	__attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, unpark_hint hint) with (cltr->ready_queue) {
		__cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr);

		processor * const proc = kernelTLS().this_processor;
		const bool external = (!proc) || (cltr != proc->cltr);

		// Figure out the current cpu and make sure it is valid
		const int cpu = __kernel_getcpu();
		/* paranoid */ verify(cpu >= 0);
		/* paranoid */ verify(cpu < cpu_info.hthrd_count);
		/* paranoid */ verify(cpu * READYQ_SHARD_FACTOR < lanes.count);

		// Figure out where thread was last time and make sure it's valid
		/* paranoid */ verify(thrd->preferred >= 0);
		/* paranoid */ verify(thrd->preferred < cpu_info.hthrd_count);
		/* paranoid */ verify(thrd->preferred * READYQ_SHARD_FACTOR < lanes.count);
		const int prf = thrd->preferred * READYQ_SHARD_FACTOR;

		const cpu_map_entry_t & map;
		choose(hint) {
			case UNPARK_LOCAL : &map = &cpu_info.llc_map[cpu];
			case UNPARK_REMOTE: &map = &cpu_info.llc_map[prf];
		}
		/* paranoid */ verify(map.start * READYQ_SHARD_FACTOR < lanes.count);
		/* paranoid */ verify(map.self * READYQ_SHARD_FACTOR < lanes.count);
		/* paranoid */ verifyf((map.start + map.count) * READYQ_SHARD_FACTOR <= lanes.count, "have %zu lanes but map can go up to %u", lanes.count, (map.start + map.count) * READYQ_SHARD_FACTOR);

		const int start = map.self * READYQ_SHARD_FACTOR;
		unsigned i;
		do {
			unsigned r;
			if(unlikely(external)) { r = __tls_rand(); }
			else { r = proc->rdq.its++; }
			choose(hint) {
				case UNPARK_LOCAL : i = start + (r % READYQ_SHARD_FACTOR);
				case UNPARK_REMOTE: i = prf   + (r % READYQ_SHARD_FACTOR);
			}
			// If we can't lock it retry
		} while( !__atomic_try_acquire( &lanes.data[i].lock ) );

		// Actually push it
		push(lanes.data[i], thrd);

		// Unlock and return
		__atomic_unlock( &lanes.data[i].lock );

		#if !defined(__CFA_NO_STATISTICS__)
			if(unlikely(external)) __atomic_fetch_add(&cltr->stats->ready.push.extrn.success, 1, __ATOMIC_RELAXED);
			else __tls_stats()->ready.push.local.success++;
		#endif

		__cfadbg_print_safe(ready_queue, "Kernel : Pushed %p on cluster %p (idx: %u, mask %llu, first %d)\n", thrd, cltr, i, used.mask[0], lane_first);

	}

	// Pop from the ready queue from a given cluster
	__attribute__((hot)) thread$ * pop_fast(struct cluster * cltr) with (cltr->ready_queue) {
		/* paranoid */ verify( lanes.count > 0 );
		/* paranoid */ verify( kernelTLS().this_processor );

		processor * const proc = kernelTLS().this_processor;
		const int cpu = __kernel_getcpu();
		/* paranoid */ verify(cpu >= 0);
		/* paranoid */ verify(cpu < cpu_info.hthrd_count);
		/* paranoid */ verify(cpu * READYQ_SHARD_FACTOR < lanes.count);

		const cpu_map_entry_t & map = cpu_info.llc_map[cpu];
		/* paranoid */ verify(map.start * READYQ_SHARD_FACTOR < lanes.count);
		/* paranoid */ verify(map.self * READYQ_SHARD_FACTOR < lanes.count);
		/* paranoid */ verifyf((map.start + map.count) * READYQ_SHARD_FACTOR <= lanes.count, "have %zu lanes but map can go up to %u", lanes.count, (map.start + map.count) * READYQ_SHARD_FACTOR);

		const int start = map.self * READYQ_SHARD_FACTOR;
		const unsigned long long ctsc = rdtscl();

		// Did we already have a help target
		if(proc->rdq.target == MAX) {
			unsigned long long max = 0;
			for(i; READYQ_SHARD_FACTOR) {
				unsigned long long tsc = moving_average(ctsc, ts(lanes.data[start + i]), lanes.tscs[start + i].ma);
				if(tsc > max) max = tsc;
			}
			//  proc->rdq.cutoff = (max + 2 * max) / 2;
			/* paranoid */ verify(lanes.count < 65536); // The following code assumes max 65536 cores.
			/* paranoid */ verify(map.count < 65536); // The following code assumes max 65536 cores.

			if(0 == (__tls_rand() % 100)) {
				proc->rdq.target = __tls_rand() % lanes.count;
			} else {
				unsigned cpu_chaos = map.start + (__tls_rand() % map.count);
				proc->rdq.target = (cpu_chaos * READYQ_SHARD_FACTOR) + (__tls_rand() % READYQ_SHARD_FACTOR);
				/* paranoid */ verify(proc->rdq.target >= (map.start * READYQ_SHARD_FACTOR));
				/* paranoid */ verify(proc->rdq.target <  ((map.start + map.count) * READYQ_SHARD_FACTOR));
			}

			/* paranoid */ verify(proc->rdq.target != MAX);
		}
		else {
			unsigned long long max = 0;
			for(i; READYQ_SHARD_FACTOR) {
				unsigned long long tsc = moving_average(ctsc, ts(lanes.data[start + i]), lanes.tscs[start + i].ma);
				if(tsc > max) max = tsc;
			}
			const unsigned long long cutoff = (max + 2 * max) / 2;
			{
				unsigned target = proc->rdq.target;
				proc->rdq.target = MAX;
				lanes.help[target / READYQ_SHARD_FACTOR].tri++;
				if(moving_average(ctsc, lanes.tscs[target].tv, lanes.tscs[target].ma) > cutoff) {
					thread$ * t = try_pop(cltr, target __STATS(, __tls_stats()->ready.pop.help));
					proc->rdq.last = target;
					if(t) return t;
				}
				proc->rdq.target = MAX;
			}

			unsigned last = proc->rdq.last;
			if(last != MAX && moving_average(ctsc, lanes.tscs[last].tv, lanes.tscs[last].ma) > cutoff) {
				thread$ * t = try_pop(cltr, last __STATS(, __tls_stats()->ready.pop.help));
				if(t) return t;
			}
			else {
				proc->rdq.last = MAX;
			}
		}

		for(READYQ_SHARD_FACTOR) {
			unsigned i = start + (proc->rdq.itr++ % READYQ_SHARD_FACTOR);
			if(thread$ * t = try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.local))) return t;
		}

		// All lanes were empty, return 0p
		return 0p;
	}

	__attribute__((hot)) struct thread$ * pop_slow(struct cluster * cltr) with (cltr->ready_queue) {
		processor * const proc = kernelTLS().this_processor;
		unsigned last = proc->rdq.last;
		if(last != MAX) {
			struct thread$ * t = try_pop(cltr, last __STATS(, __tls_stats()->ready.pop.steal));
			if(t) return t;
			proc->rdq.last = MAX;
		}

		unsigned i = __tls_rand() % lanes.count;
		return try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.steal));
	}
	__attribute__((hot)) struct thread$ * pop_search(struct cluster * cltr) {
		return search(cltr);
	}
#endif
#if defined(USE_RELAXED_FIFO)
	//-----------------------------------------------------------------------
	// get index from random number with or without bias towards queues
	static inline [unsigned, bool] idx_from_r(unsigned r, unsigned preferred) {
		unsigned i;
		bool local;
		unsigned rlow  = r % BIAS;
		unsigned rhigh = r / BIAS;
		if((0 != rlow) && preferred >= 0) {
			// (BIAS - 1) out of BIAS chances
			// Use preferred queues
			i = preferred + (rhigh % READYQ_SHARD_FACTOR);
			local = true;
		}
		else {
			// 1 out of BIAS chances
			// Use all queues
			i = rhigh;
			local = false;
		}
		return [i, local];
	}

	__attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, unpark_hint hint) with (cltr->ready_queue) {
		__cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr);

		const bool external = (hint != UNPARK_LOCAL) || (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr);
		/* paranoid */ verify(external || kernelTLS().this_processor->rdq.id < lanes.count );

		bool local;
		int preferred = external ? -1 : kernelTLS().this_processor->rdq.id;

		// Try to pick a lane and lock it
		unsigned i;
		do {
			// Pick the index of a lane
			unsigned r = __tls_rand_fwd();
			[i, local] = idx_from_r(r, preferred);

			i %= __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );

			#if !defined(__CFA_NO_STATISTICS__)
				if(unlikely(external)) __atomic_fetch_add(&cltr->stats->ready.push.extrn.attempt, 1, __ATOMIC_RELAXED);
				else if(local) __tls_stats()->ready.push.local.attempt++;
				else __tls_stats()->ready.push.share.attempt++;
			#endif

			// If we can't lock it retry
		} while( !__atomic_try_acquire( &lanes.data[i].lock ) );

		// Actually push it
		push(lanes.data[i], thrd);

		// Unlock and return
		__atomic_unlock( &lanes.data[i].lock );

		// Mark the current index in the tls rng instance as having an item
		__tls_rand_advance_bck();

		__cfadbg_print_safe(ready_queue, "Kernel : Pushed %p on cluster %p (idx: %u, mask %llu, first %d)\n", thrd, cltr, i, used.mask[0], lane_first);

		// Update statistics
		#if !defined(__CFA_NO_STATISTICS__)
			if(unlikely(external)) __atomic_fetch_add(&cltr->stats->ready.push.extrn.success, 1, __ATOMIC_RELAXED);
			else if(local) __tls_stats()->ready.push.local.success++;
			else __tls_stats()->ready.push.share.success++;
		#endif
	}

	// Pop from the ready queue from a given cluster
	__attribute__((hot)) thread$ * pop_fast(struct cluster * cltr) with (cltr->ready_queue) {
		/* paranoid */ verify( lanes.count > 0 );
		/* paranoid */ verify( kernelTLS().this_processor );
		/* paranoid */ verify( kernelTLS().this_processor->rdq.id < lanes.count );

		unsigned count = __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
		int preferred = kernelTLS().this_processor->rdq.id;


		// As long as the list is not empty, try finding a lane that isn't empty and pop from it
		for(25) {
			// Pick two lists at random
			unsigned ri = __tls_rand_bck();
			unsigned rj = __tls_rand_bck();

			unsigned i, j;
			__attribute__((unused)) bool locali, localj;
			[i, locali] = idx_from_r(ri, preferred);
			[j, localj] = idx_from_r(rj, preferred);

			i %= count;
			j %= count;

			// try popping from the 2 picked lists
			struct thread$ * thrd = try_pop(cltr, i, j __STATS(, *(locali || localj ? &__tls_stats()->ready.pop.local : &__tls_stats()->ready.pop.help)));
			if(thrd) {
				return thrd;
			}
		}

		// All lanes were empty, return 0p
		return 0p;
	}

	__attribute__((hot)) struct thread$ * pop_slow(struct cluster * cltr) { return pop_fast(cltr); }
	__attribute__((hot)) struct thread$ * pop_search(struct cluster * cltr) {
		return search(cltr);
	}
#endif
#if defined(USE_WORK_STEALING)
	__attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, unpark_hint hint) with (cltr->ready_queue) {
		__cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr);

		// #define USE_PREFERRED
		#if !defined(USE_PREFERRED)
			const bool external = (hint != UNPARK_LOCAL) || (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr);
			/* paranoid */ verify(external || kernelTLS().this_processor->rdq.id < lanes.count );
		#else
			unsigned preferred = thrd->preferred;
			const bool external = (hint != UNPARK_LOCAL) || (!kernelTLS().this_processor) || preferred == MAX || thrd->curr_cluster != cltr;
			/* paranoid */ verifyf(external || preferred < lanes.count, "Invalid preferred queue %u for %u lanes", preferred, lanes.count );

			unsigned r = preferred % READYQ_SHARD_FACTOR;
			const unsigned start = preferred - r;
		#endif

		// Try to pick a lane and lock it
		unsigned i;
		do {
			#if !defined(__CFA_NO_STATISTICS__)
				if(unlikely(external)) __atomic_fetch_add(&cltr->stats->ready.push.extrn.attempt, 1, __ATOMIC_RELAXED);
				else __tls_stats()->ready.push.local.attempt++;
			#endif

			if(unlikely(external)) {
				i = __tls_rand() % lanes.count;
			}
			else {
				#if !defined(USE_PREFERRED)
					processor * proc = kernelTLS().this_processor;
					unsigned r = proc->rdq.its++;
					i =  proc->rdq.id + (r % READYQ_SHARD_FACTOR);
				#else
					i = start + (r++ % READYQ_SHARD_FACTOR);
				#endif
			}
			// If we can't lock it retry
		} while( !__atomic_try_acquire( &lanes.data[i].lock ) );

		// Actually push it
		push(lanes.data[i], thrd);

		// Unlock and return
		__atomic_unlock( &lanes.data[i].lock );

		#if !defined(__CFA_NO_STATISTICS__)
			if(unlikely(external)) __atomic_fetch_add(&cltr->stats->ready.push.extrn.success, 1, __ATOMIC_RELAXED);
			else __tls_stats()->ready.push.local.success++;
		#endif

		__cfadbg_print_safe(ready_queue, "Kernel : Pushed %p on cluster %p (idx: %u, mask %llu, first %d)\n", thrd, cltr, i, used.mask[0], lane_first);
	}

	// Pop from the ready queue from a given cluster
	__attribute__((hot)) thread$ * pop_fast(struct cluster * cltr) with (cltr->ready_queue) {
		/* paranoid */ verify( lanes.count > 0 );
		/* paranoid */ verify( kernelTLS().this_processor );
		/* paranoid */ verify( kernelTLS().this_processor->rdq.id < lanes.count );

		processor * proc = kernelTLS().this_processor;

		if(proc->rdq.target == MAX) {
			unsigned long long min = ts(lanes.data[proc->rdq.id]);
			for(int i = 0; i < READYQ_SHARD_FACTOR; i++) {
				unsigned long long tsc = ts(lanes.data[proc->rdq.id + i]);
				if(tsc < min) min = tsc;
			}
			proc->rdq.cutoff = min;
			proc->rdq.target = __tls_rand() % lanes.count;
		}
		else {
			unsigned target = proc->rdq.target;
			proc->rdq.target = MAX;
			const unsigned long long bias = 0; //2_500_000_000;
			const unsigned long long cutoff = proc->rdq.cutoff > bias ? proc->rdq.cutoff - bias : proc->rdq.cutoff;
			if(lanes.tscs[target].tv < cutoff && ts(lanes.data[target]) < cutoff) {
				thread$ * t = try_pop(cltr, target __STATS(, __tls_stats()->ready.pop.help));
				if(t) return t;
			}
		}

		for(READYQ_SHARD_FACTOR) {
			unsigned i = proc->rdq.id + (proc->rdq.itr++ % READYQ_SHARD_FACTOR);
			if(thread$ * t = try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.local))) return t;
		}
		return 0p;
	}

	__attribute__((hot)) struct thread$ * pop_slow(struct cluster * cltr) with (cltr->ready_queue) {
		unsigned i = __tls_rand() % lanes.count;
		return try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.steal));
	}

	__attribute__((hot)) struct thread$ * pop_search(struct cluster * cltr) with (cltr->ready_queue) {
		return search(cltr);
	}
#endif

//=======================================================================
// Various Ready Queue utilities
//=======================================================================
// these functions work the same or almost the same
// whether they are using work-stealing or relaxed fifo scheduling

//-----------------------------------------------------------------------
// try to pop from a lane given by index w
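// try_pop never blocks: it returns 0p if the lane looks empty or its lock is contended, and
// callers simply move on to another lane.  On success it also refreshes the lane's
// timestamp/moving-average bookkeeping and records the lane as the thread's preferred shard.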
static inline struct thread$ * try_pop(struct cluster * cltr, unsigned w __STATS(, __stats_readyQ_pop_t & stats)) with (cltr->ready_queue) {
	/* paranoid */ verify( w < lanes.count );
	__STATS( stats.attempt++; )

	// Get relevant elements locally
	__intrusive_lane_t & lane = lanes.data[w];

	// If list looks empty retry
	if( is_empty(lane) ) {
		return 0p;
	}

	// If we can't get the lock retry
	if( !__atomic_try_acquire(&lane.lock) ) {
		return 0p;
	}

	// If list is empty, unlock and retry
	if( is_empty(lane) ) {
		__atomic_unlock(&lane.lock);
		return 0p;
	}

	// Actually pop the list
	struct thread$ * thrd;
	#if defined(USE_AWARE_STEALING) || defined(USE_WORK_STEALING) || defined(USE_CPU_WORK_STEALING)
		unsigned long long tsc_before = ts(lane);
	#endif
	unsigned long long tsv;
	[thrd, tsv] = pop(lane);

	/* paranoid */ verify(thrd);
	/* paranoid */ verify(tsv);
	/* paranoid */ verify(lane.lock);

	// Unlock and return
	__atomic_unlock(&lane.lock);

	// Update statistics
	__STATS( stats.success++; )

	#if defined(USE_AWARE_STEALING) || defined(USE_WORK_STEALING) || defined(USE_CPU_WORK_STEALING)
		if (tsv != MAX) {
			unsigned long long now = rdtscl();
			unsigned long long pma = __atomic_load_n(&lanes.tscs[w].ma, __ATOMIC_RELAXED);
			__atomic_store_n(&lanes.tscs[w].tv, tsv, __ATOMIC_RELAXED);
			__atomic_store_n(&lanes.tscs[w].ma, moving_average(now, tsc_before, pma), __ATOMIC_RELAXED);
		}
	#endif

	#if defined(USE_AWARE_STEALING) || defined(USE_CPU_WORK_STEALING)
		thrd->preferred = w / READYQ_SHARD_FACTOR;
	#else
		thrd->preferred = w;
	#endif

	// return the popped thread
	return thrd;
}

//-----------------------------------------------------------------------
// try to pop from any lanes making sure you don't miss any threads pushed
// before the start of the function
static inline struct thread$ * search(struct cluster * cltr) with (cltr->ready_queue) {
	/* paranoid */ verify( lanes.count > 0 );
	unsigned count = __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
	unsigned offset = __tls_rand();
	for(i; count) {
		unsigned idx = (offset + i) % count;
		struct thread$ * thrd = try_pop(cltr, idx __STATS(, __tls_stats()->ready.pop.search));
		if(thrd) {
			return thrd;
		}
	}

	// All lanes were empty, return 0p
	return 0p;
}

//-----------------------------------------------------------------------
// get preferred ready for new thread
unsigned ready_queue_new_preferred() {
	unsigned pref = 0;
	if(struct thread$ * thrd = publicTLS_get( this_thread )) {
		pref = thrd->preferred;
	}
	else {
		#if defined(USE_CPU_WORK_STEALING)
			pref = __kernel_getcpu();
		#endif
	}

	#if defined(USE_CPU_WORK_STEALING)
		/* paranoid */ verify(pref >= 0);
		/* paranoid */ verify(pref < cpu_info.hthrd_count);
	#endif

	return pref;
}

//-----------------------------------------------------------------------
// Check that all the intrusive queues in the data structure are still consistent
static void check( __ready_queue_t & q ) with (q) {
	#if defined(__CFA_WITH_VERIFY__)
		{
			for( idx ; lanes.count ) {
				__intrusive_lane_t & sl = lanes.data[idx];
				assert(!lanes.data[idx].lock);

				if(is_empty(sl)) {
					assert( sl.anchor.next == 0p );
					assert( sl.anchor.ts   == -1llu );
					assert( mock_head(sl)  == sl.prev );
				} else {
					assert( sl.anchor.next != 0p );
					assert( sl.anchor.ts   != -1llu );
					assert( mock_head(sl)  != sl.prev );
				}
			}
		}
	#endif
}

//-----------------------------------------------------------------------
// Given 2 indexes, pick the list with the oldest push and try to pop from it
static inline struct thread$ * try_pop(struct cluster * cltr, unsigned i, unsigned j __STATS(, __stats_readyQ_pop_t & stats)) with (cltr->ready_queue) {
	// Pick the best list
	int w = i;
	if( __builtin_expect(!is_empty(lanes.data[j]), true) ) {
		w = (ts(lanes.data[i]) < ts(lanes.data[j])) ? i : j;
	}

	return try_pop(cltr, w __STATS(, stats));
}

// Call this function if the intrusive list was moved using memcpy
// fixes the list so that the pointers back to anchors aren't left dangling
static inline void fix(__intrusive_lane_t & ll) {
	if(is_empty(ll)) {
		verify(ll.anchor.next == 0p);
		ll.prev = mock_head(ll);
	}
}

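// assign_list() hands out lane ids: walking the given processor list, each processor receives
// a contiguous block of READYQ_SHARD_FACTOR lanes starting at 'value' and its help target is
// reset.  reassign_cltr_id() applies this to the active processors first, then the idle ones.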
static void assign_list(unsigned & value, dlist(processor) & list, unsigned count) {
	processor * it = &list`first;
	for(unsigned i = 0; i < count; i++) {
		/* paranoid */ verifyf( it, "Unexpected null iterator, at index %u of %u\n", i, count);
		it->rdq.id = value;
		it->rdq.target = MAX;
		value += READYQ_SHARD_FACTOR;
		it = &(*it)`next;
	}
}

static void reassign_cltr_id(struct cluster * cltr) {
	unsigned preferred = 0;
	assign_list(preferred, cltr->procs.actives, cltr->procs.total - cltr->procs.idle);
	assign_list(preferred, cltr->procs.idles  , cltr->procs.idle );
}

static void fix_times( struct cluster * cltr ) with( cltr->ready_queue ) {
	#if defined(USE_AWARE_STEALING) || defined(USE_WORK_STEALING)
		lanes.tscs = alloc(lanes.count, lanes.tscs`realloc);
		for(i; lanes.count) {
			lanes.tscs[i].tv = rdtscl();
			lanes.tscs[i].ma = 0;
		}
	#endif
}

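// Both ready_queue_grow and ready_queue_shrink must be called with the cluster-wide writer
// lock held (ready_mutate_lock above), hence the ready_mutate_islocked() checks.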
| [12daa43] | 993 | #if defined(USE_CPU_WORK_STEALING) | 
|---|
|  | 994 | // ready_queue size is fixed in this case | 
|---|
|  | 995 | void ready_queue_grow(struct cluster * cltr) {} | 
|---|
|  | 996 | void ready_queue_shrink(struct cluster * cltr) {} | 
|---|
|  | 997 | #else | 
|---|
|  | 998 | // Grow the ready queue | 
|---|
|  | 999 | void ready_queue_grow(struct cluster * cltr) { | 
|---|
|  | 1000 | size_t ncount; | 
|---|
|  | 1001 | int target = cltr->procs.total; | 
|---|
|  | 1002 |  | 
|---|
|  | 1003 | /* paranoid */ verify( ready_mutate_islocked() ); | 
|---|
|  | 1004 | __cfadbg_print_safe(ready_queue, "Kernel : Growing ready queue\n"); | 
|---|
|  | 1005 |  | 
|---|
|  | 1006 | // Make sure that everything is consistent | 
|---|
|  | 1007 | /* paranoid */ check( cltr->ready_queue ); | 
|---|
|  | 1008 |  | 
|---|
|  | 1009 | // grow the ready queue | 
|---|
|  | 1010 | with( cltr->ready_queue ) { | 
|---|
|  | 1011 | // Find new count | 
|---|
|  | 1012 | // Make sure we always have at least 1 list | 
|---|
|  | 1013 | if(target >= 2) { | 
|---|
|  | 1014 | ncount = target * READYQ_SHARD_FACTOR; | 
|---|
|  | 1015 | } else { | 
|---|
|  | 1016 | ncount = SEQUENTIAL_SHARD; | 
|---|
|  | 1017 | } | 
|---|
| [b798713] | 1018 |  | 
|---|
| [12daa43] | 1019 | // Allocate new array (uses realloc and memcpies the data) | 
|---|
|  | 1020 | lanes.data = alloc( ncount, lanes.data`realloc ); | 
|---|
| [b798713] | 1021 |  | 
|---|
| [12daa43] | 1022 | // Fix the moved data | 
|---|
|  | 1023 | for( idx; (size_t)lanes.count ) { | 
|---|
|  | 1024 | fix(lanes.data[idx]); | 
|---|
|  | 1025 | } | 
|---|
| [b798713] | 1026 |  | 
|---|
| [12daa43] | 1027 | // Construct new data | 
|---|
|  | 1028 | for( idx; (size_t)lanes.count ~ ncount) { | 
|---|
|  | 1029 | (lanes.data[idx]){}; | 
|---|
|  | 1030 | } | 
|---|
| [b798713] | 1031 |  | 
|---|
| [12daa43] | 1032 | // Update original | 
|---|
|  | 1033 | lanes.count = ncount; | 
|---|
| [a2a4566] | 1034 |  | 
|---|
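|  |  | // Resize the cache-id helper array to one entry per processor (presumably consumed by the cache-aware stealing heuristic) | 
|---|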
|  | 1035 | lanes.caches = alloc( target, lanes.caches`realloc ); | 
|---|
| [12daa43] | 1036 | } | 
|---|
| [b798713] | 1037 |  | 
|---|
| [12daa43] | 1038 | fix_times(cltr); | 
|---|
| [9cc3a18] | 1039 |  | 
|---|
| [12daa43] | 1040 | reassign_cltr_id(cltr); | 
|---|
| [a017ee7] | 1041 |  | 
|---|
| [12daa43] | 1042 | // Make sure that everything is consistent | 
|---|
|  | 1043 | /* paranoid */ check( cltr->ready_queue ); | 
|---|
| [dca5802] | 1044 |  | 
|---|
| [12daa43] | 1045 | __cfadbg_print_safe(ready_queue, "Kernel : Growing ready queue done\n"); | 
|---|
| [dca5802] | 1046 |  | 
|---|
| [12daa43] | 1047 | /* paranoid */ verify( ready_mutate_islocked() ); | 
|---|
|  | 1048 | } | 
|---|
| [b798713] | 1049 |  | 
|---|
| [12daa43] | 1050 | // Shrink the ready queue | 
|---|
|  | 1051 | void ready_queue_shrink(struct cluster * cltr) { | 
|---|
|  | 1052 | /* paranoid */ verify( ready_mutate_islocked() ); | 
|---|
|  | 1053 | __cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue\n"); | 
|---|
| [dca5802] | 1054 |  | 
|---|
| [12daa43] | 1055 | // Make sure that everything is consistent | 
|---|
|  | 1056 | /* paranoid */ check( cltr->ready_queue ); | 
|---|
| [dca5802] | 1057 |  | 
|---|
| [12daa43] | 1058 | int target = cltr->procs.total; | 
|---|
| [a017ee7] | 1059 |  | 
|---|
| [12daa43] | 1060 | with( cltr->ready_queue ) { | 
|---|
|  | 1061 | // Remember old count | 
|---|
|  | 1062 | size_t ocount = lanes.count; | 
|---|
| [b798713] | 1063 |  | 
|---|
| [12daa43] | 1064 | // Find new count | 
|---|
|  | 1065 | // Make sure we always have at least 1 list | 
|---|
|  | 1066 | lanes.count = target >= 2 ? target * READYQ_SHARD_FACTOR : SEQUENTIAL_SHARD; | 
|---|
|  | 1067 | /* paranoid */ verify( ocount >= lanes.count ); | 
|---|
|  | 1068 | /* paranoid */ verify( lanes.count == target * READYQ_SHARD_FACTOR || target < 2 ); | 
|---|
| [dca5802] | 1069 |  | 
|---|
| [12daa43] | 1070 | // for printing, count the number of displaced threads | 
|---|
|  | 1071 | #if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_READY_QUEUE__) | 
|---|
|  | 1072 | __attribute__((unused)) size_t displaced = 0; | 
|---|
|  | 1073 | #endif | 
|---|
| [b798713] | 1074 |  | 
|---|
| [12daa43] | 1075 | // redistribute old data | 
|---|
|  | 1076 | for( idx; (size_t)lanes.count ~ ocount) { | 
|---|
|  | 1077 | // Lock is not strictly needed but makes checking invariants much easier | 
|---|
|  | 1078 | __attribute__((unused)) bool locked = __atomic_try_acquire(&lanes.data[idx].lock); | 
|---|
|  | 1079 | verify(locked); | 
|---|
| [dca5802] | 1080 |  | 
|---|
| [12daa43] | 1081 | // As long as this lane is not empty, pop its threads and push them somewhere else in the queue | 
|---|
|  | 1082 | while(!is_empty(lanes.data[idx])) { | 
|---|
| [e84ab3d] | 1083 | struct thread$ * thrd; | 
|---|
| [12daa43] | 1084 | unsigned long long _; | 
|---|
|  | 1085 | [thrd, _] = pop(lanes.data[idx]); | 
|---|
| [dca5802] | 1086 |  | 
|---|
| [12daa43] | 1087 | push(cltr, thrd, true); | 
|---|
| [dca5802] | 1088 |  | 
|---|
| [12daa43] | 1089 | // for printing, count the number of displaced threads | 
|---|
|  | 1090 | #if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_READY_QUEUE__) | 
|---|
|  | 1091 | displaced++; | 
|---|
|  | 1092 | #endif | 
|---|
|  | 1093 | } | 
|---|
| [b798713] | 1094 |  | 
|---|
| [12daa43] | 1095 | // Unlock the lane | 
|---|
|  | 1096 | __atomic_unlock(&lanes.data[idx].lock); | 
|---|
| [b798713] | 1097 |  | 
|---|
| [12daa43] | 1098 | // TODO print the queue statistics here | 
|---|
| [b798713] | 1099 |  | 
|---|
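|  |  | // Destroy the now-empty lane before the array is shrunk | 
|---|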
| [12daa43] | 1100 | ^(lanes.data[idx]){}; | 
|---|
|  | 1101 | } | 
|---|
| [b798713] | 1102 |  | 
|---|
| [12daa43] | 1103 | __cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue displaced %zu threads\n", displaced); | 
|---|
| [c84b4be] | 1104 |  | 
|---|
| [12daa43] | 1105 | // Allocate new array (uses realloc and memcpies the data) | 
|---|
|  | 1106 | lanes.data = alloc( lanes.count, lanes.data`realloc ); | 
|---|
| [b798713] | 1107 |  | 
|---|
| [12daa43] | 1108 | // Fix the moved data | 
|---|
|  | 1109 | for( idx; (size_t)lanes.count ) { | 
|---|
|  | 1110 | fix(lanes.data[idx]); | 
|---|
|  | 1111 | } | 
|---|
| [a2a4566] | 1112 |  | 
|---|
|  | 1113 | lanes.caches = alloc( target, lanes.caches`realloc ); | 
|---|
| [b798713] | 1114 | } | 
|---|
|  | 1115 |  | 
|---|
| [12daa43] | 1116 | fix_times(cltr); | 
|---|
| [9cc3a18] | 1117 |  | 
|---|
| [a2a4566] | 1118 |  | 
|---|
| [12daa43] | 1119 | reassign_cltr_id(cltr); | 
|---|
| [a017ee7] | 1120 |  | 
|---|
| [12daa43] | 1121 | // Make sure that everything is consistent | 
|---|
|  | 1122 | /* paranoid */ check( cltr->ready_queue ); | 
|---|
| [dca5802] | 1123 |  | 
|---|
| [12daa43] | 1124 | __cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue done\n"); | 
|---|
|  | 1125 | /* paranoid */ verify( ready_mutate_islocked() ); | 
|---|
|  | 1126 | } | 
|---|
|  | 1127 | #endif | 
|---|
| [8cd5434] | 1128 |  | 
|---|
|  | 1129 | #if !defined(__CFA_NO_STATISTICS__) | 
|---|
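|  |  | // Statistics helper: expose subqueue idx's internal cnt counter | 
|---|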
|  | 1130 | unsigned cnt(const __ready_queue_t & this, unsigned idx) { | 
|---|
|  | 1131 | /* paranoid */ verify(this.lanes.count > idx); | 
|---|
|  | 1132 | return this.lanes.data[idx].cnt; | 
|---|
|  | 1133 | } | 
|---|
|  | 1134 | #endif | 
|---|
| [0ee224b] | 1135 |  | 
|---|
|  | 1136 |  | 
|---|
|  | 1137 | #if   defined(CFA_HAVE_LINUX_LIBRSEQ) | 
|---|
|  | 1138 | // No definition needed | 
|---|
|  | 1139 | #elif defined(CFA_HAVE_LINUX_RSEQ_H) | 
|---|
|  | 1140 |  | 
|---|
|  | 1141 | #if defined( __x86_64 ) || defined( __i386 ) | 
|---|
|  | 1142 | #define RSEQ_SIG        0x53053053 | 
|---|
|  | 1143 | #elif defined( __ARM_ARCH ) | 
|---|
|  | 1144 | #ifdef __ARMEB__ | 
|---|
|  | 1145 | #define RSEQ_SIG    0xf3def5e7      /* udf    #24035    ; 0x5de3 (ARMv6+) */ | 
|---|
|  | 1146 | #else | 
|---|
|  | 1147 | #define RSEQ_SIG    0xe7f5def3      /* udf    #24035    ; 0x5de3 */ | 
|---|
|  | 1148 | #endif | 
|---|
|  | 1149 | #endif | 
|---|
|  | 1150 |  | 
|---|
|  | 1151 | extern void __disable_interrupts_hard(); | 
|---|
|  | 1152 | extern void __enable_interrupts_hard(); | 
|---|
|  | 1153 |  | 
|---|
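|  |  | // Register this thread's __cfaabi_rseq area with the kernel via the rseq(2) syscall; | 
|---|
|  |  | // once registered, the kernel keeps its cpu_id field up to date, giving a cheap way to read the current CPU | 
|---|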
| [75c7252] | 1154 | static void __kernel_raw_rseq_register  (void) { | 
|---|
| [0ee224b] | 1155 | /* paranoid */ verify( __cfaabi_rseq.cpu_id == RSEQ_CPU_ID_UNINITIALIZED ); | 
|---|
|  | 1156 |  | 
|---|
|  | 1157 | // int ret = syscall(__NR_rseq, &__cfaabi_rseq, sizeof(struct rseq), 0, (sigset_t *)0p, _NSIG / 8); | 
|---|
|  | 1158 | int ret = syscall(__NR_rseq, &__cfaabi_rseq, sizeof(struct rseq), 0, RSEQ_SIG); | 
|---|
|  | 1159 | if(ret != 0) { | 
|---|
|  | 1160 | int e = errno; | 
|---|
|  | 1161 | switch(e) { | 
|---|
|  | 1162 | case EINVAL: abort("KERNEL ERROR: rseq register invalid argument"); | 
|---|
|  | 1163 | case ENOSYS: abort("KERNEL ERROR: rseq register not supported"); | 
|---|
|  | 1164 | case EFAULT: abort("KERNEL ERROR: rseq register with invalid address"); | 
|---|
|  | 1165 | case EBUSY : abort("KERNEL ERROR: rseq register already registered"); | 
|---|
|  | 1166 | case EPERM : abort("KERNEL ERROR: rseq register sig argument on unregistration does not match the signature received on registration"); | 
|---|
|  | 1167 | default: abort("KERNEL ERROR: rseq register unexpected return %d", e); | 
|---|
|  | 1168 | } | 
|---|
|  | 1169 | } | 
|---|
|  | 1170 | } | 
|---|
|  | 1171 |  | 
|---|
| [75c7252] | 1172 | static void __kernel_raw_rseq_unregister(void) { | 
|---|
| [0ee224b] | 1173 | /* paranoid */ verify( __cfaabi_rseq.cpu_id >= 0 ); | 
|---|
|  | 1174 |  | 
|---|
|  | 1175 | // int ret = syscall(__NR_rseq, &__cfaabi_rseq, sizeof(struct rseq), RSEQ_FLAG_UNREGISTER, (sigset_t *)0p, _NSIG / 8); | 
|---|
|  | 1176 | int ret = syscall(__NR_rseq, &__cfaabi_rseq, sizeof(struct rseq), RSEQ_FLAG_UNREGISTER, RSEQ_SIG); | 
|---|
|  | 1177 | if(ret != 0) { | 
|---|
|  | 1178 | int e = errno; | 
|---|
|  | 1179 | switch(e) { | 
|---|
|  | 1180 | case EINVAL: abort("KERNEL ERROR: rseq unregister invalid argument"); | 
|---|
|  | 1181 | case ENOSYS: abort("KERNEL ERROR: rseq unregister not supported"); | 
|---|
|  | 1182 | case EFAULT: abort("KERNEL ERROR: rseq unregister with invalid address"); | 
|---|
|  | 1183 | case EBUSY : abort("KERNEL ERROR: rseq unregister already registered"); | 
|---|
|  | 1184 | case EPERM : abort("KERNEL ERROR: rseq unregister sig argument on unregistration does not match the signature received on registration"); | 
|---|
|  | 1185 | default: abort("KERNEL ERROR: rseq unregister unexpected return %d", e); | 
|---|
|  | 1186 | } | 
|---|
|  | 1187 | } | 
|---|
|  | 1188 | } | 
|---|
|  | 1189 | #else | 
|---|
|  | 1190 | // No definition needed | 
|---|
| [a2a4566] | 1191 | #endif | 
|---|