Changeset dca5802
- Timestamp: Jan 30, 2020, 1:40:05 PM (5 years ago)
- Branches: ADT, arm-eh, ast-experimental, enum, forall-pointer-decay, jacob/cs343-translation, master, new-ast, new-ast-unique-expr, pthread-emulation, qualifiedEnum
- Children: b7d6a36
- Parents: 75ca7f4
- Location: libcfa/src
- Files: 5 edited
libcfa/src/bits/defs.hfa
r75ca7f4 rdca5802 56 56 // #define __CFA_NO_BIT_TEST_AND_SET__ 57 57 58 static inline bool bts(volatile unsigned long long int * target, unsigned long long int bit ) { 58 #if defined( __i386 ) 59 static inline bool __atomic_bts(volatile unsigned long int * target, unsigned long int bit ) { 60 #if defined(__CFA_NO_BIT_TEST_AND_SET__) 61 unsigned long int mask = 1ul << bit; 62 unsigned long int ret = __atomic_fetch_or(target, mask, (int)__ATOMIC_RELAXED); 63 return (ret & mask) != 0; 64 #else 65 int result = 0; 66 asm volatile( 67 "LOCK btsl %[bit], %[target]\n\t" 68 : "=@ccc" (result) 69 : [target] "m" (*target), [bit] "r" (bit) 70 ); 71 return result != 0; 72 #endif 73 } 74 75 static inline bool __atomic_btr(volatile unsigned long int * target, unsigned long int bit ) { 76 #if defined(__CFA_NO_BIT_TEST_AND_SET__) 77 unsigned long int mask = 1ul << bit; 78 unsigned long int ret = __atomic_fetch_and(target, ~mask, (int)__ATOMIC_RELAXED); 79 return (ret & mask) != 0; 80 #else 81 int result = 0; 82 asm volatile( 83 "LOCK btrl %[bit], %[target]\n\t" 84 :"=@ccc" (result) 85 : [target] "m" (*target), [bit] "r" (bit) 86 ); 87 return result != 0; 88 #endif 89 } 90 #elif defined( __x86_64 ) 91 static inline bool __atomic_bts(volatile unsigned long long int * target, unsigned long long int bit ) { 59 92 #if defined(__CFA_NO_BIT_TEST_AND_SET__) 60 93 unsigned long long int mask = 1ul << bit; … … 72 105 } 73 106 74 static inline bool btr(volatile unsigned long long int * target, unsigned long long int bit ) {107 static inline bool __atomic_btr(volatile unsigned long long int * target, unsigned long long int bit ) { 75 108 #if defined(__CFA_NO_BIT_TEST_AND_SET__) 76 109 unsigned long long int mask = 1ul << bit; … … 87 120 #endif 88 121 } 122 #elif defined( __ARM_ARCH ) 123 #error __atomic_bts and __atomic_btr not implemented for arm 124 #else 125 #error uknown hardware architecture 126 #endif -
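The new __atomic_bts and __atomic_btr helpers atomically set or clear a single bit of a word and report whether that bit was set beforehand; on i386 they use a LOCK btsl/btrl instruction (the x86_64 asm is elided from this hunk), and when __CFA_NO_BIT_TEST_AND_SET__ is defined they fall back to the generic GCC atomics. A minimal stand-alone sketch of that fallback path, using illustrative names rather than the libcfa ones and 1ull so the shift is always done in 64 bits:

    #include <stdbool.h>

    // Atomically set bit 'bit' of *target; return true if it was already set.
    static inline bool demo_bts(volatile unsigned long long * target,
                                unsigned long long bit) {
        unsigned long long mask = 1ull << bit;
        unsigned long long old  = __atomic_fetch_or(target, mask, __ATOMIC_RELAXED);
        return (old & mask) != 0;
    }

    // Atomically clear bit 'bit' of *target; return true if it was set before.
    static inline bool demo_btr(volatile unsigned long long * target,
                                unsigned long long bit) {
        unsigned long long mask = 1ull << bit;
        unsigned long long old  = __atomic_fetch_and(target, ~mask, __ATOMIC_RELAXED);
        return (old & mask) != 0;
    }

The returned "was it set before" value is what lets the ready queue's mask_set/mask_clear helpers detect double-sets and double-clears under __CFA_WITH_VERIFY__.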
libcfa/src/concurrency/kernel.hfa
r75ca7f4 rdca5802 165 165 //----------------------------------------------------------------------------- 166 166 // Cluster Tools 167 168 // Cells use by the reader writer lock 169 // while not generic it only relies on a opaque pointer 167 170 struct __processor_id; 168 171 169 172 // Reader-Writer lock protecting the ready-queue 173 // while this lock is mostly generic some aspects 174 // have been hard-coded to for the ready-queue for 175 // simplicity and performance 170 176 struct __clusterRWLock_t { 171 177 // total cachelines allocated … … 189 195 void ^?{}(__clusterRWLock_t & this); 190 196 191 // Underlying sub quues of theready queue192 struct __attribute__((aligned(128))) __intrusive_ ready_queue_t {197 // Intrusives lanes which are used by the relaxed ready queue 198 struct __attribute__((aligned(128))) __intrusive_lane_t { 193 199 // spin lock protecting the queue 194 200 volatile bool lock; 195 unsigned int last_id;196 unsigned int count;197 201 198 202 // anchor for the head and the tail of the queue … … 204 208 } before, after; 205 209 210 #if defined(__CFA_WITH_VERIFY__) 211 // id of last processor to acquire the lock 212 // needed only to check for mutual exclusion violations 213 unsigned int last_id; 214 215 // number of items on this list 216 // needed only to check for deadlocks 217 unsigned int count; 218 #endif 219 206 220 // Optional statistic counters 207 221 #if !defined(__CFA_NO_SCHED_STATS__) … … 217 231 }; 218 232 219 void ?{}(__intrusive_ ready_queue_t & this);220 void ^?{}(__intrusive_ ready_queue_t & this);233 void ?{}(__intrusive_lane_t & this); 234 void ^?{}(__intrusive_lane_t & this); 221 235 222 236 typedef unsigned long long __cfa_readyQ_mask_t; … … 227 241 // }; 228 242 229 #define __cfa_ readyQ_mask_size ((64 - sizeof(size_t)) / sizeof(__cfa_readyQ_mask_t))230 #define __cfa_max_ readyQs (__cfa_readyQ_mask_size * 8 * sizeof(__cfa_readyQ_mask_t))243 #define __cfa_lane_mask_size ((64 - sizeof(size_t)) / sizeof(__cfa_readyQ_mask_t)) 244 #define __cfa_max_lanes (__cfa_lane_mask_size * 8 * sizeof(__cfa_readyQ_mask_t)) 231 245 232 246 //TODO adjust cache size to ARCHITECTURE 247 // Structure holding the relaxed ready queue 233 248 struct __attribute__((aligned(128))) __ready_queue_t { 249 // Data tracking how many/which lanes are used 250 // Aligned to 128 for cache locality 234 251 struct { 252 // number of non-empty lanes 235 253 volatile size_t count; 236 volatile __cfa_readyQ_mask_t mask[ __cfa_readyQ_mask_size ]; 237 } empty; 238 254 255 // bit mask, set bits indentify which lanes are non-empty 256 volatile __cfa_readyQ_mask_t mask[ __cfa_lane_mask_size ]; 257 } used; 258 259 // Data tracking the actual lanes 260 // On a seperate cacheline from the used struct since 261 // used can change on each push/pop but this data 262 // only changes on shrink/grow 239 263 struct __attribute__((aligned(64))) { 240 __intrusive_ready_queue_t * volatile data; 264 // Arary of lanes 265 __intrusive_lane_t * volatile data; 266 267 // Number of lanes (empty or not) 241 268 volatile size_t count; 242 } list; 243 269 } lanes; 270 271 // Statistics 244 272 #if !defined(__CFA_NO_STATISTICS__) 245 273 __attribute__((aligned(64))) struct { 246 274 struct { 275 // Push statistic 247 276 struct { 277 // number of attemps at pushing something 248 278 volatile size_t attempt; 279 280 // number of successes at pushing 249 281 volatile size_t success; 250 282 } push; 283 284 // Pop statistic 251 285 struct { 286 // number of reads of the mask 287 // picking an empty 
__cfa_readyQ_mask_t counts here 288 // but not as an attempt 252 289 volatile size_t maskrds; 290 291 // number of attemps at poping something 253 292 volatile size_t attempt; 293 294 // number of successes at poping 254 295 volatile size_t success; 255 296 } pop; 256 297 } pick; 298 299 // stats on the "used" struct of the queue 300 // tracks average number of queues that are not empty 301 // when pushing / poping 257 302 struct { 258 303 volatile size_t value; 259 304 volatile size_t count; 260 } full;305 } used; 261 306 } global_stats; 262 307 -
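This header renames the per-queue type from __intrusive_ready_queue_t to __intrusive_lane_t and splits __ready_queue_t into two cache-line-aligned parts: a "used" block (a count plus a bit mask identifying the non-empty lanes) that changes on every push/pop, and a "lanes" block (the array of lanes and its size) that changes only on grow/shrink. A rough C sketch of that layout with simplified names; the mask size of 7 words is what __cfa_lane_mask_size works out to on a 64-bit target:

    #include <stddef.h>

    typedef unsigned long long mask_word_t;   // stands in for __cfa_readyQ_mask_t
    struct lane;                              // one intrusive FIFO of threads

    struct __attribute__((aligned(128))) ready_queue_sketch {
        // Hot bookkeeping: which lanes currently hold work.
        struct {
            volatile size_t      count;       // number of non-empty lanes
            volatile mask_word_t mask[7];     // set bit <=> lane is non-empty
        } used;

        // The lanes themselves; only touched on grow/shrink.
        struct __attribute__((aligned(64))) {
            struct lane * volatile data;      // array of lanes
            volatile size_t        count;     // number of lanes, empty or not
        } lanes;
    };

Keeping "used" and "lanes" on separate cache lines avoids false sharing between the hot push/pop path and the rare resize path.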
libcfa/src/concurrency/kernel_private.hfa
r75ca7f4 rdca5802 136 136 // Concurrent with doregister/unregister, 137 137 // i.e., threads can be added at any point during or between the entry/exit 138 139 //----------------------------------------------------------------------- 140 // simple spinlock underlying the RWLock 141 // Blocking acquire 138 142 static inline void __atomic_acquire(volatile bool * ll) { 139 143 while( __builtin_expect(__atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST), false) ) { … … 144 148 } 145 149 150 // Non-Blocking acquire 146 151 static inline bool __atomic_try_acquire(volatile bool * ll) { 147 152 return !__atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST); 148 153 } 149 154 155 // Release 150 156 static inline void __atomic_unlock(volatile bool * ll) { 151 157 /* paranoid */ verify(*ll); … … 180 186 /*paranoid*/ verify(iproc < ready); 181 187 /*paranoid*/ verify(data[iproc].lock); 182 __atomic_ store_n(&data[iproc].lock, false, __ATOMIC_RELEASE);188 __atomic_unlock(&data[iproc].lock); 183 189 } 184 190 … … 188 194 uint_fast32_t ready_mutate_lock( struct cluster & cltr ); 189 195 190 void ready_mutate_unlock( struct cluster & cltr, uint_fast32_t );196 void ready_mutate_unlock( struct cluster & cltr, uint_fast32_t /* value returned by lock */ ); 191 197 192 198 //======================================================================= 193 199 // Ready-Queue API 194 200 //----------------------------------------------------------------------- 201 // push thread onto a ready queue for a cluster 202 // returns true if the list was previously empty, false otherwise 195 203 __attribute__((hot)) bool push(struct cluster * cltr, struct thread_desc * thrd); 204 205 //----------------------------------------------------------------------- 206 // pop thread from the ready queue of a cluster 207 // returns 0p if empty 196 208 __attribute__((hot)) thread_desc * pop(struct cluster * cltr); 209 210 //----------------------------------------------------------------------- 211 // Increase the width of the ready queue (number of lanes) by 4 197 212 void ready_queue_grow (struct cluster * cltr); 213 214 //----------------------------------------------------------------------- 215 // Decrease the width of the ready queue (number of lanes) by 4 198 216 void ready_queue_shrink(struct cluster * cltr); 199 217 218 //----------------------------------------------------------------------- 219 // Statics call at the end of each thread to register statistics 200 220 #if !defined(__CFA_NO_STATISTICS__) 201 221 void stats_tls_tally(struct cluster * cltr); -
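The spinlock underlying the reader-writer lock (and protecting each lane) is a test-and-set on a volatile bool: __atomic_acquire blocks, __atomic_try_acquire makes a single non-blocking attempt, and __atomic_unlock releases. A self-contained sketch of the same pattern under illustrative names; the relaxed inner polling loop is an assumption, since the body of __atomic_acquire is elided in this hunk:

    #include <stdbool.h>

    static inline void spin_acquire(volatile bool * ll) {
        // Try to grab the lock; on failure, poll with cheap relaxed loads
        // until it looks free, then retry the exchange.
        while (__atomic_exchange_n(ll, true, __ATOMIC_SEQ_CST)) {
            while (__atomic_load_n(ll, __ATOMIC_RELAXED)) { /* spin */ }
        }
    }

    static inline bool spin_try_acquire(volatile bool * ll) {
        // Single attempt: succeeds only if the lock was free.
        return !__atomic_exchange_n(ll, true, __ATOMIC_SEQ_CST);
    }

    static inline void spin_unlock(volatile bool * ll) {
        __atomic_store_n(ll, false, __ATOMIC_RELEASE);
    }

The changeset also switches one release site from a raw __atomic_store_n to __atomic_unlock, so every release goes through the same helper.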
libcfa/src/concurrency/ready_queue.cfa
r75ca7f4 rdca5802 24 24 static const size_t cache_line_size = 64; 25 25 26 static inline unsigned __max_processors_fallback() { 27 #ifdef __CFA_MAX_PROCESSORS__ 28 return __CFA_MAX_PROCESSORS__; 29 #else 30 // No overriden function, no environment variable, no define 31 // fall back to a magic number 32 return 128; 33 #endif 34 } 35 26 // No overriden function, no environment variable, no define 27 // fall back to a magic number 28 #ifndef __CFA_MAX_PROCESSORS__ 29 #define __CFA_MAX_PROCESSORS__ 128 30 #endif 31 32 // returns the maximum number of processors the RWLock support 36 33 __attribute__((weak)) unsigned __max_processors() { 37 34 const char * max_cores_s = getenv("CFA_MAX_PROCESSORS"); 38 35 if(!max_cores_s) { 39 36 __cfaabi_dbg_print_nolock("No CFA_MAX_PROCESSORS in ENV"); 40 return __ max_processors_fallback();37 return __CFA_MAX_PROCESSORS__; 41 38 } 42 39 … … 45 42 if(max_cores_l < 1 || max_cores_l > 65535) { 46 43 __cfaabi_dbg_print_nolock("CFA_MAX_PROCESSORS out of range : %ld", max_cores_l); 47 return __ max_processors_fallback();44 return __CFA_MAX_PROCESSORS__; 48 45 } 49 46 if('\0' != *endptr) { 50 47 __cfaabi_dbg_print_nolock("CFA_MAX_PROCESSORS not a decimal number : %s", max_cores_s); 51 return __ max_processors_fallback();48 return __CFA_MAX_PROCESSORS__; 52 49 } 53 50 … … 55 52 } 56 53 57 static inline unsigned rand_bit(unsigned rnum, size_t mask) { 58 verify(sizeof(mask) == 8); 54 // Picks a random 1 bit in 'mask' according to random number 'rnum'. 55 static inline unsigned rand_bit(unsigned rnum, __cfa_readyQ_mask_t mask) { 56 #if defined( __i386 ) 57 static_assert(sizeof(mask) == 4); 58 unsigned bit = mask ? rnum % __builtin_popcount(mask) : 0; 59 #if !defined(__BMI2__) 60 #error rand_bit not implemented for non __BMI2__ i386 61 #else 62 uint32_t picked = _pdep_u32(1ul << bit, mask); 63 return picked ? __builtin_ctz(picked) : 0; 64 #endif 65 #elif defined( __x86_64 ) 66 static_assert(sizeof(mask) == 8); 59 67 unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0; 60 #if !defined(__BMI2__) 61 uint64_t v = mask; // Input value to find position with rank r. 62 unsigned int r = bit + 1;// Input: bit's desired rank [1-64]. 63 unsigned int s; // Output: Resulting position of bit with rank r [1-64] 64 uint64_t a, b, c, d; // Intermediate temporaries for bit count. 65 unsigned int t; // Bit count temporary. 66 67 // Do a normal parallel bit count for a 64-bit integer, 68 // but store all intermediate steps. 69 a = v - ((v >> 1) & ~0UL/3); 70 b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5); 71 c = (b + (b >> 4)) & ~0UL/0x11; 72 d = (c + (c >> 8)) & ~0UL/0x101; 73 74 75 t = (d >> 32) + (d >> 48); 76 // Now do branchless select! 77 s = 64; 78 s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8)); 79 t = (d >> (s - 16)) & 0xff; 80 s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8)); 81 t = (c >> (s - 8)) & 0xf; 82 s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8)); 83 t = (b >> (s - 4)) & 0x7; 84 s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8)); 85 t = (a >> (s - 2)) & 0x3; 86 s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8)); 87 t = (v >> (s - 1)) & 0x1; 88 s -= ((t - r) & 256) >> 8; 89 return s - 1; 68 #if !defined(__BMI2__) 69 uint64_t v = mask; // Input value to find position with rank r. 70 unsigned int r = bit + 1;// Input: bit's desired rank [1-64]. 71 unsigned int s; // Output: Resulting position of bit with rank r [1-64] 72 uint64_t a, b, c, d; // Intermediate temporaries for bit count. 73 unsigned int t; // Bit count temporary. 
74 75 // Do a normal parallel bit count for a 64-bit integer, 76 // but store all intermediate steps. 77 a = v - ((v >> 1) & ~0UL/3); 78 b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5); 79 c = (b + (b >> 4)) & ~0UL/0x11; 80 d = (c + (c >> 8)) & ~0UL/0x101; 81 82 83 t = (d >> 32) + (d >> 48); 84 // Now do branchless select! 85 s = 64; 86 s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8)); 87 t = (d >> (s - 16)) & 0xff; 88 s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8)); 89 t = (c >> (s - 8)) & 0xf; 90 s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8)); 91 t = (b >> (s - 4)) & 0x7; 92 s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8)); 93 t = (a >> (s - 2)) & 0x3; 94 s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8)); 95 t = (v >> (s - 1)) & 0x1; 96 s -= ((t - r) & 256) >> 8; 97 return s - 1; 98 #else 99 uint64_t picked = _pdep_u64(1ul << bit, mask); 100 return picked ? __builtin_ctzl(picked) : 0; 101 #endif 102 #elif defined( __ARM_ARCH ) 103 #error rand_bit not implemented for arm 90 104 #else 91 uint64_t picked = _pdep_u64(1ul << bit, mask); 92 return picked ? __builtin_ctzl(picked) : 0; 105 #error uknown hardware architecture 93 106 #endif 94 107 } 95 108 96 static inline __cfa_readyQ_mask_t readyQ_mask_full () { return (8 * sizeof(__cfa_readyQ_mask_t)) - 1; } 97 static inline __cfa_readyQ_mask_t readyQ_mask_shit_length() { return (8 * sizeof(__cfa_readyQ_mask_t)) - __builtin_clzl(readyQ_mask_full()); } 98 109 110 //----------------------------------------------------------------------------- 111 // Helpers used by extract 112 // (_mask_bitsidx() & X) returns a bit index valid for a __cfa_readyQ_mask_t, where X is any integer 113 static inline __cfa_readyQ_mask_t _mask_bitsidx () { return (8 * sizeof(__cfa_readyQ_mask_t)) - 1; } 114 115 // (X >> _mask_shiftidx()) retuns an index into an array of __cfa_readyQ_mask_t 116 static inline __cfa_readyQ_mask_t _mask_shiftidx() { return (8 * sizeof(__cfa_readyQ_mask_t)) - __builtin_clzl(_mask_bitsidx()); } 117 118 119 // Assuming a large bit mask represented as an array of __cfa_readyQ_mask_t 120 // Given an index into the large mask, returns the bit index and which __cfa_readyQ_mask_t index in the array 99 121 static inline [__cfa_readyQ_mask_t, __cfa_readyQ_mask_t] extract(__cfa_readyQ_mask_t idx) { 100 __cfa_readyQ_mask_t word = idx >> readyQ_mask_shit_length();101 __cfa_readyQ_mask_t bit = idx & readyQ_mask_full();122 __cfa_readyQ_mask_t word = idx >> _mask_bitsidx(); 123 __cfa_readyQ_mask_t bit = idx & _mask_shiftidx(); 102 124 return [bit, word]; 103 125 } … … 219 241 //======================================================================= 220 242 // Get the head pointer (one before the first element) from the anchor 221 static inline thread_desc * head(const __intrusive_ ready_queue_t & this) {243 static inline thread_desc * head(const __intrusive_lane_t & this) { 222 244 thread_desc * rhead = (thread_desc *)( 223 245 (uintptr_t)( &this.before ) - offsetof( thread_desc, link ) … … 228 250 229 251 // Get the tail pointer (one after the last element) from the anchor 230 static inline thread_desc * tail(const __intrusive_ ready_queue_t & this) {252 static inline thread_desc * tail(const __intrusive_lane_t & this) { 231 253 thread_desc * rtail = (thread_desc *)( 232 254 (uintptr_t)( &this.after ) - offsetof( thread_desc, link ) … … 237 259 238 260 // Ctor 239 void ?{}( __intrusive_ ready_queue_t & this ) {261 void ?{}( __intrusive_lane_t & this ) { 240 262 this.lock = false; 241 263 this.last_id = -1u; … … 267 289 /* paranoid */ 
verify(&tail(this)->link.prev == &this.after .link.prev ); 268 290 /* paranoid */ verify(&tail(this)->link.next == &this.after .link.next ); 269 /* paranoid */ verify(sizeof(__intrusive_ ready_queue_t) == 128);291 /* paranoid */ verify(sizeof(__intrusive_lane_t) == 128); 270 292 /* paranoid */ verify(sizeof(this) == 128); 271 /* paranoid */ verify(__alignof__(__intrusive_ ready_queue_t) == 128);293 /* paranoid */ verify(__alignof__(__intrusive_lane_t) == 128); 272 294 /* paranoid */ verify(__alignof__(this) == 128); 273 295 /* paranoid */ verifyf(((intptr_t)(&this) % 128) == 0, "Expected address to be aligned %p %% 128 == %zd", &this, ((intptr_t)(&this) % 128)); 274 296 275 /* paranoid */ verifyf( readyQ_mask_shit_length() == 6 , "%zu", readyQ_mask_shit_length());276 /* paranoid */ verifyf( readyQ_mask_full() == 63, "%zu", readyQ_mask_full());297 /* paranoid */ verifyf(_mask_shiftidx() == 6 , "%zu", _mask_shiftidx()); 298 /* paranoid */ verifyf(_mask_bitsidx () == 63, "%zu", _mask_bitsidx()); 277 299 } 278 300 279 301 // Dtor is trivial 280 void ^?{}( __intrusive_ ready_queue_t & this ) {302 void ^?{}( __intrusive_lane_t & this ) { 281 303 // Make sure the list is empty 282 304 /* paranoid */ verify(head(this)->link.prev == 0p ); … … 287 309 } 288 310 289 290 291 bool push(__intrusive_ready_queue_t & this, thread_desc * node) { 292 verify(this.lock); 293 verify(node->link.ts != 0); 294 verify(node->link.next == 0p); 295 verify(node->link.prev == 0p); 296 297 this.count++; 298 299 if(this.before.link.ts == 0l) { 300 verify(tail(this)->link.next == 0p); 301 verify(tail(this)->link.prev == head(this)); 302 verify(head(this)->link.next == tail(this)); 303 verify(head(this)->link.prev == 0p); 304 } 311 // Push a thread onto this lane 312 // returns true of lane was empty before push, false otherwise 313 bool push(__intrusive_lane_t & this, thread_desc * node) { 314 #if defined(__CFA_WITH_VERIFY__) 315 /* paranoid */ verify(this.lock); 316 /* paranoid */ verify(node->link.ts != 0); 317 /* paranoid */ verify(node->link.next == 0p); 318 /* paranoid */ verify(node->link.prev == 0p); 319 320 this.count++; 321 322 if(this.before.link.ts == 0l) { 323 /* paranoid */ verify(tail(this)->link.next == 0p); 324 /* paranoid */ verify(tail(this)->link.prev == head(this)); 325 /* paranoid */ verify(head(this)->link.next == tail(this)); 326 /* paranoid */ verify(head(this)->link.prev == 0p); 327 } 328 #endif 305 329 306 330 // Get the relevant nodes locally … … 315 339 316 340 // Update stats 317 #if ndef __CFA_NO_SCHED_STATS__341 #if !defined(__CFA_NO_SCHED_STATS__) 318 342 this.stat.diff++; 319 343 this.stat.push++; … … 325 349 if(this.before.link.ts == 0l) { 326 350 this.before.link.ts = node->link.ts; 327 verify(node->link.prev == head(this));351 /* paranoid */ verify(node->link.prev == head(this)); 328 352 return true; 329 353 } … … 331 355 } 332 356 333 [thread_desc *, bool] pop(__intrusive_ready_queue_t & this) { 334 verify(this.lock); 335 verify(this.before.link.ts != 0ul); 357 // Pop a thread from this lane (must be non-empty) 358 // returns popped 359 // returns true of lane was empty before push, false otherwise 360 [thread_desc *, bool] pop(__intrusive_lane_t & this) { 361 /* paranoid */ verify(this.lock); 362 /* paranoid */ verify(this.before.link.ts != 0ul); 363 364 // Get anchors locally 336 365 thread_desc * head = head(this); 337 366 thread_desc * tail = tail(this); 338 367 368 // Get the relevant nodes locally 339 369 thread_desc * node = head->link.next; 340 370 thread_desc * next = 
node->link.next; 341 if(node == tail) { 342 verify(false); 343 verify(this.before.link.ts == 0ul); 344 verify(tail(this)->link.next == 0p); 345 verify(tail(this)->link.prev == head(this)); 346 verify(head(this)->link.next == tail(this)); 347 verify(head(this)->link.prev == 0p); 348 return [0p, false]; 349 } 350 351 this.count--; 352 /* paranoid */ verify(node); 353 371 372 #if defined(__CFA_WITH_VERIFY__) 373 this.count--; 374 /* paranoid */ verify(node != tail); 375 /* paranoid */ verify(node); 376 #endif 377 378 // Do the pop 354 379 head->link.next = next; 355 380 next->link.prev = head; 356 381 node->link.[next, prev] = 0p; 382 383 // Update head time stamp 384 this.before.link.ts = next->link.ts; 385 386 // Update stats 357 387 #ifndef __CFA_NO_SCHED_STATS__ 358 388 this.stat.diff--; … … 360 390 #endif 361 391 392 // Check if we emptied list and return accordingly 362 393 if(next == tail) { 363 this.before.link.ts = 0ul; 364 verify(tail(this)->link.next == 0p); 365 verify(tail(this)->link.prev == head(this)); 366 verify(head(this)->link.next == tail(this)); 367 verify(head(this)->link.prev == 0p); 368 node->link.[next, prev] = 0p; 394 /* paranoid */ verify(this.before.link.ts == 0); 395 /* paranoid */ verify(tail(this)->link.next == 0p); 396 /* paranoid */ verify(tail(this)->link.prev == head(this)); 397 /* paranoid */ verify(head(this)->link.next == tail(this)); 398 /* paranoid */ verify(head(this)->link.prev == 0p); 369 399 return [node, true]; 370 400 } 371 401 else { 372 verify(next->link.ts != 0); 373 this.before.link.ts = next->link.ts; 374 verify(this.before.link.ts != 0); 375 node->link.[next, prev] = 0p; 402 /* paranoid */ verify(next->link.ts != 0); 403 /* paranoid */ verify(this.before.link.ts != 0); 376 404 return [node, false]; 377 405 } 378 406 } 379 407 380 static inline unsigned long long ts(__intrusive_ready_queue_t & this) { 408 // Check whether or not list is empty 409 static inline bool is_empty(__intrusive_lane_t & this) { 410 verify( (this.before.link.ts == 0) == (this.count == 0) ); 411 return this.before.link.ts == 0; 412 } 413 414 // Return the timestamp 415 static inline unsigned long long ts(__intrusive_lane_t & this) { 416 verify( this.before.link.ts == this.before.link.next->link.ts ); 381 417 return this.before.link.ts; 382 418 } … … 386 422 //======================================================================= 387 423 424 // Thread local mirror of ready queue statistics 425 #if !defined(__CFA_NO_STATISTICS__) 388 426 static __attribute__((aligned(128))) thread_local struct { 389 427 struct { … … 401 439 size_t value; 402 440 size_t count; 403 } full;441 } used; 404 442 } tls = { 405 443 /* pick */{ … … 407 445 /* pop */{ 0, 0, 0 }, 408 446 }, 409 /* full*/{ 0, 0 }447 /* used */{ 0, 0 } 410 448 }; 449 #endif 411 450 412 451 //----------------------------------------------------------------------- 413 452 414 453 void ?{}(__ready_queue_t & this) with (this) { 415 empty.count = 0;416 for( i ; __cfa_ readyQ_mask_size ) {417 empty.mask[i] = 0;418 } 419 420 l ist.data = alloc(4);454 used.count = 0; 455 for( i ; __cfa_lane_mask_size ) { 456 used.mask[i] = 0; 457 } 458 459 lanes.data = alloc(4); 421 460 for( i; 4 ) { 422 (l ist.data[i]){};423 } 424 l ist.count = 4;461 (lanes.data[i]){}; 462 } 463 lanes.count = 4; 425 464 426 465 #if !defined(__CFA_NO_STATISTICS__) … … 431 470 global_stats.pick.pop .success = 0; 432 471 433 global_stats. full.value = 0;434 global_stats. 
full.count = 0;472 global_stats.used.value = 0; 473 global_stats.used.count = 0; 435 474 #endif 436 475 } 437 476 438 477 void ^?{}(__ready_queue_t & this) with (this) { 439 verify( 4 == l ist.count );440 verify( 0 == empty.count );478 verify( 4 == lanes.count ); 479 verify( 0 == used .count ); 441 480 442 481 for( i; 4 ) { 443 ^(l ist.data[i]){};444 } 445 free(l ist.data);482 ^(lanes.data[i]){}; 483 } 484 free(lanes.data); 446 485 447 486 448 487 #if defined(__CFA_WITH_VERIFY__) 449 for( i ; __cfa_ readyQ_mask_size ) {450 assert( 0 == empty.mask[i] );488 for( i ; __cfa_lane_mask_size ) { 489 assert( 0 == used.mask[i] ); 451 490 } 452 491 #endif … … 454 493 455 494 //----------------------------------------------------------------------- 456 495 enum mask_strictness { 496 STRICT, 497 NOCHECK 498 }; 499 500 // Set a given bit in the bit mask array 501 // strictness determines of the bit had to be cleared before 502 static inline void mask_set(__cfa_readyQ_mask_t * mask, unsigned index, mask_strictness strict) { 503 // Extract the array and bit indexes 504 __cfa_readyQ_mask_t word; 505 __cfa_readyQ_mask_t bit; 506 [bit, word] = extract(index); 507 508 // Conditional check 509 verifyf( 510 strict == STRICT && // Conditional check if it was expected to be cleared 511 ((mask[word] & (1ull << bit)) == 0), 512 "Before set %llu:%llu (%u), %llx & %llx", word, bit, index, mask[word], (1ull << bit) 513 ); 514 515 // Atomically set the bit 516 __attribute__((unused)) bool ret = __atomic_bts(&mask[word], bit); 517 518 // Conditional check 519 verifyf( 520 strict == STRICT && // Conditional check if it was expected to be cleared 521 !ret, 522 "Bit was not set but bts returned true" 523 ); 524 525 // Unconditional check 526 verifyf( 527 (mask[word] & (1ull << bit)) != 0, 528 "After set %llu:%llu (%u), %llx & %llx", word, bit, index, mask[word], (1ull << bit) 529 ); 530 } 531 532 static inline void mask_clear(__cfa_readyQ_mask_t * mask, unsigned index, mask_strictness strict) { 533 // Extract the array and bit indexes 534 __cfa_readyQ_mask_t word; 535 __cfa_readyQ_mask_t bit; 536 [bit, word] = extract(index); 537 538 // Conditional check 539 verifyf( 540 strict == STRICT && // Conditional check if it was expected to be set 541 ((mask[word] & (1ull << bit)) != 0), 542 "Before clear %llu:%llu (%u), %llx & %llx", word, bit, index, mask[word], (1ull << bit) 543 ); 544 545 // Atomically clear the bit 546 __attribute__((unused)) bool ret = __atomic_btr(&mask[word], bit); 547 548 // Conditional check 549 verifyf( 550 strict == STRICT && // Conditional check if it was expected to be cleared 551 ret, 552 "Bit was set but btr returned false" 553 ); 554 555 // Unconditional check 556 verifyf( 557 (mask[word] & (1ull << bit)) == 0, 558 "After clear %llu:%llu (%u), %llx & %llx", word, bit, index, mask[word], (1ull << bit) 559 ); 560 } 561 562 //----------------------------------------------------------------------- 457 563 __attribute__((hot)) bool push(struct cluster * cltr, struct thread_desc * thrd) with (cltr->ready_queue) { 564 // write timestamp 458 565 thrd->link.ts = rdtscl(); 459 566 460 while(true) { 461 // Pick a random list 462 unsigned i = tls_rand() % list.count; 567 // Try to pick a lane and lock it 568 unsigned i; 569 do { 570 // Pick the index of a lane 571 unsigned i = tls_rand() % lanes.count; 463 572 464 573 #if !defined(__CFA_NO_STATISTICS__) … … 467 576 468 577 // If we can't lock it retry 469 if( !__atomic_try_acquire( &list.data[i].lock ) ) continue; 470 verify(list.data[i].last_id == -1u); 471 
list.data[i].last_id = kernelTLS.this_processor->id; 472 473 __attribute__((unused)) size_t num = __atomic_load_n( &empty.count, __ATOMIC_RELAXED ); 474 bool first = false; 475 476 verify( list.data[i].last_id == kernelTLS.this_processor->id ); 477 verify( list.data[i].lock ); 478 // Actually push it 479 if(push(list.data[i], thrd)) { 480 size_t ret = __atomic_fetch_add( &empty.count, 1z, __ATOMIC_SEQ_CST); 481 first = (ret == 0); 482 483 __cfa_readyQ_mask_t word; 484 __cfa_readyQ_mask_t bit; 485 [bit, word] = extract(i); 486 verifyf((empty.mask[word] & (1ull << bit)) == 0, "Before set %llu:%llu (%u), %llx & %llx", word, bit, i, empty.mask[word], (1ull << bit)); 487 __attribute__((unused)) bool ret = bts(&empty.mask[word], bit); 488 verify(!(bool)ret); 489 verifyf((empty.mask[word] & (1ull << bit)) != 0, "After set %llu:%llu (%u), %llx & %llx", word, bit, i, empty.mask[word], (1ull << bit)); 490 } 491 verifyf( empty.count <= list.count, "Non-empty count (%zu) exceeds actual count (%zu)\n", empty.count, list.count ); 492 verifyf( list.data[i].last_id == kernelTLS.this_processor->id, "Expected last processor to lock queue %u to be %u, was %u\n", i, list.data[i].last_id, kernelTLS.this_processor->id ); 493 verifyf( list.data[i].lock, "List %u is not locked\n", i ); 494 495 // Unlock and return 496 list.data[i].last_id = -1u; 497 __atomic_unlock( &list.data[i].lock ); 498 499 #if !defined(__CFA_NO_STATISTICS__) 500 tls.pick.push.success++; 501 tls.full.value += num; 502 tls.full.count += 1; 503 #endif 504 return first; 505 } 578 } while( !__atomic_try_acquire( &lanes.data[i].lock ) ); 579 580 #if defined(__CFA_WITH_VERIFY__) 581 /* paranoid */ verify(lanes.data[i].last_id == -1u); 582 /* paranoid */ lanes.data[i].last_id = kernelTLS.this_processor->id; 583 #endif 584 585 __attribute__((unused)) size_t num = __atomic_load_n( &used.count, __ATOMIC_RELAXED ); 586 bool first = false; 587 588 // Actually push it 589 bool lane_first = push(lanes.data[i], thrd); 590 591 // If this lane used to be empty we need to do more 592 if(lane_first) { 593 // Update the global count 594 size_t ret = __atomic_fetch_add( &used.count, 1z, __ATOMIC_SEQ_CST); 595 596 // Check if the entire quue used to be empty 597 first = (ret == 0); 598 599 // Update the bit mask 600 mask_set((__cfa_readyQ_mask_t *)used.mask, i, STRICT); 601 } 602 603 #if defined(__CFA_WITH_VERIFY__) 604 /* paranoid */ verifyf( used.count <= lanes.count, "Non-empty count (%zu) exceeds actual count (%zu)\n", used.count, lanes.count ); 605 /* paranoid */ verifyf( lanes.data[i].last_id == kernelTLS.this_processor->id, "Expected last processor to lock queue %u to be %u, was %u\n", i, lanes.data[i].last_id, kernelTLS.this_processor->id ); 606 /* paranoid */ verifyf( lanes.data[i].lock, "List %u is not locked\n", i ); 607 /* paranoid */ lanes.data[i].last_id = -1u; 608 #endif 609 610 // Unlock and return 611 __atomic_unlock( &lanes.data[i].lock ); 612 613 // Update statistics 614 #if !defined(__CFA_NO_STATISTICS__) 615 tls.pick.push.success++; 616 tls.used.value += num; 617 tls.used.count += 1; 618 #endif 619 620 // return whether or not the list was empty before this push 621 return first; 506 622 } 507 623 508 624 //----------------------------------------------------------------------- 509 625 // Given 2 indexes, pick the list with the oldest push an try to pop from it 510 626 static struct thread_desc * try_pop(struct cluster * cltr, unsigned i, unsigned j) with (cltr->ready_queue) { 511 627 #if !defined(__CFA_NO_STATISTICS__) … … 515 631 // Pick 
the bet list 516 632 int w = i; 517 if( __builtin_expect(ts(list.data[j]) != 0, true) ) { 518 w = (ts(list.data[i]) < ts(list.data[j])) ? i : j; 519 } 520 521 __intrusive_ready_queue_t & list = list.data[w]; 633 if( __builtin_expect(!is_empty(lanes.data[j]), true) ) { 634 w = (ts(lanes.data[i]) < ts(lanes.data[j])) ? i : j; 635 } 636 637 // Get relevant elements locally 638 __intrusive_lane_t & lane = lanes.data[w]; 639 522 640 // If list looks empty retry 523 if( ts(list) == 0) return 0p;641 if( is_empty(lane) ) return 0p; 524 642 525 643 // If we can't get the lock retry 526 if( !__atomic_try_acquire(&list.lock) ) return 0p; 527 verify(list.last_id == -1u); 528 list.last_id = kernelTLS.this_processor->id; 529 530 verify(list.last_id == kernelTLS.this_processor->id); 531 532 __attribute__((unused)) int num = __atomic_load_n( &empty.count, __ATOMIC_RELAXED ); 644 if( !__atomic_try_acquire(&lane.lock) ) return 0p; 645 646 #if defined(__CFA_WITH_VERIFY__) 647 /* paranoid */ verify(lane.last_id == -1u); 648 /* paranoid */ lane.last_id = kernelTLS.this_processor->id; 649 #endif 533 650 534 651 535 652 // If list is empty, unlock and retry 536 if( ts(list) == 0 ) { 537 list.last_id = -1u; 538 __atomic_unlock(&list.lock); 653 if( is_empty(lane) ) { 654 #if defined(__CFA_WITH_VERIFY__) 655 /* paranoid */ verify(lane.last_id == kernelTLS.this_processor->id); 656 /* paranoid */ lane.last_id = -1u; 657 #endif 658 659 __atomic_unlock(&lane.lock); 539 660 return 0p; 540 661 } 541 {542 __cfa_readyQ_mask_t word;543 __cfa_readyQ_mask_t bit;544 [bit, word] = extract(w);545 verify((empty.mask[word] & (1ull << bit)) != 0);546 }547 548 verify(list.last_id == kernelTLS.this_processor->id);549 verify(list.lock);550 662 551 663 // Actually pop the list 552 664 struct thread_desc * thrd; 553 665 bool emptied; 554 [thrd, emptied] = pop(list); 555 verify(thrd); 556 557 verify(list.last_id == kernelTLS.this_processor->id); 558 verify(list.lock); 559 666 [thrd, emptied] = pop(lane); 667 668 /* paranoid */ verify(thrd); 669 /* paranoid */ verify(lane.last_id == kernelTLS.this_processor->id); 670 /* paranoid */ verify(lane.lock); 671 672 // If this was the last element in the lane 560 673 if(emptied) { 561 __atomic_fetch_sub( &empty.count, 1z, __ATOMIC_SEQ_CST); 562 563 __cfa_readyQ_mask_t word; 564 __cfa_readyQ_mask_t bit; 565 [bit, word] = extract(w); 566 verify((empty.mask[word] & (1ull << bit)) != 0); 567 __attribute__((unused)) bool ret = btr(&empty.mask[word], bit); 568 verify(ret); 569 verify((empty.mask[word] & (1ull << bit)) == 0); 570 } 571 572 verify(list.lock); 674 // Update the global count 675 __atomic_fetch_sub( &used.count, 1z, __ATOMIC_SEQ_CST); 676 677 // Update the bit mask 678 mask_clear((__cfa_readyQ_mask_t *)used.mask, w, STRICT); 679 } 680 681 #if defined(__CFA_WITH_VERIFY__) 682 /* paranoid */ verify(lane.last_id == kernelTLS.this_processor->id); 683 /* paranoid */ lane.last_id = -1u; 684 #endif 685 686 // For statistics, check the count before we release the lock 687 #if !defined(__CFA_NO_STATISTICS__) 688 int num = __atomic_load_n( &used.count, __ATOMIC_RELAXED ); 689 #endif 573 690 574 691 // Unlock and return 575 list.last_id = -1u; 576 __atomic_unlock(&list.lock); 577 verify(empty.count >= 0); 578 692 __atomic_unlock(&lane.lock); 693 694 // Update statistics 579 695 #if !defined(__CFA_NO_STATISTICS__) 580 696 tls.pick.pop.success++; 581 tls.full.value += num; 582 tls.full.count += 1; 583 #endif 584 697 tls.used.value += num; 698 tls.used.count += 1; 699 #endif 700 701 // return the popped 
thread 585 702 return thrd; 586 703 } 587 704 705 // Pop from the ready queue from a given cluster 588 706 __attribute__((hot)) thread_desc * pop(struct cluster * cltr) with (cltr->ready_queue) { 589 verify( list.count > 0 ); 590 while( __atomic_load_n( &empty.count, __ATOMIC_RELAXED ) != 0) { 707 /* paranoid */ verify( lanes.count > 0 ); 708 709 // As long as the list is not empty, try finding a lane that isn't empty and pop from it 710 while( __atomic_load_n( &used.count, __ATOMIC_RELAXED ) != 0) { 591 711 #if !defined(__CFA_READQ_NO_BITMASK__) 592 tls.pick.pop.maskrds++; 593 unsigned i, j; 594 { 595 #if !defined(__CFA_NO_SCHED_STATS__) 596 tls.pick.pop.maskrds++; 597 #endif 598 599 // Pick two lists at random 600 unsigned num = ((__atomic_load_n( &list.count, __ATOMIC_RELAXED ) - 1) >> 6) + 1; 601 602 unsigned ri = tls_rand(); 603 unsigned rj = tls_rand(); 604 605 unsigned wdxi = (ri >> 6u) % num; 606 unsigned wdxj = (rj >> 6u) % num; 607 608 size_t maski = __atomic_load_n( &empty.mask[wdxi], __ATOMIC_RELAXED ); 609 size_t maskj = __atomic_load_n( &empty.mask[wdxj], __ATOMIC_RELAXED ); 610 611 if(maski == 0 && maskj == 0) continue; 612 613 unsigned bi = rand_bit(ri, maski); 614 unsigned bj = rand_bit(rj, maskj); 615 616 verifyf(bi < 64, "%zu %u", maski, bi); 617 verifyf(bj < 64, "%zu %u", maskj, bj); 618 619 i = bi | (wdxi << 6); 620 j = bj | (wdxj << 6); 621 622 verifyf(i < list.count, "%u", wdxi << 6); 623 verifyf(j < list.count, "%u", wdxj << 6); 624 } 625 712 // If using bit masks 713 #if !defined(__CFA_NO_SCHED_STATS__) 714 tls.pick.pop.maskrds++; 715 #endif 716 717 // Pick two lists at random 718 unsigned ri = tls_rand(); 719 unsigned rj = tls_rand(); 720 721 // Find which __cfa_readyQ_mask_t the two lists belong 722 unsigned num = ((__atomic_load_n( &lanes.count, __ATOMIC_RELAXED ) - 1) >> 6) + 1; 723 unsigned wdxi = (ri >> 6u) % num; 724 unsigned wdxj = (rj >> 6u) % num; 725 726 // Get the actual __cfa_readyQ_mask_t 727 size_t maski = __atomic_load_n( &used.mask[wdxi], __ATOMIC_RELAXED ); 728 size_t maskj = __atomic_load_n( &used.mask[wdxj], __ATOMIC_RELAXED ); 729 730 // If both of these masks are empty, retry 731 if(maski == 0 && maskj == 0) continue; 732 733 // Pick one of the non-zero bits in the masks and get the bit indexes 734 unsigned bi = rand_bit(ri, maski); 735 unsigned bj = rand_bit(rj, maskj); 736 737 // some checks 738 /* paranoid */ verifyf(bi < 64, "%zu %u", maski, bi); 739 /* paranoid */ verifyf(bj < 64, "%zu %u", maskj, bj); 740 741 // get the general list index 742 unsigned i = bi | (wdxi << 6); 743 unsigned j = bj | (wdxj << 6); 744 745 // some more checks 746 /* paranoid */ verifyf(i < lanes.count, "%u", wdxi << 6); 747 /* paranoid */ verifyf(j < lanes.count, "%u", wdxj << 6); 748 749 // try popping from the 2 picked lists 626 750 struct thread_desc * thrd = try_pop(cltr, i, j); 627 751 if(thrd) return thrd; 628 752 #else 629 753 // Pick two lists at random 630 int i = tls_rand() % __atomic_load_n( &list.count, __ATOMIC_RELAXED ); 631 int j = tls_rand() % __atomic_load_n( &list.count, __ATOMIC_RELAXED ); 632 754 int i = tls_rand() % __atomic_load_n( &lanes.count, __ATOMIC_RELAXED ); 755 int j = tls_rand() % __atomic_load_n( &lanes.count, __ATOMIC_RELAXED ); 756 757 // try popping from the 2 picked lists 633 758 struct thread_desc * thrd = try_pop(cltr, i, j); 634 759 if(thrd) return thrd; … … 636 761 } 637 762 763 // All lanes where empty return 0p 638 764 return 0p; 639 765 } … … 645 771 { 646 772 int idx = 0; 647 for( w ; __cfa_ readyQ_mask_size ) {773 
for( w ; __cfa_lane_mask_size ) { 648 774 for( b ; 8 * sizeof(__cfa_readyQ_mask_t) ) { 649 bool is_empty = idx < l ist.count ? (ts(list.data[idx]) == 0) : true;650 bool should_be_empty = 0 == ( empty.mask[w] & (1z << b));775 bool is_empty = idx < lanes.count ? (ts(lanes.data[idx]) == 0) : true; 776 bool should_be_empty = 0 == (used.mask[w] & (1z << b)); 651 777 assertf(should_be_empty == is_empty, "Inconsistent list %d, mask expect : %d, actual is got %d", idx, should_be_empty, (bool)is_empty); 652 assert(__cfa_max_ readyQs > idx);778 assert(__cfa_max_lanes > idx); 653 779 idx++; 654 780 } … … 657 783 658 784 { 659 for( idx ; l ist.count ) {660 __intrusive_ ready_queue_t & sl = list.data[idx];661 assert(!l ist.data[idx].lock);785 for( idx ; lanes.count ) { 786 __intrusive_lane_t & sl = lanes.data[idx]; 787 assert(!lanes.data[idx].lock); 662 788 663 789 assert(head(sl)->link.prev == 0p ); … … 678 804 679 805 // Call this function of the intrusive list was moved using memcpy 680 // fixes the list so that the pointers back to anchors aren't left 681 // dangling 682 static inline void fix(__intrusive_ready_queue_t & ll) { 683 // if the list is not empty then follow he pointer 684 // and fix its reverse 685 if(ll.before.link.ts != 0l) { 806 // fixes the list so that the pointers back to anchors aren't left dangling 807 static inline void fix(__intrusive_lane_t & ll) { 808 // if the list is not empty then follow he pointer and fix its reverse 809 if(!is_empty(ll)) { 686 810 head(ll)->link.next->link.prev = head(ll); 687 811 tail(ll)->link.prev->link.next = tail(ll); … … 689 813 // Otherwise just reset the list 690 814 else { 691 tail(ll)->link.next = 0p;815 verify(tail(ll)->link.next == 0p); 692 816 tail(ll)->link.prev = head(ll); 693 817 head(ll)->link.next = tail(ll); 694 head(ll)->link.prev = 0p; 695 } 696 } 697 818 verify(head(ll)->link.prev == 0p); 819 } 820 } 821 822 // Grow the ready queue 698 823 void ready_queue_grow (struct cluster * cltr) { 824 // Lock the RWlock so no-one pushes/pops while we are changing the queue 699 825 uint_fast32_t last_size = ready_mutate_lock( *cltr ); 826 700 827 __cfaabi_dbg_print_safe("Kernel : Growing ready queue\n"); 701 check( cltr->ready_queue ); 702 828 829 // Make sure that everything is consistent 830 /* paranoid */ check( cltr->ready_queue ); 831 832 // grow the ready queue 703 833 with( cltr->ready_queue ) { 704 size_t ncount = l ist.count;834 size_t ncount = lanes.count; 705 835 706 836 // Check that we have some space left 707 if(ncount + 4 >= __cfa_max_readyQs) abort("Program attempted to create more than maximum number of Ready Queues (%zu)", __cfa_max_readyQs); 708 837 if(ncount + 4 >= __cfa_max_lanes) abort("Program attempted to create more than maximum number of Ready Queues (%zu)", __cfa_max_lanes); 838 839 // increase count 709 840 ncount += 4; 710 841 711 // Allocate new array 712 l ist.data = alloc(list.data, ncount);842 // Allocate new array (uses realloc and memcpies the data) 843 lanes.data = alloc(lanes.data, ncount); 713 844 714 845 // Fix the moved data 715 for( idx; (size_t)l ist.count ) {716 fix(l ist.data[idx]);846 for( idx; (size_t)lanes.count ) { 847 fix(lanes.data[idx]); 717 848 } 718 849 719 850 // Construct new data 720 for( idx; (size_t)l ist.count ~ ncount) {721 (l ist.data[idx]){};851 for( idx; (size_t)lanes.count ~ ncount) { 852 (lanes.data[idx]){}; 722 853 } 723 854 724 855 // Update original 725 list.count = ncount; 726 // fields in empty don't need to change 856 lanes.count = ncount; 857 858 // fields in 'used' don't 
need to change when growing 727 859 } 728 860 729 861 // Make sure that everything is consistent 730 check( cltr->ready_queue ); 862 /* paranoid */ check( cltr->ready_queue ); 863 731 864 __cfaabi_dbg_print_safe("Kernel : Growing ready queue done\n"); 865 866 // Unlock the RWlock 732 867 ready_mutate_unlock( *cltr, last_size ); 733 868 } 734 869 870 // Shrink the ready queue 735 871 void ready_queue_shrink(struct cluster * cltr) { 872 // Lock the RWlock so no-one pushes/pops while we are changing the queue 736 873 uint_fast32_t last_size = ready_mutate_lock( *cltr ); 874 737 875 __cfaabi_dbg_print_safe("Kernel : Shrinking ready queue\n"); 876 877 // Make sure that everything is consistent 878 /* paranoid */ check( cltr->ready_queue ); 879 738 880 with( cltr->ready_queue ) { 881 // Make sure that the total thread count stays the same 739 882 #if defined(__CFA_WITH_VERIFY__) 740 883 size_t nthreads = 0; 741 for( idx; (size_t)l ist.count ) {742 nthreads += l ist.data[idx].count;884 for( idx; (size_t)lanes.count ) { 885 nthreads += lanes.data[idx].count; 743 886 } 744 887 #endif 745 888 746 size_t ocount = l ist.count;889 size_t ocount = lanes.count; 747 890 // Check that we have some space left 748 891 if(ocount < 8) abort("Program attempted to destroy more Ready Queues than were created"); 749 892 750 list.count -= 4; 893 // reduce the actual count so push doesn't use the old queues 894 lanes.count -= 4; 895 verify(ocount > lanes.count); 896 897 // for printing count the number of displaced threads 898 #if defined(__CFA_DEBUG_PRINT__) 899 __attribute__((unused)) size_t displaced = 0; 900 #endif 751 901 752 902 // redistribute old data 753 verify(ocount > list.count); 754 __attribute__((unused)) size_t displaced = 0; 755 for( idx; (size_t)list.count ~ ocount) { 756 // This is not strictly needed but makes checking invariants much easier 757 bool locked = __atomic_try_acquire(&list.data[idx].lock); 903 for( idx; (size_t)lanes.count ~ ocount) { 904 // Lock is not strictly needed but makes checking invariants much easier 905 bool locked = __atomic_try_acquire(&lanes.data[idx].lock); 758 906 verify(locked); 759 while(0 != ts(list.data[idx])) { 907 908 // As long as we can pop from this lane to push the threads somewhere else in the queue 909 while(!is_empty(lanes.data[idx])) { 760 910 struct thread_desc * thrd; 761 911 __attribute__((unused)) bool _; 762 [thrd, _] = pop(l ist.data[idx]);763 verify(thrd); 912 [thrd, _] = pop(lanes.data[idx]); 913 764 914 push(cltr, thrd); 765 displaced++; 915 916 // for printing count the number of displaced threads 917 #if defined(__CFA_DEBUG_PRINT__) 918 displaced++; 919 #endif 766 920 } 767 921 768 __atomic_unlock(&list.data[idx].lock); 922 mask_clear((__cfa_readyQ_mask_t *)used.mask, idx, NOCHECK); 923 924 // Unlock the lane 925 __atomic_unlock(&lanes.data[idx].lock); 769 926 770 927 // TODO print the queue statistics here 771 928 772 ^(l ist.data[idx]){};929 ^(lanes.data[idx]){}; 773 930 } 774 931 775 932 __cfaabi_dbg_print_safe("Kernel : Shrinking ready queue displaced %zu threads\n", displaced); 776 933 777 // clear the now unused masks 778 { 779 __cfa_readyQ_mask_t fword, fbit, lword, lbit; 780 [fbit, fword] = extract(ocount); 781 [lbit, lword] = extract(list.count); 782 783 // For now assume that all queues where coverd by the same bitmask 784 // This is highly probable as long as grow and shrink use groups of 4 785 // exclusively 786 verify(fword == lword); 787 __cfa_readyQ_mask_t clears = ~0; 788 789 for( b ; lbit ~ fbit ) { 790 clears ^= 1 << b; 934 // 
recompute the used.count instead of maintaining it 935 used.count = 0; 936 for( i ; __cfa_lane_mask_size ) { 937 used.count += __builtin_popcountl(used.mask[i]); 938 } 939 940 // Allocate new array (uses realloc and memcpies the data) 941 lanes.data = alloc(lanes.data, lanes.count); 942 943 // Fix the moved data 944 for( idx; (size_t)lanes.count ) { 945 fix(lanes.data[idx]); 946 } 947 948 // Make sure that the total thread count stayed the same 949 #if defined(__CFA_WITH_VERIFY__) 950 for( idx; (size_t)lanes.count ) { 951 nthreads -= lanes.data[idx].count; 791 952 } 792 793 empty.mask[fword] &= clears; 794 795 796 empty.count = 0; 797 for( i ; 0 ~= lword ) { 798 empty.count += __builtin_popcountl(empty.mask[i]); 799 } 800 } 801 802 // Allocate new array 803 list.data = alloc(list.data, list.count); 804 805 // Fix the moved data 806 for( idx; (size_t)list.count ) { 807 fix(list.data[idx]); 808 } 809 810 #if defined(__CFA_WITH_VERIFY__) 811 for( idx; (size_t)list.count ) { 812 nthreads -= list.data[idx].count; 813 } 814 assertf(nthreads == 0, "Shrinking changed number of threads"); 953 verifyf(nthreads == 0, "Shrinking changed number of threads"); 815 954 #endif 816 955 } 817 956 818 957 // Make sure that everything is consistent 819 check( cltr->ready_queue ); 958 /* paranoid */ check( cltr->ready_queue ); 959 820 960 __cfaabi_dbg_print_safe("Kernel : Shrinking ready queue done\n"); 961 962 // Unlock the RWlock 821 963 ready_mutate_unlock( *cltr, last_size ); 822 964 } … … 832 974 __atomic_fetch_add( &global_stats.pick.pop .success, tls.pick.pop .success, __ATOMIC_SEQ_CST ); 833 975 834 __atomic_fetch_add( &global_stats. full.value, tls.full.value, __ATOMIC_SEQ_CST );835 __atomic_fetch_add( &global_stats. full.count, tls.full.count, __ATOMIC_SEQ_CST );976 __atomic_fetch_add( &global_stats.used.value, tls.used.value, __ATOMIC_SEQ_CST ); 977 __atomic_fetch_add( &global_stats.used.count, tls.used.count, __ATOMIC_SEQ_CST ); 836 978 } 837 979 #endif -
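Two pieces of ready_queue.cfa are worth isolating. First, rand_bit picks a uniformly random set bit of a mask: it draws a random rank among the set bits and, on CPUs with BMI2, uses _pdep_u64 to deposit a single bit at that rank's position (the long branchless routine kept in the non-BMI2 branch is the x86_64 fallback). Second, pop uses the power-of-two-choices heuristic: it picks two lanes through the bit mask and pops from the one whose head carries the older timestamp. A stand-alone sketch of the BMI2 path, under illustrative names and assuming an x86_64 target built with -mbmi2:

    #include <stdint.h>
    #include <immintrin.h>   // _pdep_u64, requires -mbmi2

    // Return the position of a random set bit of 'mask', chosen by 'rnum';
    // returns 0 when the mask is empty.
    static inline unsigned pick_random_set_bit(unsigned rnum, uint64_t mask) {
        // Rank of the chosen bit among the set bits of the mask.
        unsigned rank = mask ? rnum % (unsigned)__builtin_popcountll(mask) : 0;

        // _pdep_u64 scatters the single bit (1 << rank) into the positions
        // occupied by the set bits of 'mask', mapping "the rank-th set bit"
        // to its absolute position.
        uint64_t picked = _pdep_u64(1ull << rank, mask);
        return picked ? (unsigned)__builtin_ctzll(picked) : 0;
    }

Because only the two picked lanes are examined, contention stays low while the timestamp comparison still biases work toward the oldest waiting thread.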
libcfa/src/stdhdr/assert.h
r75ca7f4  rdca5802
    33        33      #define verify(x) assert(x)
    34        34      #define verifyf(x, ...) assertf(x, __VA_ARGS__)
              35    + #define verifyfail(...)
    35        36      #define __CFA_WITH_VERIFY__
    36        37      #else
    37        38      #define verify(x)
    38        39      #define verifyf(x, ...)
              40    + #define verifyfail(...)
    39        41      #endif
    40        42
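The verify family mirrors assert but is driven by the library's own debug switch: with __CFA_WITH_VERIFY__ defined the checks expand to assert/assertf, otherwise they compile away, which is why the paranoid checks added throughout this changeset cost nothing in release builds (the new verifyfail currently expands to nothing in both configurations). A minimal plain-C illustration of the same pattern, with made-up names:

    #include <assert.h>
    #include <stdio.h>

    // Debug-only checks in the style of verify()/verifyf().
    #ifdef MYLIB_DEBUG
        #define my_verify(x)       assert(x)
        #define my_verifyf(x, ...) do { if (!(x)) { fprintf(stderr, __VA_ARGS__); assert(x); } } while (0)
    #else
        #define my_verify(x)       ((void)0)
        #define my_verifyf(x, ...) ((void)0)
    #endif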