Context Navigation

← Previous Changeset
Next Changeset →

Changeset dca5802

Timestamp:

Jan 30, 2020, 1:40:05 PM (6 years ago)

Author:

Thierry Delisle <tdelisle@…>

Branches:

ADT, arm-eh, ast-experimental, enum, forall-pointer-decay, jacob/cs343-translation, master, new-ast, new-ast-unique-expr, pthread-emulation, qualifiedEnum

Children:

Parents:

Message:

Started doing some of the x86 implementations and some changes after a code review

Location:

Files:

: 5 edited

bits/defs.hfa (modified) (3 diffs)
concurrency/kernel.hfa (modified) (5 diffs)
concurrency/kernel_private.hfa (modified) (4 diffs)
concurrency/ready_queue.cfa (modified) (25 diffs)
stdhdr/assert.h (modified) (1 diff)

Legend:

: Unmodified
: Added
: Removed

libcfa/src/bits/defs.hfa

-              r75ca7f4
+              rdca5802
 // #define __CFA_NO_BIT_TEST_AND_SET__
+static inline bool bts(volatile unsigned long long int * target, unsigned long long int bit ) {
+#if defined( __i386 )
+// Atomically set bit number 'bit' in the word pointed to by 'target' (i386 version)
+// returns true if the bit was already set before the call, false otherwise
+static inline bool __atomic_bts(volatile unsigned long int * target, unsigned long int bit ) {
+        #if defined(__CFA_NO_BIT_TEST_AND_SET__)
+        // Fallback without the bts instruction: fetch-or a single-bit mask
+        // and look at the value the word had before the operation
+        unsigned long int mask = 1ul << bit;
+        unsigned long int ret = __atomic_fetch_or(target, mask, (int)__ATOMIC_RELAXED);
+        return (ret & mask) != 0;
+    #else
+        // The carry flag receives the previous value of the bit ("=@ccc").
+        // '*target' is declared read-write ("+m") because the instruction modifies it,
+        // as an input-only operand the compiler could keep using a stale copy of the word.
+        int result = 0;
+        asm volatile(
+            "LOCK btsl %[bit], %[target]\n\t"
+            : "=@ccc" (result), [target] "+m" (*target)
+            : [bit] "r" (bit)
+        );
+        return result != 0;
+    #endif
+}
+// Atomically clear bit number 'bit' in the word pointed to by 'target' (i386 version)
+// returns true if the bit was set before the call, false otherwise
+static inline bool __atomic_btr(volatile unsigned long int * target, unsigned long int bit ) {
+        #if defined(__CFA_NO_BIT_TEST_AND_SET__)
+        // Fallback without the btr instruction: fetch-and the inverted single-bit mask
+        // and look at the value the word had before the operation
+        unsigned long int mask = 1ul << bit;
+        unsigned long int ret = __atomic_fetch_and(target, ~mask, (int)__ATOMIC_RELAXED);
+        return (ret & mask) != 0;
+        #else
+        // The carry flag receives the previous value of the bit ("=@ccc").
+        // '*target' is declared read-write ("+m") because the instruction modifies it,
+        // as an input-only operand the compiler could keep using a stale copy of the word.
+        int result = 0;
+        asm volatile(
+            "LOCK btrl %[bit], %[target]\n\t"
+            : "=@ccc" (result), [target] "+m" (*target)
+            : [bit] "r" (bit)
+        );
+        return result != 0;
+    #endif
+}
+#elif defined( __x86_64 )
+static inline bool __atomic_bts(volatile unsigned long long int * target, unsigned long long int bit ) {
         #if defined(__CFA_NO_BIT_TEST_AND_SET__)
         unsigned long long int mask = 1ul << bit;
 …
+}
 static inline bool btr(volatile unsigned long long int * target, unsigned long long int bit ) {
+static inline bool __atomic_btr(volatile unsigned long long int * target, unsigned long long int bit ) {
         #if defined(__CFA_NO_BIT_TEST_AND_SET__)
         unsigned long long int mask = 1ul << bit;
 …
     #endif
+}
+#elif defined( __ARM_ARCH )
+    #error __atomic_bts and __atomic_btr not implemented for arm
+#else
+        #error uknown hardware architecture
+#endif

libcfa/src/concurrency/kernel.hfa

-              r75ca7f4
+              rdca5802
 //-----------------------------------------------------------------------------
 // Cluster Tools
+// Cells used by the reader-writer lock
+// while not generic, they only rely on an opaque pointer
 struct __processor_id;
 // Reader-Writer lock protecting the ready-queue
+// while this lock is mostly generic, some aspects
+// have been hard-coded for the ready-queue for
+// simplicity and performance
 struct __clusterRWLock_t {
         // total cachelines allocated
 …
 void ^?{}(__clusterRWLock_t & this);
 // Underlying sub quues of the ready queue
 struct __attribute__((aligned(128))) __intrusive_ready_queue_t {
+// Intrusive lanes which are used by the relaxed ready queue
+struct __attribute__((aligned(128))) __intrusive_lane_t {
         // spin lock protecting the queue
         volatile bool lock;
-        unsigned int last_id;
-        unsigned int count;
         // anchor for the head and the tail of the queue
 …
         } before, after;
+#if defined(__CFA_WITH_VERIFY__)
+        // id of last processor to acquire the lock
+        // needed only to check for mutual exclusion violations
+        unsigned int last_id;
+        // number of items on this list
+        // needed only to check for deadlocks
+        unsigned int count;
+#endif
         // Optional statistic counters
         #if !defined(__CFA_NO_SCHED_STATS__)
 …
 };
 void  ?{}(__intrusive_ready_queue_t & this);
 void ^?{}(__intrusive_ready_queue_t & this);
+void  ?{}(__intrusive_lane_t & this);
+void ^?{}(__intrusive_lane_t & this);
 typedef unsigned long long __cfa_readyQ_mask_t;
 …
 // };
 #define __cfa_readyQ_mask_size ((64 - sizeof(size_t)) / sizeof(__cfa_readyQ_mask_t))
 #define __cfa_max_readyQs (__cfa_readyQ_mask_size * 8 * sizeof(__cfa_readyQ_mask_t))
+#define __cfa_lane_mask_size ((64 - sizeof(size_t)) / sizeof(__cfa_readyQ_mask_t))
+#define __cfa_max_lanes (__cfa_lane_mask_size * 8 * sizeof(__cfa_readyQ_mask_t))
 //TODO adjust cache size to ARCHITECTURE
+// Structure holding the relaxed ready queue
 struct __attribute__((aligned(128))) __ready_queue_t {
+        // Data tracking how many/which lanes are used
+        // Aligned to 128 for cache locality
         struct {
+                // number of non-empty lanes
                 volatile size_t count;
+                volatile __cfa_readyQ_mask_t mask[ __cfa_readyQ_mask_size ];
+        } empty;
+                // bit mask, set bits identify which lanes are non-empty
+                volatile __cfa_readyQ_mask_t mask[ __cfa_lane_mask_size ];
+        } used;
+        // Data tracking the actual lanes
+        // On a separate cacheline from the used struct since
+        // used can change on each push/pop but this data
+        // only changes on shrink/grow
         struct __attribute__((aligned(64))) {
+                __intrusive_ready_queue_t * volatile data;
+                // Array of lanes
+                __intrusive_lane_t * volatile data;
+                // Number of lanes (empty or not)
                 volatile size_t count;
+        } list;
+        } lanes;
+        // Statistics
         #if !defined(__CFA_NO_STATISTICS__)
                 __attribute__((aligned(64))) struct {
                         struct {
+                                // Push statistic
                                 struct {
+                                        // number of attempts at pushing something
                                         volatile size_t attempt;
+                                        // number of successes at pushing
                                         volatile size_t success;
                                 } push;
+                                // Pop statistic
                                 struct {
+                                        // number of reads of the mask
+                                        // picking an empty __cfa_readyQ_mask_t counts here
+                                        // but not as an attempt
                                         volatile size_t maskrds;
+                                        // number of attempts at popping something
                                         volatile size_t attempt;
+                                        // number of successes at popping
                                         volatile size_t success;
                                 } pop;
                         } pick;
+                        // stats on the "used" struct of the queue
+                        // tracks average number of queues that are not empty
+                        // when pushing / popping
                         struct {
                                 volatile size_t value;
                                 volatile size_t count;
                         } full;
+                        } used;
                 } global_stats;

libcfa/src/concurrency/kernel_private.hfa

-              r75ca7f4
+              rdca5802
 // Concurrent with doregister/unregister,
 //    i.e., threads can be added at any point during or between the entry/exit
+//-----------------------------------------------------------------------
+// simple spinlock underlying the RWLock
+// Blocking acquire
 static inline void __atomic_acquire(volatile bool * ll) {
         while( __builtin_expect(__atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST), false) ) {
 …
+}
+// Non-Blocking acquire
+// makes a single attempt at taking the spinlock, returns true if the lock was obtained
 static inline bool __atomic_try_acquire(volatile bool * ll) {
+        // the exchange hands back the previous flag value:
+        // false means the lock was free and now belongs to the caller
+        const bool was_held = __atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST);
+        return !was_held;
+}
+// Release
 static inline void __atomic_unlock(volatile bool * ll) {
         /* paranoid */ verify(*ll);
 …
         /*paranoid*/ verify(iproc < ready);
         /*paranoid*/ verify(data[iproc].lock);
         __atomic_store_n(&data[iproc].lock, false, __ATOMIC_RELEASE);
+        __atomic_unlock(&data[iproc].lock);
+}
 …
 uint_fast32_t ready_mutate_lock( struct cluster & cltr );
 void ready_mutate_unlock( struct cluster & cltr, uint_fast32_t );
+void ready_mutate_unlock( struct cluster & cltr, uint_fast32_t /* value returned by lock */ );
 //=======================================================================
 // Ready-Queue API
+//-----------------------------------------------------------------------
+// push thread onto a ready queue for a cluster
+// returns true if the list was previously empty, false otherwise
 __attribute__((hot)) bool push(struct cluster * cltr, struct thread_desc * thrd);
+//-----------------------------------------------------------------------
+// pop thread from the ready queue of a cluster
+// returns 0p if empty
 __attribute__((hot)) thread_desc * pop(struct cluster * cltr);
+//-----------------------------------------------------------------------
+// Increase the width of the ready queue (number of lanes) by 4
 void ready_queue_grow  (struct cluster * cltr);
+//-----------------------------------------------------------------------
+// Decrease the width of the ready queue (number of lanes) by 4
 void ready_queue_shrink(struct cluster * cltr);
+//-----------------------------------------------------------------------
+// Statistics call made at the end of each thread to register statistics
 #if !defined(__CFA_NO_STATISTICS__)
 void stats_tls_tally(struct cluster * cltr);

libcfa/src/concurrency/ready_queue.cfa

-              r75ca7f4
+              rdca5802
 static const size_t cache_line_size = 64;
+static inline unsigned __max_processors_fallback() {
+        #ifdef __CFA_MAX_PROCESSORS__
+                return __CFA_MAX_PROCESSORS__;
+        #else
+                // No overriden function, no environment variable, no define
+                // fall back to a magic number
+                return 128;
+        #endif
+}
+// No overriden function, no environment variable, no define
+// fall back to a magic number
+#ifndef __CFA_MAX_PROCESSORS__
+        #define __CFA_MAX_PROCESSORS__ 128
+#endif
+// returns the maximum number of processors the RWLock support
 __attribute__((weak)) unsigned __max_processors() {
         const char * max_cores_s = getenv("CFA_MAX_PROCESSORS");
         if(!max_cores_s) {
                 __cfaabi_dbg_print_nolock("No CFA_MAX_PROCESSORS in ENV");
                 return __max_processors_fallback();
+                return __CFA_MAX_PROCESSORS__;
+        }
 …
         if(max_cores_l < 1 || max_cores_l > 65535) {
                 __cfaabi_dbg_print_nolock("CFA_MAX_PROCESSORS out of range : %ld", max_cores_l);
                 return __max_processors_fallback();
+                return __CFA_MAX_PROCESSORS__;
+        }
         if('\0' != *endptr) {
                 __cfaabi_dbg_print_nolock("CFA_MAX_PROCESSORS not a decimal number : %s", max_cores_s);
                 return __max_processors_fallback();
+                return __CFA_MAX_PROCESSORS__;
+        }
 …
+}
+static inline unsigned rand_bit(unsigned rnum, size_t mask) {
+        verify(sizeof(mask) == 8);
+// Picks a random 1 bit in 'mask' according to random number 'rnum'.
+static inline unsigned rand_bit(unsigned rnum, __cfa_readyQ_mask_t mask) {
+#if defined( __i386 )
+        static_assert(sizeof(mask) == 4);
+        unsigned bit = mask ? rnum % __builtin_popcount(mask) : 0;
+        #if !defined(__BMI2__)
+                #error rand_bit not implemented for non __BMI2__ i386
+        #else
+                uint32_t picked = _pdep_u32(1ul << bit, mask);
+                return picked ? __builtin_ctz(picked) : 0;
+        #endif
+#elif defined( __x86_64 )
+        static_assert(sizeof(mask) == 8);
         unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0;
+#if !defined(__BMI2__)
+        uint64_t v = mask;   // Input value to find position with rank r.
+        unsigned int r = bit + 1;// Input: bit's desired rank [1-64].
+        unsigned int s;      // Output: Resulting position of bit with rank r [1-64]
+        uint64_t a, b, c, d; // Intermediate temporaries for bit count.
+        unsigned int t;      // Bit count temporary.
+        // Do a normal parallel bit count for a 64-bit integer,
+        // but store all intermediate steps.
+        a =  v - ((v >> 1) & ~0UL/3);
+        b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
+        c = (b + (b >> 4)) & ~0UL/0x11;
+        d = (c + (c >> 8)) & ~0UL/0x101;
+        t = (d >> 32) + (d >> 48);
+        // Now do branchless select!
+        s  = 64;
+        s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
+        t  = (d >> (s - 16)) & 0xff;
+        s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
+        t  = (c >> (s - 8)) & 0xf;
+        s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
+        t  = (b >> (s - 4)) & 0x7;
+        s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
+        t  = (a >> (s - 2)) & 0x3;
+        s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
+        t  = (v >> (s - 1)) & 0x1;
+        s -= ((t - r) & 256) >> 8;
+        return s - 1;
+        #if !defined(__BMI2__)
+                uint64_t v = mask;   // Input value to find position with rank r.
+                unsigned int r = bit + 1;// Input: bit's desired rank [1-64].
+                unsigned int s;      // Output: Resulting position of bit with rank r [1-64]
+                uint64_t a, b, c, d; // Intermediate temporaries for bit count.
+                unsigned int t;      // Bit count temporary.
+                // Do a normal parallel bit count for a 64-bit integer,
+                // but store all intermediate steps.
+                a =  v - ((v >> 1) & ~0UL/3);
+                b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
+                c = (b + (b >> 4)) & ~0UL/0x11;
+                d = (c + (c >> 8)) & ~0UL/0x101;
+                t = (d >> 32) + (d >> 48);
+                // Now do branchless select!
+                s  = 64;
+                s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
+                t  = (d >> (s - 16)) & 0xff;
+                s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
+                t  = (c >> (s - 8)) & 0xf;
+                s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
+                t  = (b >> (s - 4)) & 0x7;
+                s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
+                t  = (a >> (s - 2)) & 0x3;
+                s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
+                t  = (v >> (s - 1)) & 0x1;
+                s -= ((t - r) & 256) >> 8;
+                return s - 1;
+        #else
+                uint64_t picked = _pdep_u64(1ul << bit, mask);
+                return picked ? __builtin_ctzl(picked) : 0;
+        #endif
+#elif defined( __ARM_ARCH )
+        #error rand_bit not implemented for arm
 #else
+        uint64_t picked = _pdep_u64(1ul << bit, mask);
+        return picked ? __builtin_ctzl(picked) : 0;
+        #error uknown hardware architecture
 #endif
+}
+static inline __cfa_readyQ_mask_t readyQ_mask_full       () { return (8 * sizeof(__cfa_readyQ_mask_t)) - 1; }
+static inline __cfa_readyQ_mask_t readyQ_mask_shit_length() { return (8 * sizeof(__cfa_readyQ_mask_t)) - __builtin_clzl(readyQ_mask_full()); }
+//-----------------------------------------------------------------------------
+// Helpers used by extract
+// (_mask_bitsidx() & X) returns a bit index valid for a __cfa_readyQ_mask_t, where X is any integer
+static inline __cfa_readyQ_mask_t _mask_bitsidx () { return (8 * sizeof(__cfa_readyQ_mask_t)) - 1; }
+// (X >> _mask_shiftidx()) returns an index into an array of __cfa_readyQ_mask_t
+// uses the 'long long' flavour of clz so the leading zeros are counted on the full width
+// of the value returned by _mask_bitsidx(), even where 'long' is narrower than the mask type
+static inline __cfa_readyQ_mask_t _mask_shiftidx() { return (8 * sizeof(__cfa_readyQ_mask_t)) - __builtin_clzll(_mask_bitsidx()); }
+// Assuming a large bit mask represented as an array of __cfa_readyQ_mask_t
+// Given an index into the large mask, returns the bit index and which __cfa_readyQ_mask_t index in the array
+// the result is the tuple [bit, word] : word selects the array element, bit the position inside it
 static inline [__cfa_readyQ_mask_t, __cfa_readyQ_mask_t] extract(__cfa_readyQ_mask_t idx) {
         __cfa_readyQ_mask_t word = idx >> readyQ_mask_shit_length();
         __cfa_readyQ_mask_t bit  = idx &  readyQ_mask_full();
+        __cfa_readyQ_mask_t word = idx >> _mask_shiftidx();
+        __cfa_readyQ_mask_t bit  = idx &  _mask_bitsidx();
         return [bit, word];
+}
 …
 //=======================================================================
 // Get the head pointer (one before the first element) from the anchor
 static inline thread_desc * head(const __intrusive_ready_queue_t & this) {
+static inline thread_desc * head(const __intrusive_lane_t & this) {
         thread_desc * rhead = (thread_desc *)(
                 (uintptr_t)( &this.before ) - offsetof( thread_desc, link )
 …
 // Get the tail pointer (one after the last element) from the anchor
 static inline thread_desc * tail(const __intrusive_ready_queue_t & this) {
+static inline thread_desc * tail(const __intrusive_lane_t & this) {
         thread_desc * rtail = (thread_desc *)(
                 (uintptr_t)( &this.after ) - offsetof( thread_desc, link )
 …
 // Ctor
 void ?{}( __intrusive_ready_queue_t & this ) {
+void ?{}( __intrusive_lane_t & this ) {
         this.lock = false;
         this.last_id = -1u;
 …
         /* paranoid */ verify(&tail(this)->link.prev == &this.after .link.prev );
         /* paranoid */ verify(&tail(this)->link.next == &this.after .link.next );
         /* paranoid */ verify(sizeof(__intrusive_ready_queue_t) == 128);
+        /* paranoid */ verify(sizeof(__intrusive_lane_t) == 128);
         /* paranoid */ verify(sizeof(this) == 128);
         /* paranoid */ verify(__alignof__(__intrusive_ready_queue_t) == 128);
+        /* paranoid */ verify(__alignof__(__intrusive_lane_t) == 128);
         /* paranoid */ verify(__alignof__(this) == 128);
         /* paranoid */ verifyf(((intptr_t)(&this) % 128) == 0, "Expected address to be aligned %p %% 128 == %zd", &this, ((intptr_t)(&this) % 128));
         /* paranoid */ verifyf(readyQ_mask_shit_length() == 6 , "%zu", readyQ_mask_shit_length());
         /* paranoid */ verifyf(readyQ_mask_full()        == 63, "%zu", readyQ_mask_full());
+        /* paranoid */ verifyf(_mask_shiftidx() == 6 , "%zu", _mask_shiftidx());
+        /* paranoid */ verifyf(_mask_bitsidx () == 63, "%zu", _mask_bitsidx());
+}
 // Dtor is trivial
 void ^?{}( __intrusive_ready_queue_t & this ) {
+void ^?{}( __intrusive_lane_t & this ) {
         // Make sure the list is empty
         /* paranoid */ verify(head(this)->link.prev == 0p );
 …
+}
+bool push(__intrusive_ready_queue_t & this, thread_desc * node) {
+        verify(this.lock);
+        verify(node->link.ts != 0);
+        verify(node->link.next == 0p);
+        verify(node->link.prev == 0p);
+        this.count++;
+        if(this.before.link.ts == 0l) {
+                verify(tail(this)->link.next == 0p);
+                verify(tail(this)->link.prev == head(this));
+                verify(head(this)->link.next == tail(this));
+                verify(head(this)->link.prev == 0p);
+        }
+// Push a thread onto this lane
+// returns true if the lane was empty before the push, false otherwise
+bool push(__intrusive_lane_t & this, thread_desc * node) {
+        #if defined(__CFA_WITH_VERIFY__)
+                /* paranoid */ verify(this.lock);
+                /* paranoid */ verify(node->link.ts != 0);
+                /* paranoid */ verify(node->link.next == 0p);
+                /* paranoid */ verify(node->link.prev == 0p);
+                this.count++;
+                if(this.before.link.ts == 0l) {
+                        /* paranoid */ verify(tail(this)->link.next == 0p);
+                        /* paranoid */ verify(tail(this)->link.prev == head(this));
+                        /* paranoid */ verify(head(this)->link.next == tail(this));
+                        /* paranoid */ verify(head(this)->link.prev == 0p);
+                }
+        #endif
         // Get the relevant nodes locally
 …
         // Update stats
         #ifndef __CFA_NO_SCHED_STATS__
+        #if !defined(__CFA_NO_SCHED_STATS__)
                 this.stat.diff++;
                 this.stat.push++;
 …
         if(this.before.link.ts == 0l) {
                 this.before.link.ts = node->link.ts;
                 verify(node->link.prev == head(this));
+                /* paranoid */ verify(node->link.prev == head(this));
                 return true;
+        }
 …
+}
+[thread_desc *, bool] pop(__intrusive_ready_queue_t & this) {
+        verify(this.lock);
+        verify(this.before.link.ts != 0ul);
+// Pop a thread from this lane (must be non-empty)
+// returns the popped thread
+// and true if the pop emptied the lane, false otherwise
+[thread_desc *, bool] pop(__intrusive_lane_t & this) {
+        /* paranoid */ verify(this.lock);
+        /* paranoid */ verify(this.before.link.ts != 0ul);
+        // Get anchors locally
         thread_desc * head = head(this);
         thread_desc * tail = tail(this);
+        // Get the relevant nodes locally
         thread_desc * node = head->link.next;
         thread_desc * next = node->link.next;
+        if(node == tail) {
+                verify(false);
+                verify(this.before.link.ts == 0ul);
+                verify(tail(this)->link.next == 0p);
+                verify(tail(this)->link.prev == head(this));
+                verify(head(this)->link.next == tail(this));
+                verify(head(this)->link.prev == 0p);
+                return [0p, false];
+        }
+        this.count--;
+        /* paranoid */ verify(node);
+        #if defined(__CFA_WITH_VERIFY__)
+                this.count--;
+                /* paranoid */ verify(node != tail);
+                /* paranoid */ verify(node);
+        #endif
+        // Do the pop
         head->link.next = next;
         next->link.prev = head;
+        node->link.[next, prev] = 0p;
+        // Update head time stamp
+        this.before.link.ts = next->link.ts;
+        // Update stats
         #ifndef __CFA_NO_SCHED_STATS__
                 this.stat.diff--;
 …
         #endif
+        // Check if we emptied list and return accordingly
         if(next == tail) {
+                this.before.link.ts = 0ul;
+                verify(tail(this)->link.next == 0p);
+                verify(tail(this)->link.prev == head(this));
+                verify(head(this)->link.next == tail(this));
+                verify(head(this)->link.prev == 0p);
+                node->link.[next, prev] = 0p;
+                /* paranoid */ verify(this.before.link.ts == 0);
+                /* paranoid */ verify(tail(this)->link.next == 0p);
+                /* paranoid */ verify(tail(this)->link.prev == head(this));
+                /* paranoid */ verify(head(this)->link.next == tail(this));
+                /* paranoid */ verify(head(this)->link.prev == 0p);
                 return [node, true];
+        }
         else {
+                verify(next->link.ts != 0);
+                this.before.link.ts = next->link.ts;
+                verify(this.before.link.ts != 0);
+                node->link.[next, prev] = 0p;
+                /* paranoid */ verify(next->link.ts != 0);
+                /* paranoid */ verify(this.before.link.ts != 0);
                 return [node, false];
+        }
+}
+static inline unsigned long long ts(__intrusive_ready_queue_t & this) {
+// Report whether this lane is empty
+// a lane is reported empty when the timestamp stored in its 'before' anchor is 0
+static inline bool is_empty(__intrusive_lane_t & this) {
+        const bool no_stamp = (0 == this.before.link.ts);
+        // sanity check : the element count must agree with the timestamp
+        verify( no_stamp == (0 == this.count) );
+        return no_stamp;
+}
+// Timestamp stored in the 'before' anchor of this lane
+static inline unsigned long long ts(__intrusive_lane_t & this) {
+        const unsigned long long stamp = this.before.link.ts;
+        // sanity check : the anchor must mirror the timestamp of the node that follows it
+        verify( stamp == this.before.link.next->link.ts );
+        return stamp;
+}
 …
 //=======================================================================
+// Thread local mirror of ready queue statistics
+#if !defined(__CFA_NO_STATISTICS__)
 static __attribute__((aligned(128))) thread_local struct {
         struct {
 …
                 size_t value;
                 size_t count;
         } full;
+        } used;
 } tls = {
         /* pick */{
 …
                 /* pop  */{ 0, 0, 0 },
         },
         /* full */{ 0, 0 }
+        /* used */{ 0, 0 }
 };
+#endif
 //-----------------------------------------------------------------------
 void ?{}(__ready_queue_t & this) with (this) {
         empty.count = 0;
         for( i ; __cfa_readyQ_mask_size ) {
                 empty.mask[i] = 0;
+        }
         list.data = alloc(4);
+        used.count = 0;
+        for( i ; __cfa_lane_mask_size ) {
+                used.mask[i] = 0;
+        }
+        lanes.data = alloc(4);
         for( i; 4 ) {
                 (list.data[i]){};
+        }
         list.count = 4;
+                (lanes.data[i]){};
+        }
+        lanes.count = 4;
         #if !defined(__CFA_NO_STATISTICS__)
 …
                 global_stats.pick.pop .success = 0;
                 global_stats.full.value = 0;
                 global_stats.full.count = 0;
+                global_stats.used.value = 0;
+                global_stats.used.count = 0;
         #endif
+}
 void ^?{}(__ready_queue_t & this) with (this) {
         verify( 4  == list .count );
         verify( 0  == empty.count );
+        verify( 4  == lanes.count );
+        verify( 0  == used .count );
         for( i; 4 ) {
                 ^(list.data[i]){};
+        }
         free(list.data);
+                ^(lanes.data[i]){};
+        }
+        free(lanes.data);
         #if defined(__CFA_WITH_VERIFY__)
                 for( i ; __cfa_readyQ_mask_size ) {
                         assert( 0 == empty.mask[i] );
+                for( i ; __cfa_lane_mask_size ) {
+                        assert( 0 == used.mask[i] );
+                }
         #endif
 …
 //-----------------------------------------------------------------------
+// Strictness argument taken by mask_set / mask_clear
+// it qualifies the "conditional" verify checks those functions make on the previous state of the bit
+enum mask_strictness {
+        // the caller expects the bit to change state (cleared before a set, set before a clear)
+        STRICT,
+        // the caller has no expectation on the previous state of the bit
+        NOCHECK
+};
+// Set a given bit in the bit mask array
+// strictness determines if the bit had to be cleared before
+//   mask   : array of __cfa_readyQ_mask_t forming one large bit mask
+//   index  : position of the bit in the large mask, split into [bit, word] by extract
+//   strict : STRICT if the bit is expected to be cleared on entry, NOCHECK otherwise
+static inline void mask_set(__cfa_readyQ_mask_t * mask, unsigned index, mask_strictness strict) {
+        // Extract the array and bit indexes
+        __cfa_readyQ_mask_t word;
+        __cfa_readyQ_mask_t bit;
+        [bit, word] = extract(index);
+        // Conditional check, trivially true unless the caller asked for STRICT
+        verifyf(
+                strict != STRICT || // only STRICT callers expect the bit to be cleared
+                ((mask[word] & (1ull << bit)) == 0),
+                "Before set %llu:%llu (%u), %llx & %llx", word, bit, index, mask[word], (1ull << bit)
+        );
+        // Atomically set the bit
+        __attribute__((unused)) bool ret = __atomic_bts(&mask[word], bit);
+        // Conditional check, trivially true unless the caller asked for STRICT
+        verifyf(
+                strict != STRICT || // only STRICT callers expect bts to report a cleared bit
+                !ret,
+                "Bit was not set but bts returned true"
+        );
+        // Unconditional check
+        verifyf(
+                (mask[word] & (1ull << bit)) != 0,
+                "After set %llu:%llu (%u), %llx & %llx", word, bit, index, mask[word], (1ull << bit)
+        );
+}
+// Clear a given bit in the bit mask array
+// strictness determines if the bit had to be set before
+//   mask   : array of __cfa_readyQ_mask_t forming one large bit mask
+//   index  : position of the bit in the large mask, split into [bit, word] by extract
+//   strict : STRICT if the bit is expected to be set on entry, NOCHECK otherwise
+static inline void mask_clear(__cfa_readyQ_mask_t * mask, unsigned index, mask_strictness strict) {
+        // Extract the array and bit indexes
+        __cfa_readyQ_mask_t word;
+        __cfa_readyQ_mask_t bit;
+        [bit, word] = extract(index);
+        // Conditional check, trivially true unless the caller asked for STRICT
+        verifyf(
+                strict != STRICT || // only STRICT callers expect the bit to be set
+                ((mask[word] & (1ull << bit)) != 0),
+                "Before clear %llu:%llu (%u), %llx & %llx", word, bit, index, mask[word], (1ull << bit)
+        );
+        // Atomically clear the bit
+        __attribute__((unused)) bool ret = __atomic_btr(&mask[word], bit);
+        // Conditional check, trivially true unless the caller asked for STRICT
+        verifyf(
+                strict != STRICT || // only STRICT callers expect btr to report a set bit
+                ret,
+                "Bit was set but btr returned false"
+        );
+        // Unconditional check
+        verifyf(
+                (mask[word] & (1ull << bit)) == 0,
+                "After clear %llu:%llu (%u), %llx & %llx", word, bit, index, mask[word], (1ull << bit)
+        );
+}
+//-----------------------------------------------------------------------
 __attribute__((hot)) bool push(struct cluster * cltr, struct thread_desc * thrd) with (cltr->ready_queue) {
+        // write timestamp
         thrd->link.ts = rdtscl();
+        while(true) {
+                // Pick a random list
+                unsigned i = tls_rand() % list.count;
+        // Try to pick a lane and lock it
+        unsigned i;
+        do {
+                // Pick the index of a lane
+                unsigned i = tls_rand() % lanes.count;
                 #if !defined(__CFA_NO_STATISTICS__)
 …
                 // If we can't lock it retry
+                if( !__atomic_try_acquire( &list.data[i].lock ) ) continue;
+                verify(list.data[i].last_id == -1u);
+                list.data[i].last_id = kernelTLS.this_processor->id;
+                __attribute__((unused)) size_t num = __atomic_load_n( &empty.count, __ATOMIC_RELAXED );
+                bool first = false;
+                verify( list.data[i].last_id == kernelTLS.this_processor->id );
+                verify( list.data[i].lock );
+                // Actually push it
+                if(push(list.data[i], thrd)) {
+                        size_t ret = __atomic_fetch_add( &empty.count, 1z, __ATOMIC_SEQ_CST);
+                        first = (ret == 0);
+                        __cfa_readyQ_mask_t word;
+                        __cfa_readyQ_mask_t bit;
+                        [bit, word] = extract(i);
+                        verifyf((empty.mask[word] & (1ull << bit)) == 0, "Before set %llu:%llu (%u), %llx & %llx", word, bit, i, empty.mask[word], (1ull << bit));
+                        __attribute__((unused)) bool ret = bts(&empty.mask[word], bit);
+                        verify(!(bool)ret);
+                        verifyf((empty.mask[word] & (1ull << bit)) != 0, "After set %llu:%llu (%u), %llx & %llx", word, bit, i, empty.mask[word], (1ull << bit));
+                }
+                verifyf( empty.count <= list.count, "Non-empty count (%zu) exceeds actual count (%zu)\n", empty.count, list.count );
+                verifyf( list.data[i].last_id == kernelTLS.this_processor->id, "Expected last processor to lock queue %u to be %u, was %u\n", i, list.data[i].last_id, kernelTLS.this_processor->id );
+                verifyf( list.data[i].lock, "List %u is not locked\n", i );
+                // Unlock and return
+                list.data[i].last_id = -1u;
+                __atomic_unlock( &list.data[i].lock );
+                #if !defined(__CFA_NO_STATISTICS__)
+                        tls.pick.push.success++;
+                        tls.full.value += num;
+                        tls.full.count += 1;
+                #endif
+                return first;
+        }
+        } while( !__atomic_try_acquire( &lanes.data[i].lock ) );
+        #if defined(__CFA_WITH_VERIFY__)
+                /* paranoid */ verify(lanes.data[i].last_id == -1u);
+                /* paranoid */ lanes.data[i].last_id = kernelTLS.this_processor->id;
+        #endif
+        __attribute__((unused)) size_t num = __atomic_load_n( &used.count, __ATOMIC_RELAXED );
+        bool first = false;
+        // Actually push it
+        bool lane_first = push(lanes.data[i], thrd);
+        // If this lane used to be empty we need to do more
+        if(lane_first) {
+                // Update the global count
+                size_t ret = __atomic_fetch_add( &used.count, 1z, __ATOMIC_SEQ_CST);
+                // Check if the entire queue used to be empty
+                first = (ret == 0);
+                // Update the bit mask
+                mask_set((__cfa_readyQ_mask_t *)used.mask, i, STRICT);
+        }
+        #if defined(__CFA_WITH_VERIFY__)
+                /* paranoid */ verifyf( used.count <= lanes.count, "Non-empty count (%zu) exceeds actual count (%zu)\n", used.count, lanes.count );
+                /* paranoid */ verifyf( lanes.data[i].last_id == kernelTLS.this_processor->id, "Expected last processor to lock queue %u to be %u, was %u\n", i, lanes.data[i].last_id, kernelTLS.this_processor->id );
+                /* paranoid */ verifyf( lanes.data[i].lock, "List %u is not locked\n", i );
+                /* paranoid */ lanes.data[i].last_id = -1u;
+        #endif
+        // Unlock and return
+        __atomic_unlock( &lanes.data[i].lock );
+        // Update statistics
+        #if !defined(__CFA_NO_STATISTICS__)
+                tls.pick.push.success++;
+                tls.used.value += num;
+                tls.used.count += 1;
+        #endif
+        // return whether or not the list was empty before this push
+        return first;
+}
 //-----------------------------------------------------------------------
+// Given 2 indexes, pick the list with the oldest push and try to pop from it
 static struct thread_desc * try_pop(struct cluster * cltr, unsigned i, unsigned j) with (cltr->ready_queue) {
         #if !defined(__CFA_NO_STATISTICS__)
 …
         // Pick the best list
         int w = i;
+        if( __builtin_expect(ts(list.data[j]) != 0, true) ) {
+                w = (ts(list.data[i]) < ts(list.data[j])) ? i : j;
+        }
+        __intrusive_ready_queue_t & list = list.data[w];
+        if( __builtin_expect(!is_empty(lanes.data[j]), true) ) {
+                w = (ts(lanes.data[i]) < ts(lanes.data[j])) ? i : j;
+        }
+        // Get relevant elements locally
+        __intrusive_lane_t & lane = lanes.data[w];
         // If list looks empty retry
         if( ts(list) == 0 ) return 0p;
+        if( is_empty(lane) ) return 0p;
         // If we can't get the lock retry
+        if( !__atomic_try_acquire(&list.lock) ) return 0p;
+        verify(list.last_id == -1u);
+        list.last_id = kernelTLS.this_processor->id;
+        verify(list.last_id == kernelTLS.this_processor->id);
+        __attribute__((unused)) int num = __atomic_load_n( &empty.count, __ATOMIC_RELAXED );
+        if( !__atomic_try_acquire(&lane.lock) ) return 0p;
+        #if defined(__CFA_WITH_VERIFY__)
+                /* paranoid */ verify(lane.last_id == -1u);
+                /* paranoid */ lane.last_id = kernelTLS.this_processor->id;
+        #endif
         // If list is empty, unlock and retry
+        if( ts(list) == 0 ) {
+                list.last_id = -1u;
+                __atomic_unlock(&list.lock);
+        if( is_empty(lane) ) {
+                #if defined(__CFA_WITH_VERIFY__)
+                        /* paranoid */ verify(lane.last_id == kernelTLS.this_processor->id);
+                        /* paranoid */ lane.last_id = -1u;
+                #endif
+                __atomic_unlock(&lane.lock);
                 return 0p;
+        }
+        {
-                __cfa_readyQ_mask_t word;
-                __cfa_readyQ_mask_t bit;
-                [bit, word] = extract(w);
-                verify((empty.mask[word] & (1ull << bit)) != 0);
+        }
-        verify(list.last_id == kernelTLS.this_processor->id);
-        verify(list.lock);
         // Actually pop the list
         struct thread_desc * thrd;
         bool emptied;
+        [thrd, emptied] = pop(list);
+        verify(thrd);
+        verify(list.last_id == kernelTLS.this_processor->id);
+        verify(list.lock);
+        [thrd, emptied] = pop(lane);
+        /* paranoid */ verify(thrd);
+        /* paranoid */ verify(lane.last_id == kernelTLS.this_processor->id);
+        /* paranoid */ verify(lane.lock);
+        // If this was the last element in the lane
         if(emptied) {
+                __atomic_fetch_sub( &empty.count, 1z, __ATOMIC_SEQ_CST);
+                __cfa_readyQ_mask_t word;
+                __cfa_readyQ_mask_t bit;
+                [bit, word] = extract(w);
+                verify((empty.mask[word] & (1ull << bit)) != 0);
+                __attribute__((unused)) bool ret = btr(&empty.mask[word], bit);
+                verify(ret);
+                verify((empty.mask[word] & (1ull << bit)) == 0);
+        }
+        verify(list.lock);
+                // Update the global count
+                __atomic_fetch_sub( &used.count, 1z, __ATOMIC_SEQ_CST);
+                // Update the bit mask
+                mask_clear((__cfa_readyQ_mask_t *)used.mask, w, STRICT);
+        }
+        #if defined(__CFA_WITH_VERIFY__)
+                /* paranoid */ verify(lane.last_id == kernelTLS.this_processor->id);
+                /* paranoid */ lane.last_id = -1u;
+        #endif
+        // For statistics, check the count before we release the lock
+        #if !defined(__CFA_NO_STATISTICS__)
+                int num = __atomic_load_n( &used.count, __ATOMIC_RELAXED );
+        #endif
         // Unlock and return
+        list.last_id = -1u;
+        __atomic_unlock(&list.lock);
+        verify(empty.count >= 0);
+        __atomic_unlock(&lane.lock);
+        // Update statistics
         #if !defined(__CFA_NO_STATISTICS__)
                 tls.pick.pop.success++;
+                tls.full.value += num;
+                tls.full.count += 1;
+        #endif
+                tls.used.value += num;
+                tls.used.count += 1;
+        #endif
+        // return the popped thread
         return thrd;
+}
+// Pop from the ready queue from a given cluster
 __attribute__((hot)) thread_desc * pop(struct cluster * cltr) with (cltr->ready_queue) {
+        verify( list.count > 0 );
+        while( __atomic_load_n( &empty.count, __ATOMIC_RELAXED ) != 0) {
+        /* paranoid */ verify( lanes.count > 0 );
+        // As long as the list is not empty, try finding a lane that isn't empty and pop from it
+        while( __atomic_load_n( &used.count, __ATOMIC_RELAXED ) != 0) {
                 #if !defined(__CFA_READQ_NO_BITMASK__)
+                        tls.pick.pop.maskrds++;
+                        unsigned i, j;
+                        {
+                                #if !defined(__CFA_NO_SCHED_STATS__)
+                                        tls.pick.pop.maskrds++;
+                                #endif
+                                // Pick two lists at random
+                                unsigned num = ((__atomic_load_n( &list.count, __ATOMIC_RELAXED ) - 1) >> 6) + 1;
+                                unsigned ri = tls_rand();
+                                unsigned rj = tls_rand();
+                                unsigned wdxi = (ri >> 6u) % num;
+                                unsigned wdxj = (rj >> 6u) % num;
+                                size_t maski = __atomic_load_n( &empty.mask[wdxi], __ATOMIC_RELAXED );
+                                size_t maskj = __atomic_load_n( &empty.mask[wdxj], __ATOMIC_RELAXED );
+                                if(maski == 0 && maskj == 0) continue;
+                                unsigned bi = rand_bit(ri, maski);
+                                unsigned bj = rand_bit(rj, maskj);
+                                verifyf(bi < 64, "%zu %u", maski, bi);
+                                verifyf(bj < 64, "%zu %u", maskj, bj);
+                                i = bi | (wdxi << 6);
+                                j = bj | (wdxj << 6);
+                                verifyf(i < list.count, "%u", wdxi << 6);
+                                verifyf(j < list.count, "%u", wdxj << 6);
+                        }
+                        // If using bit masks
+                        #if !defined(__CFA_NO_SCHED_STATS__)
+                                tls.pick.pop.maskrds++;
+                        #endif
+                        // Pick two lists at random
+                        unsigned ri = tls_rand();
+                        unsigned rj = tls_rand();
+                        // Find which __cfa_readyQ_mask_t the two lists belong
+                        unsigned num = ((__atomic_load_n( &lanes.count, __ATOMIC_RELAXED ) - 1) >> 6) + 1;
+                        unsigned wdxi = (ri >> 6u) % num;
+                        unsigned wdxj = (rj >> 6u) % num;
+                        // Get the actual __cfa_readyQ_mask_t
+                        size_t maski = __atomic_load_n( &used.mask[wdxi], __ATOMIC_RELAXED );
+                        size_t maskj = __atomic_load_n( &used.mask[wdxj], __ATOMIC_RELAXED );
+                        // If both of these masks are empty, retry
+                        if(maski == 0 && maskj == 0) continue;
+                        // Pick one of the non-zero bits in the masks and get the bit indexes
+                        unsigned bi = rand_bit(ri, maski);
+                        unsigned bj = rand_bit(rj, maskj);
+                        // some checks
+                        /* paranoid */ verifyf(bi < 64, "%zu %u", maski, bi);
+                        /* paranoid */ verifyf(bj < 64, "%zu %u", maskj, bj);
+                        // get the general list index
+                        unsigned i = bi | (wdxi << 6);
+                        unsigned j = bj | (wdxj << 6);
+                        // some more checks
+                        /* paranoid */ verifyf(i < lanes.count, "%u", wdxi << 6);
+                        /* paranoid */ verifyf(j < lanes.count, "%u", wdxj << 6);
+                        // try popping from the 2 picked lists
                         struct thread_desc * thrd = try_pop(cltr, i, j);
                         if(thrd) return thrd;
                 #else
                         // Pick two lists at random
+                        int i = tls_rand() % __atomic_load_n( &list.count, __ATOMIC_RELAXED );
+                        int j = tls_rand() % __atomic_load_n( &list.count, __ATOMIC_RELAXED );
+                        int i = tls_rand() % __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
+                        int j = tls_rand() % __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
+                        // try popping from the 2 picked lists
                         struct thread_desc * thrd = try_pop(cltr, i, j);
                         if(thrd) return thrd;
 …
+        }
+        // All lanes were empty, return 0p
         return 0p;
+}
 …
+                {
                         int idx = 0;
                         for( w ; __cfa_readyQ_mask_size ) {
+                        for( w ; __cfa_lane_mask_size ) {
                                 for( b ; 8 * sizeof(__cfa_readyQ_mask_t) ) {
                                         bool is_empty = idx < list.count ? (ts(list.data[idx]) == 0) : true;
                                         bool should_be_empty = 0 == (empty.mask[w] & (1z << b));
+                                        bool is_empty = idx < lanes.count ? (ts(lanes.data[idx]) == 0) : true;
+                                        bool should_be_empty = 0 == (used.mask[w] & (1z << b));
                                         assertf(should_be_empty == is_empty, "Inconsistent list %d, mask expect : %d, actual is got %d", idx, should_be_empty, (bool)is_empty);
                                         assert(__cfa_max_readyQs > idx);
+                                        assert(__cfa_max_lanes > idx);
                                         idx++;
+                                }
 …
+                {
                         for( idx ; list.count ) {
                                 __intrusive_ready_queue_t & sl = list.data[idx];
                                 assert(!list.data[idx].lock);
+                        for( idx ; lanes.count ) {
+                                __intrusive_lane_t & sl = lanes.data[idx];
+                                assert(!lanes.data[idx].lock);
                                 assert(head(sl)->link.prev == 0p );
 …
 // Call this function if the intrusive list was moved using memcpy
+// fixes the list so that the pointers back to anchors aren't left
+// dangling
+static inline void fix(__intrusive_ready_queue_t & ll) {
+        // if the list is not empty then follow the pointer
+        // and fix its reverse
+        if(ll.before.link.ts != 0l) {
+// fixes the list so that the pointers back to anchors aren't left dangling
+static inline void fix(__intrusive_lane_t & ll) {
+        // if the list is not empty then follow the pointer and fix its reverse
+        if(!is_empty(ll)) {
                 head(ll)->link.next->link.prev = head(ll);
                 tail(ll)->link.prev->link.next = tail(ll);
 …
         // Otherwise just reset the list
         else {
                 tail(ll)->link.next = 0p;
+                verify(tail(ll)->link.next == 0p);
                 tail(ll)->link.prev = head(ll);
                 head(ll)->link.next = tail(ll);
+                head(ll)->link.prev = 0p;
+        }
+}
+                verify(head(ll)->link.prev == 0p);
+        }
+}
+// Grow the ready queue
 void ready_queue_grow  (struct cluster * cltr) {
+        // Lock the RWlock so no-one pushes/pops while we are changing the queue
         uint_fast32_t last_size = ready_mutate_lock( *cltr );
         __cfaabi_dbg_print_safe("Kernel : Growing ready queue\n");
+        check( cltr->ready_queue );
+        // Make sure that everything is consistent
+        /* paranoid */ check( cltr->ready_queue );
+        // grow the ready queue
         with( cltr->ready_queue ) {
                 size_t ncount = list.count;
+                size_t ncount = lanes.count;
                 // Check that we have some space left
+                if(ncount + 4 >= __cfa_max_readyQs) abort("Program attempted to create more than maximum number of Ready Queues (%zu)", __cfa_max_readyQs);
+                if(ncount + 4 >= __cfa_max_lanes) abort("Program attempted to create more than maximum number of Ready Queues (%zu)", __cfa_max_lanes);
+                // increase count
                 ncount += 4;
                 // Allocate new array
                 list.data = alloc(list.data, ncount);
+                // Allocate new array (uses realloc and memcpies the data)
+                lanes.data = alloc(lanes.data, ncount);
                 // Fix the moved data
                 for( idx; (size_t)list.count ) {
                         fix(list.data[idx]);
+                for( idx; (size_t)lanes.count ) {
+                        fix(lanes.data[idx]);
+                }
                 // Construct new data
                 for( idx; (size_t)list.count ~ ncount) {
                         (list.data[idx]){};
+                for( idx; (size_t)lanes.count ~ ncount) {
+                        (lanes.data[idx]){};
+                }
                 // Update original
+                list.count = ncount;
+                // fields in empty don't need to change
+                lanes.count = ncount;
+                // fields in 'used' don't need to change when growing
+        }
         // Make sure that everything is consistent
+        check( cltr->ready_queue );
+        /* paranoid */ check( cltr->ready_queue );
         __cfaabi_dbg_print_safe("Kernel : Growing ready queue done\n");
+        // Unlock the RWlock
         ready_mutate_unlock( *cltr, last_size );
+}
+// Shrink the ready queue
 void ready_queue_shrink(struct cluster * cltr) {
+        // Lock the RWlock so no-one pushes/pops while we are changing the queue
         uint_fast32_t last_size = ready_mutate_lock( *cltr );
         __cfaabi_dbg_print_safe("Kernel : Shrinking ready queue\n");
+        // Make sure that everything is consistent
+        /* paranoid */ check( cltr->ready_queue );
         with( cltr->ready_queue ) {
+                // Make sure that the total thread count stays the same
                 #if defined(__CFA_WITH_VERIFY__)
                         size_t nthreads = 0;
                         for( idx; (size_t)list.count ) {
                                 nthreads += list.data[idx].count;
+                        for( idx; (size_t)lanes.count ) {
+                                nthreads += lanes.data[idx].count;
+                        }
                 #endif
                 size_t ocount = list.count;
+                size_t ocount = lanes.count;
                 // Check that we have some space left
                 if(ocount < 8) abort("Program attempted to destroy more Ready Queues than were created");
+                list.count -= 4;
+                // reduce the actual count so push doesn't use the old queues
+                lanes.count -= 4;
+                verify(ocount > lanes.count);
+                // for printing count the number of displaced threads
+                #if defined(__CFA_DEBUG_PRINT__)
+                        __attribute__((unused)) size_t displaced = 0;
+                #endif
                 // redistribute old data
+                verify(ocount > list.count);
+                __attribute__((unused)) size_t displaced = 0;
+                for( idx; (size_t)list.count ~ ocount) {
+                        // This is not strictly needed but makes checking invariants much easier
+                        bool locked = __atomic_try_acquire(&list.data[idx].lock);
+                for( idx; (size_t)lanes.count ~ ocount) {
+                        // Lock is not strictly needed but makes checking invariants much easier
+                        bool locked = __atomic_try_acquire(&lanes.data[idx].lock);
                         verify(locked);
+                        while(0 != ts(list.data[idx])) {
+                        // As long as we can pop from this lane to push the threads somewhere else in the queue
+                        while(!is_empty(lanes.data[idx])) {
                                 struct thread_desc * thrd;
                                 __attribute__((unused)) bool _;
                                 [thrd, _] = pop(list.data[idx]);
+                                verify(thrd);
+                                [thrd, _] = pop(lanes.data[idx]);
                                 push(cltr, thrd);
+                                displaced++;
+                                // for printing count the number of displaced threads
+                                #if defined(__CFA_DEBUG_PRINT__)
+                                        displaced++;
+                                #endif
+                        }
+                        __atomic_unlock(&list.data[idx].lock);
+                        mask_clear((__cfa_readyQ_mask_t *)used.mask, idx, NOCHECK);
+                        // Unlock the lane
+                        __atomic_unlock(&lanes.data[idx].lock);
                         // TODO print the queue statistics here
                         ^(list.data[idx]){};
+                        ^(lanes.data[idx]){};
+                }
                 __cfaabi_dbg_print_safe("Kernel : Shrinking ready queue displaced %zu threads\n", displaced);
+                // clear the now unused masks
+                {
+                        __cfa_readyQ_mask_t fword, fbit, lword, lbit;
+                        [fbit, fword] = extract(ocount);
+                        [lbit, lword] = extract(list.count);
+                        // For now assume that all queues were covered by the same bitmask
+                        // This is highly probable as long as grow and shrink use groups of 4
+                        // exclusively
+                        verify(fword == lword);
+                        __cfa_readyQ_mask_t clears = ~0;
+                        for( b ; lbit ~ fbit ) {
+                                clears ^= 1 << b;
+                // recompute the used.count instead of maintaining it
+                used.count = 0;
+                for( i ; __cfa_lane_mask_size ) {
+                        used.count += __builtin_popcountl(used.mask[i]);
+                }
+                // Allocate new array (uses realloc and memcpies the data)
+                lanes.data = alloc(lanes.data, lanes.count);
+                // Fix the moved data
+                for( idx; (size_t)lanes.count ) {
+                        fix(lanes.data[idx]);
+                }
+                // Make sure that the total thread count stayed the same
+                #if defined(__CFA_WITH_VERIFY__)
+                        for( idx; (size_t)lanes.count ) {
+                                nthreads -= lanes.data[idx].count;
+                        }
+                        empty.mask[fword] &= clears;
+                        empty.count = 0;
+                        for( i ; 0 ~= lword ) {
+                                empty.count += __builtin_popcountl(empty.mask[i]);
+                        }
+                }
+                // Allocate new array
+                list.data = alloc(list.data, list.count);
+                // Fix the moved data
+                for( idx; (size_t)list.count ) {
+                        fix(list.data[idx]);
+                }
+                #if defined(__CFA_WITH_VERIFY__)
+                        for( idx; (size_t)list.count ) {
+                                nthreads -= list.data[idx].count;
+                        }
+                        assertf(nthreads == 0, "Shrinking changed number of threads");
+                        verifyf(nthreads == 0, "Shrinking changed number of threads");
                 #endif
+        }
         // Make sure that everything is consistent
+        check( cltr->ready_queue );
+        /* paranoid */ check( cltr->ready_queue );
         __cfaabi_dbg_print_safe("Kernel : Shrinking ready queue done\n");
+        // Unlock the RWlock
         ready_mutate_unlock( *cltr, last_size );
+}
 …
         __atomic_fetch_add( &global_stats.pick.pop .success, tls.pick.pop .success, __ATOMIC_SEQ_CST );
         __atomic_fetch_add( &global_stats.full.value, tls.full.value, __ATOMIC_SEQ_CST );
         __atomic_fetch_add( &global_stats.full.count, tls.full.count, __ATOMIC_SEQ_CST );
+        __atomic_fetch_add( &global_stats.used.value, tls.used.value, __ATOMIC_SEQ_CST );
+        __atomic_fetch_add( &global_stats.used.count, tls.used.count, __ATOMIC_SEQ_CST );
+}
 #endif

libcfa/src/stdhdr/assert.h

-              r75ca7f4
+              rdca5802
         #define verify(x) assert(x)
         #define verifyf(x, ...) assertf(x, __VA_ARGS__)
+        #define verifyfail(...)
         #define __CFA_WITH_VERIFY__
 #else
         #define verify(x)
         #define verifyf(x, ...)
+        #define verifyfail(...)
 #endif

Note: See TracChangeset for help on using the changeset viewer.

Download in other formats: