Changeset 884f3f67 for libcfa/src
- Timestamp: Mar 14, 2022, 2:24:51 PM (3 years ago)
- Branches: ADT, ast-experimental, enum, master, pthread-emulation, qualifiedEnum
- Children: bfb9bf5
- Parents: c42b8a1
- Location: libcfa/src/concurrency
- Files: 5 edited
libcfa/src/concurrency/kernel.hfa
(rc42b8a1 -> r884f3f67)

@@ -155 +155 @@
 void ^?{}(__intrusive_lane_t & this);

-// Aligned timestamps which are used by the relaxed ready queue
+// Aligned timestamps which are used by the ready queue and io subsystem
 struct __attribute__((aligned(128))) __timestamp_t {
     volatile unsigned long long tv;
 …
 };

+static inline void ?{}(__timestamp_t & this) { this.tv = 0; this.ma = 0; }
+static inline void ^?{}(__timestamp_t &) {}
+
+
 struct __attribute__((aligned(16))) __cache_id_t {
     volatile unsigned id;
 };

-// Aligned timestamps which are used by the relaxed ready queue
-struct __attribute__((aligned(128))) __help_cnts_t {
-    volatile unsigned long long src;
-    volatile unsigned long long dst;
-    volatile unsigned long long tri;
-};
-
-static inline void ?{}(__timestamp_t & this) { this.tv = 0; this.ma = 0; }
-static inline void ^?{}(__timestamp_t &) {}
-
-struct __attribute__((aligned(128))) __ready_queue_caches_t;
-void ?{}(__ready_queue_caches_t & this);
-void ^?{}(__ready_queue_caches_t & this);
-
-//TODO adjust cache size to ARCHITECTURE
-// Structure holding the ready queue
-struct __ready_queue_t {
-    // Data tracking the actual lanes
-    // On a seperate cacheline from the used struct since
-    // used can change on each push/pop but this data
-    // only changes on shrink/grow
-    struct {
-        // Arary of lanes
-        __intrusive_lane_t * volatile data;
-
-        // Array of times
-        __timestamp_t * volatile tscs;
-
-        __cache_id_t * volatile caches;
-
-        // Array of stats
-        __help_cnts_t * volatile help;
-
-        // Number of lanes (empty or not)
-        volatile size_t count;
-    } lanes;
-};
-
-void ?{}(__ready_queue_t & this);
-void ^?{}(__ready_queue_t & this);
+// //TODO adjust cache size to ARCHITECTURE
+// // Structure holding the ready queue
+// struct __ready_queue_t {
+//     // Data tracking the actual lanes
+//     // On a seperate cacheline from the used struct since
+//     // used can change on each push/pop but this data
+//     // only changes on shrink/grow
+//     struct {
+//         // Arary of lanes
+//         __intrusive_lane_t * volatile data;
+
+//         __cache_id_t * volatile caches;
+
+//         // Number of lanes (empty or not)
+//         volatile size_t count;
+//     } lanes;
+// };
+
+// void ?{}(__ready_queue_t & this);
+// void ^?{}(__ready_queue_t & this);

 // Idle Sleep

@@ -230 +214 @@
 // Cluster
 struct __attribute__((aligned(128))) cluster {
-    // Ready queue for threads
-    __ready_queue_t ready_queue;
+    struct {
+        struct {
+            // Arary of subqueues
+            __intrusive_lane_t * volatile data;
+
+            // Time since subqueues were processed
+            __timestamp_t * volatile tscs;
+
+            // Number of subqueue / timestamps
+            size_t count;
+        } readyQ;
+
+        struct {
+            // Number of I/O subqueues
+            volatile size_t count;
+
+            // Time since subqueues were processed
+            __timestamp_t * volatile tscs;
+        } io;
+
+        // Cache each kernel thread belongs to
+        __cache_id_t * volatile caches;
+    } sched;
+
+    // // Ready queue for threads
+    // __ready_queue_t ready_queue;

     // Name of the cluster
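The net effect of this hunk is to retire the free-standing __ready_queue_t (now kept only as commented-out code) and embed a sched aggregate directly in cluster, with separate readyQ and io timestamp arrays plus the per-lane cache ids. Below is a plain-C sketch of that layout; the field names mirror the diff, the 128-byte and 16-byte alignments mirror the CFA attributes, and every other name is illustrative rather than the real CFA type.

#include <stddef.h>

struct timestamp {                      /* stand-in for __timestamp_t */
    volatile unsigned long long tv;     /* last observed timestamp for the subqueue */
    volatile unsigned long long ma;     /* moving average of the subqueue's age */
} __attribute__((aligned(128)));        /* own cache line(s) to avoid false sharing */

struct cache_id { volatile unsigned id; } __attribute__((aligned(16)));

struct intrusive_lane;                  /* opaque here; one sharded subqueue of threads */

struct sched_state {                    /* replaces cluster.ready_queue (__ready_queue_t) */
    struct {
        struct intrusive_lane * volatile data;  /* array of subqueues */
        struct timestamp      * volatile tscs;  /* time since each subqueue was processed */
        size_t count;                           /* number of subqueues / timestamps */
    } readyQ;
    struct {
        volatile size_t count;                  /* number of I/O subqueues */
        struct timestamp * volatile tscs;       /* time since each I/O subqueue was processed */
    } io;
    struct cache_id * volatile caches;          /* cache each kernel thread belongs to */
};

Grouping the ready-queue and io timestamp arrays under one sched member lets both subsystems share the same helping heuristics while keeping the hot, per-push data on separate cache lines from the rarely-changing counts.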
libcfa/src/concurrency/kernel/cluster.cfa
(rc42b8a1 -> r884f3f67)

@@ -221 +221 @@
 //-----------------------------------------------------------------------
 // Check that all the intrusive queues in the data structure are still consistent
-static void check( __ready_queue_t & q ) with (q) {
+static void check_readyQ( cluster * cltr ) with (cltr->sched) {
     #if defined(__CFA_WITH_VERIFY__)
     {
-        for( idx ; lanes.count ) {
-            __intrusive_lane_t & sl = lanes.data[idx];
-            assert(!lanes.data[idx].lock);
+        const unsigned lanes_count = readyQ.count;
+        for( idx ; lanes_count ) {
+            __intrusive_lane_t & sl = readyQ.data[idx];
+            assert(!readyQ.data[idx].lock);

             if(is_empty(sl)) {

@@ -257 +258 @@
         it->rdq.id = value;
         it->rdq.target = MAX;
-        value += __readyq_shard_factor;
+        value += __shard_factor.readyq;
         it = &(*it)`next;
     }

@@ -268 +269 @@
 }

-static void fix_times( struct cluster * cltr ) with( cltr->ready_queue ) {
-    lanes.tscs = alloc(lanes.count, lanes.tscs`realloc);
-    for(i; lanes.count) {
-        lanes.tscs[i].tv = rdtscl();
-        lanes.tscs[i].ma = 0;
+static void fix_times( __timestamp_t * volatile & tscs, unsigned count ) {
+    tscs = alloc(count, tscs`realloc);
+    for(i; count) {
+        tscs[i].tv = rdtscl();
+        tscs[i].ma = 0;
     }
 }

@@ -278 +279 @@
 // Grow the ready queue
 void ready_queue_grow(struct cluster * cltr) {
-    size_t ncount;
     int target = cltr->procs.total;

@@ -285 +285 @@

     // Make sure that everything is consistent
-    /* paranoid */ check( cltr->ready_queue );
-
-    // grow the ready queue
-    with( cltr->ready_queue ) {
-        // Find new count
-        // Make sure we always have atleast 1 list
-        if(target >= 2) {
-            ncount = target * __readyq_shard_factor;
-        } else {
-            ncount = __readyq_single_shard;
-        }
-
-        // Allocate new array (uses realloc and memcpies the data)
-        lanes.data = alloc( ncount, lanes.data`realloc );
-
-        // Fix the moved data
-        for( idx; (size_t)lanes.count ) {
-            fix(lanes.data[idx]);
-        }
-
-        // Construct new data
-        for( idx; (size_t)lanes.count ~ ncount) {
-            (lanes.data[idx]){};
-        }
-
-        // Update original
-        lanes.count = ncount;
-
-        lanes.caches = alloc( target, lanes.caches`realloc );
-    }
-
-    fix_times(cltr);
-
+    /* paranoid */ check_readyQ( cltr );
+
+
+    // Find new count
+    // Make sure we always have atleast 1 list
+    size_t ocount = cltr->sched.readyQ.count;
+    size_t ncount = max(target * __shard_factor.readyq, __readyq_single_shard);
+
+    // Do we have to do anything?
+    if( ocount != ncount ) {
+
+        // grow the ready queue
+        with( cltr->sched ) {
+
+            // Allocate new array (uses realloc and memcpies the data)
+            readyQ.data = alloc( ncount, readyQ.data`realloc );
+
+            // Fix the moved data
+            for( idx; ocount ) {
+                fix(readyQ.data[idx]);
+            }
+
+            // Construct new data
+            for( idx; ocount ~ ncount) {
+                (readyQ.data[idx]){};
+            }
+
+            // Update original count
+            readyQ.count = ncount;
+        }
+
+
+        fix_times(cltr->sched.readyQ.tscs, cltr->sched.readyQ.count);
+    }
+
+    // realloc the caches
+    cltr->sched.caches = alloc( target, cltr->sched.caches`realloc );
+
+    // reassign the clusters.
     reassign_cltr_id(cltr);

     // Make sure that everything is consistent
-    /* paranoid */ check( cltr->ready_queue );
+    /* paranoid */ check_readyQ( cltr );
+    /* paranoid */ verify( (target == 0) == (cltr->sched.caches == 0p) );

     __cfadbg_print_safe(ready_queue, "Kernel : Growing ready queue done\n");

@@ -334 +341 @@

     // Make sure that everything is consistent
-    /* paranoid */ check( cltr->ready_queue );
+    /* paranoid */ check_readyQ( cltr );

     int target = cltr->procs.total;

-    with( cltr->ready_queue ) {
+    with( cltr->sched ) {
         // Remember old count
-        size_t ocount = lanes.count;
+        size_t ocount = readyQ.count;

         // Find new count
         // Make sure we always have atleast 1 list
-        lanes.count = target >= 2 ? target * __readyq_shard_factor: __readyq_single_shard;
-        /* paranoid */ verify( ocount >= lanes.count );
-        /* paranoid */ verify( lanes.count == target * __readyq_shard_factor || target < 2 );
+        size_t ncount = max(target * __shard_factor.readyq, __readyq_single_shard);
+        /* paranoid */ verifyf( ocount >= ncount, "Error in shrinking size calculation, %zu >= %zu", ocount, ncount );
+        /* paranoid */ verifyf( ncount == target * __shard_factor.readyq || ncount == __readyq_single_shard,
+        /* paranoid */          "Error in shrinking size calculation, expected %u or %u, got %zu", target * __shard_factor.readyq, ncount );
+
+        readyQ.count = ncount;

         // for printing count the number of displaced threads

@@ -354 +364 @@

         // redistribute old data
-        for( idx; (size_t)lanes.count ~ ocount) {
+        for( idx; ncount ~ ocount) {
             // Lock is not strictly needed but makes checking invariants much easier
-            __attribute__((unused)) bool locked = __atomic_try_acquire(&lanes.data[idx].lock);
+            __attribute__((unused)) bool locked = __atomic_try_acquire(&readyQ.data[idx].lock);
             verify(locked);

             // As long as we can pop from this lane to push the threads somewhere else in the queue
-            while(!is_empty(lanes.data[idx])) {
+            while(!is_empty(readyQ.data[idx])) {
                 struct thread$ * thrd;
                 unsigned long long _;
-                [thrd, _] = pop(lanes.data[idx]);
+                [thrd, _] = pop(readyQ.data[idx]);

                 push(cltr, thrd, true);

@@ -374 +384 @@

             // Unlock the lane
-            __atomic_unlock(&lanes.data[idx].lock);
+            __atomic_unlock(&readyQ.data[idx].lock);

             // TODO print the queue statistics here

-            ^(lanes.data[idx]){};
+            ^(readyQ.data[idx]){};
         }

@@ -384 +394 @@

         // Allocate new array (uses realloc and memcpies the data)
-        lanes.data = alloc( lanes.count, lanes.data`realloc );
+        readyQ.data = alloc( ncount, readyQ.data`realloc );

         // Fix the moved data
-        for( idx; (size_t)lanes.count ) {
-            fix(lanes.data[idx]);
-        }
-
-        lanes.caches = alloc( target, lanes.caches`realloc );
-    }
-
-    fix_times(cltr);
+        for( idx; ncount ) {
+            fix(readyQ.data[idx]);
+        }
+
+        fix_times(readyQ.tscs, ncount);
+    }
+    cltr->sched.caches = alloc( target, cltr->sched.caches`realloc );
+

@@ -400 +410 @@

     // Make sure that everything is consistent
-    /* paranoid */ check( cltr->ready_queue );
+    /* paranoid */ verify( (target == 0) == (cltr->sched.caches == 0p) );
+    /* paranoid */ check_readyQ( cltr );

     __cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue done\n");
     /* paranoid */ verify( ready_mutate_islocked() );
+}
+
+void ready_queue_close(struct cluster * cltr) {
+    free( cltr->sched.readyQ.data );
+    free( cltr->sched.readyQ.tscs );
+    cltr->sched.readyQ.data = 0p;
+    cltr->sched.readyQ.tscs = 0p;
+    cltr->sched.readyQ.count = 0;
+
+    free( cltr->sched.io.tscs );
+    free( cltr->sched.caches );
 }
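After this change, ready_queue_grow and ready_queue_shrink derive the lane count the same way, max(target * shard_factor, single_shard), and grow skips the reallocation entirely when the count does not change. A minimal C sketch of that calculation follows; the value of __readyq_single_shard is not shown in this changeset, so 1 is only an assumption for illustration.

#include <stddef.h>

enum {
    SHARD_FACTOR_READYQ  = 2,   /* matches __shard_factor.readyq in this changeset */
    READYQ_SINGLE_SHARD  = 1    /* assumed value of __readyq_single_shard */
};

/* New subqueue count for a cluster with target_procs processors. */
static size_t readyq_ncount(int target_procs) {
    size_t scaled = (size_t)target_procs * SHARD_FACTOR_READYQ;
    return scaled > READYQ_SINGLE_SHARD ? scaled : READYQ_SINGLE_SHARD;   /* max(...) */
}

/* Resize pattern used by grow/shrink above (pseudologic):
 *   if (ocount != ncount) { realloc the lanes, fix moved nodes, reset the timestamps; }
 *   always realloc the per-processor cache-id array to 'target' entries.            */

Hoisting the count calculation out of the with-block is what lets grow become a no-op when a processor is added but the shard count is already large enough.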
libcfa/src/concurrency/kernel/startup.cfa
(rc42b8a1 -> r884f3f67)

@@ -515 +515 @@
     this.rdq.its = 0;
     this.rdq.itr = 0;
-    this.rdq.id = MAX;
+    this.rdq.id = 0;
     this.rdq.target = MAX;
     this.rdq.last = MAX;

@@ -605 +605 @@
     this.name = name;
     this.preemption_rate = preemption_rate;
-    ready_queue{};
+    this.sched.readyQ.data = 0p;
+    this.sched.readyQ.tscs = 0p;
+    this.sched.readyQ.count = 0;
+    this.sched.io.tscs = 0p;
+    this.sched.caches = 0p;

     #if !defined(__CFA_NO_STATISTICS__)

@@ -644 +648 @@
     // Unlock the RWlock
     ready_mutate_unlock( last_size );
+
+    ready_queue_close( &this );
+    /* paranoid */ verify( this.sched.readyQ.data == 0p );
+    /* paranoid */ verify( this.sched.readyQ.tscs == 0p );
+    /* paranoid */ verify( this.sched.readyQ.count == 0 );
+    /* paranoid */ verify( this.sched.io.tscs == 0p );
+    /* paranoid */ verify( this.sched.caches == 0p );
+
     enable_interrupts( false ); // Don't poll, could be in main cluster
+

     #if !defined(__CFA_NO_STATISTICS__)
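The cluster constructor and destructor now bracket the scheduler state explicitly: every sched field starts at 0p, and teardown goes through ready_queue_close before the paranoid verifies check that the fields are back to null. A pared-down C sketch of that symmetry is below; the struct is a stand-in rather than the real CFA type, and re-nulling io.tscs and caches inside the close routine is an assumption made so the sketch is self-consistent.

#include <stdlib.h>
#include <stddef.h>

struct sched_state {
    struct { void * data; void * tscs; size_t count; } readyQ;
    struct { void * tscs; size_t count; } io;
    void * caches;
};

/* Mirrors the constructor: start with no arrays allocated. */
static void sched_init(struct sched_state * s) {
    s->readyQ.data = NULL; s->readyQ.tscs = NULL; s->readyQ.count = 0;
    s->io.tscs = NULL; s->caches = NULL;
}

/* Mirrors ready_queue_close(): free everything and return to the initial state. */
static void sched_close(struct sched_state * s) {
    free(s->readyQ.data);  s->readyQ.data = NULL;
    free(s->readyQ.tscs);  s->readyQ.tscs = NULL;
    s->readyQ.count = 0;
    free(s->io.tscs);      s->io.tscs = NULL;
    free(s->caches);       s->caches = NULL;
}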
libcfa/src/concurrency/kernel_private.hfa
(rc42b8a1 -> r884f3f67)

@@ -366 +366 @@

 //-----------------------------------------------------------------------
+// Decrease the width of the ready queue (number of lanes) by 4
+void ready_queue_close(struct cluster * cltr);
+
+//-----------------------------------------------------------------------
 // Calc moving average based on existing average, before and current time.
 static inline unsigned long long moving_average(unsigned long long currtsc, unsigned long long instsc, unsigned long long old_avg) {

@@ -380 +384 @@
 }

-static const unsigned __readyq_shard_factor = 2;
+//-----------------------------------------------------------------------
+// Calc age a timestamp should be before needing help.
+forall(Data_t * | { unsigned long long ts(Data_t & this); })
+static inline unsigned long long calc_cutoff(
+    const unsigned long long ctsc,
+    const processor * proc,
+    size_t count,
+    Data_t * data,
+    __timestamp_t * tscs,
+    const unsigned shard_factor
+) {
+    unsigned start = proc->rdq.id;
+    unsigned long long max = 0;
+    for(i; shard_factor) {
+        unsigned long long ptsc = ts(data[start + i]);
+        if(ptsc != -1ull) {
+            /* paranoid */ verify( start + i < count );
+            unsigned long long tsc = moving_average(ctsc, ptsc, tscs[start + i].ma);
+            if(tsc > max) max = tsc;
+        }
+    }
+    return (max + 2 * max) / 2;
+}
+
+static struct {
+    const unsigned readyq;
+} __shard_factor = { 2 };

 // Local Variables: //
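The generic calc_cutoff added here scans the shard_factor subqueues owned by the calling processor, takes the largest moving-average age among them, and returns (max + 2 * max) / 2, that is 1.5 times the worst local age; a processor only helps a remote subqueue whose age exceeds this cutoff. A plain-C sketch follows. The body of moving_average is elided in this diff, so the weighting used below is purely illustrative, and the ts() polymorphism is replaced by a precomputed array of subqueue timestamps.

#include <assert.h>
#include <stdint.h>
#include <stddef.h>

struct timestamp { uint64_t tv, ma; };   /* stand-in for __timestamp_t */

/* The real moving_average() body is not shown in this hunk; these weights are a guess. */
static uint64_t moving_average(uint64_t currtsc, uint64_t instsc, uint64_t old_avg) {
    uint64_t age = currtsc > instsc ? currtsc - instsc : 0;
    return (3 * old_avg + age) / 4;
}

/* Scan this processor's own block of subqueues and return 1.5x the worst moving-average age. */
static uint64_t calc_cutoff(uint64_t ctsc, unsigned start /* proc->rdq.id */, size_t count,
                            const uint64_t * lane_ts /* ts() of each subqueue */,
                            const struct timestamp * tscs, unsigned shard_factor) {
    uint64_t max = 0;
    for (unsigned i = 0; i < shard_factor; i++) {
        uint64_t ptsc = lane_ts[start + i];
        if (ptsc != UINT64_MAX) {            /* UINT64_MAX (-1ull) marks an empty subqueue */
            assert(start + i < count);       /* mirrors the paranoid verify */
            uint64_t tsc = moving_average(ctsc, ptsc, tscs[start + i].ma);
            if (tsc > max) max = tsc;
        }
    }
    return (max + 2 * max) / 2;              /* == 1.5 * max in integer arithmetic */
}

Making the routine generic over the data type (via the ts trait) is what lets the same cutoff logic serve both the ready queue and, eventually, the io subqueues whose timestamps now live in cltr->sched.io.tscs.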
libcfa/src/concurrency/ready_queue.cfa
(rc42b8a1 -> r884f3f67)

@@ -48 +48 @@
 // Cforall Ready Queue used for scheduling
 //=======================================================================
-void ?{}(__ready_queue_t & this) with (this) {
-    lanes.data = 0p;
-    lanes.tscs = 0p;
-    lanes.caches = 0p;
-    lanes.help = 0p;
-    lanes.count = 0;
-}
-
-void ^?{}(__ready_queue_t & this) with (this) {
-    free(lanes.data);
-    free(lanes.tscs);
-    free(lanes.caches);
-    free(lanes.help);
-}
-
-//-----------------------------------------------------------------------
-__attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, unpark_hint hint) with (cltr->ready_queue) {
+// void ?{}(__ready_queue_t & this) with (this) {
+//     lanes.data = 0p;
+//     lanes.tscs = 0p;
+//     lanes.caches = 0p;
+//     lanes.count = 0;
+// }
+
+// void ^?{}(__ready_queue_t & this) with (this) {
+//     free(lanes.data);
+//     free(lanes.tscs);
+//     free(lanes.caches);
+// }
+
+//-----------------------------------------------------------------------
+__attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, unpark_hint hint) with (cltr->sched) {
     processor * const proc = kernelTLS().this_processor;
     const bool external = (!proc) || (cltr != proc->cltr);
     const bool remote   = hint == UNPARK_REMOTE;
+    const size_t lanes_count = readyQ.count;
+
+    /* paranoid */ verify( __shard_factor.readyq > 0 );
+    /* paranoid */ verify( lanes_count > 0 );

     unsigned i;

@@ -73 +75 @@
         // Figure out where thread was last time and make sure it's valid
         /* paranoid */ verify(thrd->preferred >= 0);
-        if(thrd->preferred * __readyq_shard_factor < lanes.count) {
-            /* paranoid */ verify(thrd->preferred * __readyq_shard_factor < lanes.count);
-            unsigned start = thrd->preferred * __readyq_shard_factor;
+        unsigned start = thrd->preferred * __shard_factor.readyq;
+        if(start < lanes_count) {
             do {
                 unsigned r = __tls_rand();
-                i = start + (r % __readyq_shard_factor);
-                /* paranoid */ verify( i < lanes.count );
+                i = start + (r % __shard_factor.readyq);
+                /* paranoid */ verify( i < lanes_count );
                 // If we can't lock it retry
-            } while( !__atomic_try_acquire( &lanes.data[i].lock ) );
+            } while( !__atomic_try_acquire( &readyQ.data[i].lock ) );
         } else {
             do {
-                i = __tls_rand() % lanes.count;
-            } while( !__atomic_try_acquire( &lanes.data[i].lock ) );
+                i = __tls_rand() % lanes_count;
+            } while( !__atomic_try_acquire( &readyQ.data[i].lock ) );
         }
     } else {
         do {
             unsigned r = proc->rdq.its++;
-            i = proc->rdq.id + (r % __readyq_shard_factor);
-            /* paranoid */ verify( i < lanes.count );
+            i = proc->rdq.id + (r % __shard_factor.readyq);
+            /* paranoid */ verify( i < lanes_count );
             // If we can't lock it retry
-        } while( !__atomic_try_acquire( &lanes.data[i].lock ) );
+        } while( !__atomic_try_acquire( &readyQ.data[i].lock ) );
     }

     // Actually push it
-    push(lanes.data[i], thrd);
+    push(readyQ.data[i], thrd);

     // Unlock and return
-    __atomic_unlock( &lanes.data[i].lock );
+    __atomic_unlock( &readyQ.data[i].lock );

     #if !defined(__CFA_NO_STATISTICS__)

@@ -108 +109 @@
 }

-static inline unsigned long long calc_cutoff(const unsigned long long ctsc, const processor * proc, __ready_queue_t & rdq) {
-    unsigned start = proc->rdq.id;
-    unsigned long long max = 0;
-    for(i; __readyq_shard_factor) {
-        unsigned long long ptsc = ts(rdq.lanes.data[start + i]);
-        if(ptsc != -1ull) {
-            /* paranoid */ verify( start + i < rdq.lanes.count );
-            unsigned long long tsc = moving_average(ctsc, ptsc, rdq.lanes.tscs[start + i].ma);
-            if(tsc > max) max = tsc;
-        }
-    }
-    return (max + 2 * max) / 2;
-}
-
-__attribute__((hot)) struct thread$ * pop_fast(struct cluster * cltr) with (cltr->ready_queue) {
-    /* paranoid */ verify( lanes.count > 0 );
+__attribute__((hot)) struct thread$ * pop_fast(struct cluster * cltr) with (cltr->sched) {
+    const size_t lanes_count = readyQ.count;
+
+    /* paranoid */ verify( __shard_factor.readyq > 0 );
+    /* paranoid */ verify( lanes_count > 0 );
     /* paranoid */ verify( kernelTLS().this_processor );
-    /* paranoid */ verify( kernelTLS().this_processor->rdq.id < lanes.count );
+    /* paranoid */ verify( kernelTLS().this_processor->rdq.id < lanes_count );

     processor * const proc = kernelTLS().this_processor;
     unsigned this = proc->rdq.id;
-    /* paranoid */ verify( this < lanes.count );
+    /* paranoid */ verify( this < lanes_count );
     __cfadbg_print_safe(ready_queue, "Kernel : pop from %u\n", this);

@@ -140 +130 @@
     // Super important: don't write the same value over and over again
     // We want to maximise our chances that his particular values stays in cache
-    if(lanes.caches[this / __readyq_shard_factor].id != this_cache)
-        __atomic_store_n(&lanes.caches[this / __readyq_shard_factor].id, this_cache, __ATOMIC_RELAXED);
+    if(caches[this / __shard_factor.readyq].id != this_cache)
+        __atomic_store_n(&caches[this / __shard_factor.readyq].id, this_cache, __ATOMIC_RELAXED);

     const unsigned long long ctsc = rdtscl();

@@ -148 +138 @@
         uint64_t chaos = __tls_rand();
         unsigned ext = chaos & 0xff;
-        unsigned other  = (chaos >> 8) % (lanes.count);
-
-        if(ext < 3 || __atomic_load_n(&lanes.caches[other / __readyq_shard_factor].id, __ATOMIC_RELAXED) == this_cache) {
+        unsigned other  = (chaos >> 8) % (lanes_count);
+
+        if(ext < 3 || __atomic_load_n(&caches[other / __shard_factor.readyq].id, __ATOMIC_RELAXED) == this_cache) {
             proc->rdq.target = other;
         }

@@ -156 +146 @@
     else {
         const unsigned target = proc->rdq.target;
-        __cfadbg_print_safe(ready_queue, "Kernel : %u considering helping %u, tcsc %llu\n", this, target, lanes.tscs[target].tv);
-        /* paranoid */ verify( lanes.tscs[target].tv != MAX );
-        if(target < lanes.count) {
-            const unsigned long long cutoff = calc_cutoff(ctsc, proc, cltr->ready_queue);
-            const unsigned long long age = moving_average(ctsc, lanes.tscs[target].tv, lanes.tscs[target].ma);
+        __cfadbg_print_safe(ready_queue, "Kernel : %u considering helping %u, tcsc %llu\n", this, target, readyQ.tscs[target].tv);
+        /* paranoid */ verify( readyQ.tscs[target].tv != MAX );
+        if(target < lanes_count) {
+            const unsigned long long cutoff = calc_cutoff(ctsc, proc, lanes_count, cltr->sched.readyQ.data, cltr->sched.readyQ.tscs, __shard_factor.readyq);
+            const unsigned long long age = moving_average(ctsc, readyQ.tscs[target].tv, readyQ.tscs[target].ma);
             __cfadbg_print_safe(ready_queue, "Kernel : Help attempt on %u from %u, age %'llu vs cutoff %'llu, %s\n", target, this, age, cutoff, age > cutoff ? "yes" : "no");
             if(age > cutoff) {

@@ -170 +160 @@
     }

-    for(__readyq_shard_factor) {
-        unsigned i = this + (proc->rdq.itr++ % __readyq_shard_factor);
+    for(__shard_factor.readyq) {
+        unsigned i = this + (proc->rdq.itr++ % __shard_factor.readyq);
         if(thread$ * t = try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.local))) return t;
     }

@@ -179 +169 @@

 }
-__attribute__((hot)) struct thread$ * pop_slow(struct cluster * cltr) with (cltr->ready_queue) {
-    unsigned i = __tls_rand() % lanes.count;
+__attribute__((hot)) struct thread$ * pop_slow(struct cluster * cltr) {
+    unsigned i = __tls_rand() % (cltr->sched.readyQ.count);
     return try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.steal));
 }

@@ -195 +185 @@
 //-----------------------------------------------------------------------
 // try to pop from a lane given by index w
-static inline struct thread$ * try_pop(struct cluster * cltr, unsigned w __STATS(, __stats_readyQ_pop_t & stats)) with (cltr->ready_queue) {
-    /* paranoid */ verify( w < lanes.count );
+static inline struct thread$ * try_pop(struct cluster * cltr, unsigned w __STATS(, __stats_readyQ_pop_t & stats)) with (cltr->sched) {
+    const size_t lanes_count = readyQ.count;
+    /* paranoid */ verify( w < lanes_count );
     __STATS( stats.attempt++; )

     // Get relevant elements locally
-    __intrusive_lane_t & lane = lanes.data[w];
+    __intrusive_lane_t & lane = readyQ.data[w];

     // If list looks empty retry

@@ -236 +227 @@
     if (tsv != MAX) {
         unsigned long long now = rdtscl();
-        unsigned long long pma = __atomic_load_n(&lanes.tscs[w].ma, __ATOMIC_RELAXED);
-        __atomic_store_n(&lanes.tscs[w].tv, tsv, __ATOMIC_RELAXED);
-        __atomic_store_n(&lanes.tscs[w].ma, moving_average(now, tsc_before, pma), __ATOMIC_RELAXED);
-    }
-
-    thrd->preferred = w / __readyq_shard_factor;
+        unsigned long long pma = __atomic_load_n(&readyQ.tscs[w].ma, __ATOMIC_RELAXED);
+        __atomic_store_n(&readyQ.tscs[w].tv, tsv, __ATOMIC_RELAXED);
+        __atomic_store_n(&readyQ.tscs[w].ma, moving_average(now, tsc_before, pma), __ATOMIC_RELAXED);
+    }
+
+    thrd->preferred = w / __shard_factor.readyq;

     // return the popped thread

@@ -250 +241 @@
 // try to pop from any lanes making sure you don't miss any threads push
 // before the start of the function
-static inline struct thread$ * search(struct cluster * cltr) with (cltr->ready_queue) {
-    /* paranoid */ verify( lanes.count > 0 );
-    unsigned count = __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
+static inline struct thread$ * search(struct cluster * cltr) {
+    const size_t lanes_count = cltr->sched.readyQ.count;
+    /* paranoid */ verify( lanes_count > 0 );
+    unsigned count = __atomic_load_n( &lanes_count, __ATOMIC_RELAXED );
     unsigned offset = __tls_rand();
     for(i; count) {

@@ -279 +271 @@
 //-----------------------------------------------------------------------
 // Given 2 indexes, pick the list with the oldest push an try to pop from it
-static inline struct thread$ * try_pop(struct cluster * cltr, unsigned i, unsigned j __STATS(, __stats_readyQ_pop_t & stats)) with (cltr->ready_queue) {
+static inline struct thread$ * try_pop(struct cluster * cltr, unsigned i, unsigned j __STATS(, __stats_readyQ_pop_t & stats)) with (cltr->sched) {
     // Pick the bet list
     int w = i;
-    if( __builtin_expect(!is_empty(lanes.data[j]), true) ) {
-        w = (ts(lanes.data[i]) < ts(lanes.data[j])) ? i : j;
+    if( __builtin_expect(!is_empty(readyQ.data[j]), true) ) {
+        w = (ts(readyQ.data[i]) < ts(readyQ.data[j])) ? i : j;
     }