Timestamp:
Mar 14, 2022, 2:24:51 PM
Author:
Thierry Delisle <tdelisle@…>
Branches:
ADT, ast-experimental, enum, master, pthread-emulation, qualifiedEnum
Children:
bfb9bf5
Parents:
c42b8a1
Message:

Change how the ready queue is initialized to make it common with I/O

File:
1 edited

  • libcfa/src/concurrency/ready_queue.cfa
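The diff below renames every access from cltr->ready_queue and its lanes.* fields to cltr->sched and a readyQ sub-structure, with the cache hints (caches) now sitting beside the ready queue and the shard factor coming from a __shard_factor aggregate, presumably so the same bookkeeping can be shared with the I/O queues. A minimal sketch of the assumed new layout, reconstructed only from the field accesses in this diff; the type name __scheduler_t, the comment about an I/O counterpart, and the element type names are guesses, not the actual declarations:

// Sketch only (assumed layout; the real declarations live outside this file):
struct __scheduler_t {                     // hypothetical name for the type behind cltr->sched
        struct {
                __intrusive_lane_t * data;     // sharded lanes, formerly cltr->ready_queue.lanes.data
                __timestamp_t      * tscs;     // per-lane timestamp + moving average, formerly lanes.tscs
                size_t               count;    // number of lanes, formerly lanes.count
        } readyQ;
        // presumably a matching sub-structure for the I/O queues ("common with I/O")
        __cache_id_t * caches;                 // per-shard cache-id hints, formerly lanes.caches
};
// __shard_factor appears to be a constant aggregate with one factor per queue kind,
// e.g. __shard_factor.readyq replacing the old __readyq_shard_factor.

The sharding itself is unchanged: each processor owns __shard_factor.readyq consecutive lanes starting at proc->rdq.id, and a thread's preferred value records the shard (w / __shard_factor.readyq) it was last popped from.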

--- libcfa/src/concurrency/ready_queue.cfa  (rc42b8a1)
+++ libcfa/src/concurrency/ready_queue.cfa  (r884f3f67)
 // Cforall Ready Queue used for scheduling
 //=======================================================================
-void ?{}(__ready_queue_t & this) with (this) {
-	lanes.data   = 0p;
-	lanes.tscs   = 0p;
-	lanes.caches = 0p;
-	lanes.help   = 0p;
-	lanes.count  = 0;
-}
-
-void ^?{}(__ready_queue_t & this) with (this) {
-	free(lanes.data);
-	free(lanes.tscs);
-	free(lanes.caches);
-	free(lanes.help);
-}
-
-//-----------------------------------------------------------------------
-__attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, unpark_hint hint) with (cltr->ready_queue) {
+// void ?{}(__ready_queue_t & this) with (this) {
+// 	lanes.data   = 0p;
+// 	lanes.tscs   = 0p;
+// 	lanes.caches = 0p;
+// 	lanes.count  = 0;
+// }
+
+// void ^?{}(__ready_queue_t & this) with (this) {
+// 	free(lanes.data);
+// 	free(lanes.tscs);
+// 	free(lanes.caches);
+// }
+
+//-----------------------------------------------------------------------
+__attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, unpark_hint hint) with (cltr->sched) {
 	processor * const proc = kernelTLS().this_processor;
 	const bool external = (!proc) || (cltr != proc->cltr);
 	const bool remote   = hint == UNPARK_REMOTE;
+	const size_t lanes_count = readyQ.count;
+
+	/* paranoid */ verify( __shard_factor.readyq > 0 );
+	/* paranoid */ verify( lanes_count > 0 );
 
 	unsigned i;
…
 		// Figure out where thread was last time and make sure it's valid
 		/* paranoid */ verify(thrd->preferred >= 0);
-		if(thrd->preferred * __readyq_shard_factor < lanes.count) {
-			/* paranoid */ verify(thrd->preferred * __readyq_shard_factor < lanes.count);
-			unsigned start = thrd->preferred * __readyq_shard_factor;
+		unsigned start = thrd->preferred * __shard_factor.readyq;
+		if(start < lanes_count) {
 			do {
 				unsigned r = __tls_rand();
-				i = start + (r % __readyq_shard_factor);
-				/* paranoid */ verify( i < lanes.count );
+				i = start + (r % __shard_factor.readyq);
+				/* paranoid */ verify( i < lanes_count );
 				// If we can't lock it retry
-			} while( !__atomic_try_acquire( &lanes.data[i].lock ) );
+			} while( !__atomic_try_acquire( &readyQ.data[i].lock ) );
 		} else {
 			do {
-				i = __tls_rand() % lanes.count;
-			} while( !__atomic_try_acquire( &lanes.data[i].lock ) );
+				i = __tls_rand() % lanes_count;
+			} while( !__atomic_try_acquire( &readyQ.data[i].lock ) );
 		}
 	} else {
 		do {
 			unsigned r = proc->rdq.its++;
-			i = proc->rdq.id + (r % __readyq_shard_factor);
-			/* paranoid */ verify( i < lanes.count );
+			i = proc->rdq.id + (r % __shard_factor.readyq);
+			/* paranoid */ verify( i < lanes_count );
 			// If we can't lock it retry
-		} while( !__atomic_try_acquire( &lanes.data[i].lock ) );
+		} while( !__atomic_try_acquire( &readyQ.data[i].lock ) );
 	}
 
 	// Actually push it
-	push(lanes.data[i], thrd);
+	push(readyQ.data[i], thrd);
 
 	// Unlock and return
-	__atomic_unlock( &lanes.data[i].lock );
+	__atomic_unlock( &readyQ.data[i].lock );
 
 	#if !defined(__CFA_NO_STATISTICS__)
…
 }
 
-static inline unsigned long long calc_cutoff(const unsigned long long ctsc, const processor * proc, __ready_queue_t & rdq) {
-	unsigned start = proc->rdq.id;
-	unsigned long long max = 0;
-	for(i; __readyq_shard_factor) {
-		unsigned long long ptsc = ts(rdq.lanes.data[start + i]);
-		if(ptsc != -1ull) {
-			/* paranoid */ verify( start + i < rdq.lanes.count );
-			unsigned long long tsc = moving_average(ctsc, ptsc, rdq.lanes.tscs[start + i].ma);
-			if(tsc > max) max = tsc;
-		}
-	}
-	return (max + 2 * max) / 2;
-}
-
-__attribute__((hot)) struct thread$ * pop_fast(struct cluster * cltr) with (cltr->ready_queue) {
-	/* paranoid */ verify( lanes.count > 0 );
+__attribute__((hot)) struct thread$ * pop_fast(struct cluster * cltr) with (cltr->sched) {
+	const size_t lanes_count = readyQ.count;
+
+	/* paranoid */ verify( __shard_factor.readyq > 0 );
+	/* paranoid */ verify( lanes_count > 0 );
 	/* paranoid */ verify( kernelTLS().this_processor );
-	/* paranoid */ verify( kernelTLS().this_processor->rdq.id < lanes.count );
+	/* paranoid */ verify( kernelTLS().this_processor->rdq.id < lanes_count );
 
 	processor * const proc = kernelTLS().this_processor;
 	unsigned this = proc->rdq.id;
-	/* paranoid */ verify( this < lanes.count );
+	/* paranoid */ verify( this < lanes_count );
 	__cfadbg_print_safe(ready_queue, "Kernel : pop from %u\n", this);
 
…
 	// Super important: don't write the same value over and over again
 	// We want to maximise our chances that his particular values stays in cache
-	if(lanes.caches[this / __readyq_shard_factor].id != this_cache)
-		__atomic_store_n(&lanes.caches[this / __readyq_shard_factor].id, this_cache, __ATOMIC_RELAXED);
+	if(caches[this / __shard_factor.readyq].id != this_cache)
+		__atomic_store_n(&caches[this / __shard_factor.readyq].id, this_cache, __ATOMIC_RELAXED);
 
 	const unsigned long long ctsc = rdtscl();
…
 		uint64_t chaos = __tls_rand();
 		unsigned ext = chaos & 0xff;
-		unsigned other  = (chaos >> 8) % (lanes.count);
-
-		if(ext < 3 || __atomic_load_n(&lanes.caches[other / __readyq_shard_factor].id, __ATOMIC_RELAXED) == this_cache) {
+		unsigned other  = (chaos >> 8) % (lanes_count);
+
+		if(ext < 3 || __atomic_load_n(&caches[other / __shard_factor.readyq].id, __ATOMIC_RELAXED) == this_cache) {
 			proc->rdq.target = other;
 		}
…
 	else {
 		const unsigned target = proc->rdq.target;
-		__cfadbg_print_safe(ready_queue, "Kernel : %u considering helping %u, tcsc %llu\n", this, target, lanes.tscs[target].tv);
-		/* paranoid */ verify( lanes.tscs[target].tv != MAX );
-		if(target < lanes.count) {
-			const unsigned long long cutoff = calc_cutoff(ctsc, proc, cltr->ready_queue);
-			const unsigned long long age = moving_average(ctsc, lanes.tscs[target].tv, lanes.tscs[target].ma);
+		__cfadbg_print_safe(ready_queue, "Kernel : %u considering helping %u, tcsc %llu\n", this, target, readyQ.tscs[target].tv);
+		/* paranoid */ verify( readyQ.tscs[target].tv != MAX );
+		if(target < lanes_count) {
+			const unsigned long long cutoff = calc_cutoff(ctsc, proc, lanes_count, cltr->sched.readyQ.data, cltr->sched.readyQ.tscs, __shard_factor.readyq);
+			const unsigned long long age = moving_average(ctsc, readyQ.tscs[target].tv, readyQ.tscs[target].ma);
 			__cfadbg_print_safe(ready_queue, "Kernel : Help attempt on %u from %u, age %'llu vs cutoff %'llu, %s\n", target, this, age, cutoff, age > cutoff ? "yes" : "no");
 			if(age > cutoff) {
…
 	}
 
-	for(__readyq_shard_factor) {
-		unsigned i = this + (proc->rdq.itr++ % __readyq_shard_factor);
+	for(__shard_factor.readyq) {
+		unsigned i = this + (proc->rdq.itr++ % __shard_factor.readyq);
 		if(thread$ * t = try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.local))) return t;
 	}
…
 
 }
-__attribute__((hot)) struct thread$ * pop_slow(struct cluster * cltr) with (cltr->ready_queue) {
-	unsigned i = __tls_rand() % lanes.count;
+__attribute__((hot)) struct thread$ * pop_slow(struct cluster * cltr) {
+	unsigned i = __tls_rand() % (cltr->sched.readyQ.count);
 	return try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.steal));
 }
…
 //-----------------------------------------------------------------------
 // try to pop from a lane given by index w
-static inline struct thread$ * try_pop(struct cluster * cltr, unsigned w __STATS(, __stats_readyQ_pop_t & stats)) with (cltr->ready_queue) {
-	/* paranoid */ verify( w < lanes.count );
+static inline struct thread$ * try_pop(struct cluster * cltr, unsigned w __STATS(, __stats_readyQ_pop_t & stats)) with (cltr->sched) {
+	const size_t lanes_count = readyQ.count;
+	/* paranoid */ verify( w < lanes_count );
 	__STATS( stats.attempt++; )
 
 	// Get relevant elements locally
-	__intrusive_lane_t & lane = lanes.data[w];
+	__intrusive_lane_t & lane = readyQ.data[w];
 
 	// If list looks empty retry
…
 	if (tsv != MAX) {
 		unsigned long long now = rdtscl();
-		unsigned long long pma = __atomic_load_n(&lanes.tscs[w].ma, __ATOMIC_RELAXED);
-		__atomic_store_n(&lanes.tscs[w].tv, tsv, __ATOMIC_RELAXED);
-		__atomic_store_n(&lanes.tscs[w].ma, moving_average(now, tsc_before, pma), __ATOMIC_RELAXED);
-	}
-
-	thrd->preferred = w / __readyq_shard_factor;
+		unsigned long long pma = __atomic_load_n(&readyQ.tscs[w].ma, __ATOMIC_RELAXED);
+		__atomic_store_n(&readyQ.tscs[w].tv, tsv, __ATOMIC_RELAXED);
+		__atomic_store_n(&readyQ.tscs[w].ma, moving_average(now, tsc_before, pma), __ATOMIC_RELAXED);
+	}
+
+	thrd->preferred = w / __shard_factor.readyq;
 
 	// return the popped thread
…
 // try to pop from any lanes making sure you don't miss any threads push
 // before the start of the function
-static inline struct thread$ * search(struct cluster * cltr) with (cltr->ready_queue) {
-	/* paranoid */ verify( lanes.count > 0 );
-	unsigned count = __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
+static inline struct thread$ * search(struct cluster * cltr) {
+	const size_t lanes_count = cltr->sched.readyQ.count;
+	/* paranoid */ verify( lanes_count > 0 );
+	unsigned count = __atomic_load_n( &lanes_count, __ATOMIC_RELAXED );
 	unsigned offset = __tls_rand();
 	for(i; count) {
…
 //-----------------------------------------------------------------------
 // Given 2 indexes, pick the list with the oldest push an try to pop from it
-static inline struct thread$ * try_pop(struct cluster * cltr, unsigned i, unsigned j __STATS(, __stats_readyQ_pop_t & stats)) with (cltr->ready_queue) {
+static inline struct thread$ * try_pop(struct cluster * cltr, unsigned i, unsigned j __STATS(, __stats_readyQ_pop_t & stats)) with (cltr->sched) {
 	// Pick the bet list
 	int w = i;
-	if( __builtin_expect(!is_empty(lanes.data[j]), true) ) {
-		w = (ts(lanes.data[i]) < ts(lanes.data[j])) ? i : j;
+	if( __builtin_expect(!is_empty(readyQ.data[j]), true) ) {
+		w = (ts(readyQ.data[i]) < ts(readyQ.data[j])) ? i : j;
 	}
 
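The file-local calc_cutoff removed above no longer takes the whole __ready_queue_t; the new call site passes the lane count, the lane array, the timestamp array and the shard factor explicitly, presumably so the helper can be shared with the I/O queues (its new definition is not part of this diff). A sketch of what the generalized helper would look like under that assumption, reconstructed from the removed body and the new call calc_cutoff(ctsc, proc, lanes_count, readyQ.data, readyQ.tscs, __shard_factor.readyq); parameter names and the __timestamp_t element type are guesses:

// Sketch only: reconstructed from the removed calc_cutoff and the new call site.
static inline unsigned long long calc_cutoff(
		const unsigned long long ctsc,      // current timestamp, rdtscl() at the call site
		const processor * proc,
		size_t lanes_count,
		__intrusive_lane_t * data,          // lane array, e.g. readyQ.data
		__timestamp_t * tscs,               // per-lane timestamps, e.g. readyQ.tscs
		unsigned shard_factor ) {           // e.g. __shard_factor.readyq
	unsigned start = proc->rdq.id;
	unsigned long long max = 0;
	for(i; shard_factor) {                              // Cforall range loop over this processor's own shard
		unsigned long long ptsc = ts(data[start + i]);  // timestamp reported for the lane by ts()
		if(ptsc != -1ull) {
			/* paranoid */ verify( start + i < lanes_count );
			unsigned long long tsc = moving_average(ctsc, ptsc, tscs[start + i].ma);
			if(tsc > max) max = tsc;
		}
	}
	return (max + 2 * max) / 2;                         // i.e. 1.5 * max: only lanes noticeably older than the local shard qualify
}

pop_fast then compares the moving-average age of the helping target against this cutoff and only helps when age > cutoff, as in the unchanged lines above.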