Context Navigation

← Previous Change
Next Change →

ready_queue.cfa

Timestamp:

Sep 27, 2021, 2:09:55 PM (4 years ago)

Author:

Thierry Delisle <tdelisle@…>

Branches:

ADT, ast-experimental, enum, forall-pointer-decay, master, pthread-emulation, qualifiedEnum

Children:

cc287800

Parents:

4e28d2e9 (diff), 056cbdb (diff)
Note: this is a merge changeset, the changes displayed below correspond to the merge itself.
Use the (diff) links above to see all the changes relative to each parent.

Message:

Merge branch 'master' of plg.uwaterloo.ca:software/cfa/cfa-cc

File:

: 1 edited

libcfa/src/concurrency/ready_queue.cfa (modified) (15 diffs)

Legend:

: Unmodified
: Added
: Removed

libcfa/src/concurrency/ready_queue.cfa

-              r4e28d2e9
+              r949339b
         #define __kernel_rseq_unregister rseq_unregister_current_thread
 #elif defined(CFA_HAVE_LINUX_RSEQ_H)
         void __kernel_raw_rseq_register  (void);
         void __kernel_raw_rseq_unregister(void);
+        static void __kernel_raw_rseq_register  (void);
+        static void __kernel_raw_rseq_unregister(void);
         #define __kernel_rseq_register __kernel_raw_rseq_register
 …
 // Cforall Ready Queue used for scheduling
 //=======================================================================
+unsigned long long moving_average(unsigned long long nval, unsigned long long oval) { // Blend a new sample (nval) into a previous average (oval) with fixed integer weights
+        const unsigned long long tw = 16; // total weight, used as the divisor
+        const unsigned long long nw = 4; // weight given to the new sample (4 out of 16)
+        const unsigned long long ow = tw - nw; // remaining weight goes to the previous average (12 out of 16)
+        return ((nw * nval) + (ow * oval)) / tw; // unsigned integer division: any fractional part is dropped
+}
 void ?{}(__ready_queue_t & this) with (this) {
         #if defined(USE_CPU_WORK_STEALING)
 …
                 lanes.data = alloc( lanes.count );
                 lanes.tscs = alloc( lanes.count );
+                lanes.help = alloc( cpu_info.hthrd_count );
                 for( idx; (size_t)lanes.count ) {
                         (lanes.data[idx]){};
                         lanes.tscs[idx].tv = rdtscl();
+                        lanes.tscs[idx].ma = rdtscl();
+                }
+                for( idx; (size_t)cpu_info.hthrd_count ) {
+                        lanes.help[idx].src = 0;
+                        lanes.help[idx].dst = 0;
+                        lanes.help[idx].tri = 0;
+                }
         #else
                 lanes.data  = 0p;
                 lanes.tscs  = 0p;
+                lanes.help  = 0p;
                 lanes.count = 0;
         #endif
 …
         free(lanes.data);
         free(lanes.tscs);
+        free(lanes.help);
+}
 //-----------------------------------------------------------------------
 #if defined(USE_CPU_WORK_STEALING)
         __attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, bool push_local) with (cltr->ready_queue) {
+        __attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, unpark_hint hint) with (cltr->ready_queue) {
                 __cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr);
                 processor * const proc = kernelTLS().this_processor;
+                const bool external = !push_local || (!proc) || (cltr != proc->cltr);
+                const bool external = (!proc) || (cltr != proc->cltr);
+                // Figure out the current cpu and make sure it is valid
                 const int cpu = __kernel_getcpu();
                 /* paranoid */ verify(cpu >= 0);
 …
                 /* paranoid */ verify(cpu * READYQ_SHARD_FACTOR < lanes.count);
+                const cpu_map_entry_t & map = cpu_info.llc_map[cpu];
+                // Figure out where thread was last time and make sure it's valid
+                /* paranoid */ verify(thrd->preferred >= 0);
+                /* paranoid */ verify(thrd->preferred < cpu_info.hthrd_count);
+                /* paranoid */ verify(thrd->preferred * READYQ_SHARD_FACTOR < lanes.count);
+                const int prf = thrd->preferred * READYQ_SHARD_FACTOR;
+                const cpu_map_entry_t & map;
+                choose(hint) {
+                        case UNPARK_LOCAL : &map = &cpu_info.llc_map[cpu];
+                        case UNPARK_REMOTE: &map = &cpu_info.llc_map[prf];
+                }
                 /* paranoid */ verify(map.start * READYQ_SHARD_FACTOR < lanes.count);
                 /* paranoid */ verify(map.self * READYQ_SHARD_FACTOR < lanes.count);
 …
                         if(unlikely(external)) { r = __tls_rand(); }
                         else { r = proc->rdq.its++; }
+                        i = start + (r % READYQ_SHARD_FACTOR);
+                        choose(hint) {
+                                case UNPARK_LOCAL : i = start + (r % READYQ_SHARD_FACTOR);
+                                case UNPARK_REMOTE: i = prf   + (r % READYQ_SHARD_FACTOR);
+                        }
                         // If we can't lock it retry
                 } while( !__atomic_try_acquire( &lanes.data[i].lock ) );
 …
                 processor * const proc = kernelTLS().this_processor;
                 const int start = map.self * READYQ_SHARD_FACTOR;
+                const unsigned long long ctsc = rdtscl();
                 // Did we already have a help target
                 if(proc->rdq.target == -1u) {
+                        // If we don't have a help target yet
+                        unsigned long long min = ts(lanes.data[start]);
+                        unsigned long long max = 0;
                         for(i; READYQ_SHARD_FACTOR) {
+                                unsigned long long tsc = ts(lanes.data[start + i]);
+                                if(tsc < min) min = tsc;
+                        }
+                        proc->rdq.cutoff = min;
+                                unsigned long long tsc = moving_average(ctsc - ts(lanes.data[start + i]), lanes.tscs[start + i].ma);
+                                if(tsc > max) max = tsc;
+                        }
+                         proc->rdq.cutoff = (max + 2 * max) / 2;
                         /* paranoid */ verify(lanes.count < 65536); // The following code assumes max 65536 cores.
                         /* paranoid */ verify(map.count < 65536); // The following code assumes max 65536 cores.
                         if(0 == (__tls_rand() % 10_000)) {
+                        if(0 == (__tls_rand() % 100)) {
                                 proc->rdq.target = __tls_rand() % lanes.count;
                         } else {
 …
+                }
                 else {
+                        const unsigned long long bias = 0; //2_500_000_000;
+                        const unsigned long long cutoff = proc->rdq.cutoff > bias ? proc->rdq.cutoff - bias : proc->rdq.cutoff;
+                        unsigned long long max = 0;
+                        for(i; READYQ_SHARD_FACTOR) {
+                                unsigned long long tsc = moving_average(ctsc - ts(lanes.data[start + i]), lanes.tscs[start + i].ma);
+                                if(tsc > max) max = tsc;
+                        }
+                        const unsigned long long cutoff = (max + 2 * max) / 2;
+                        {
                                 unsigned target = proc->rdq.target;
                                 proc->rdq.target = -1u;
+                                if(lanes.tscs[target].tv < cutoff && ts(lanes.data[target]) < cutoff) {
+                                lanes.help[target / READYQ_SHARD_FACTOR].tri++;
+                                if(moving_average(ctsc - lanes.tscs[target].tv, lanes.tscs[target].ma) > cutoff) {
                                         thread$ * t = try_pop(cltr, target __STATS(, __tls_stats()->ready.pop.help));
                                         proc->rdq.last = target;
                                         if(t) return t;
+                                        else proc->rdq.target = -1u;
+                                }
+                                else proc->rdq.target = -1u;
+                        }
 …
+        }
         __attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, bool push_local) with (cltr->ready_queue) {
+        __attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, unpark_hint hint) with (cltr->ready_queue) {
                 __cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr);
                 const bool external = !push_local || (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr);
+                const bool external = (hint != UNPARK_LOCAL) || (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr);
                 /* paranoid */ verify(external || kernelTLS().this_processor->rdq.id < lanes.count );
 …
 #endif
 #if defined(USE_WORK_STEALING)
         __attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, bool push_local) with (cltr->ready_queue) {
+        __attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, unpark_hint hint) with (cltr->ready_queue) {
                 __cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr);
                 // #define USE_PREFERRED
                 #if !defined(USE_PREFERRED)
                 const bool external = !push_local || (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr);
+                const bool external = (hint != UNPARK_LOCAL) || (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr);
                 /* paranoid */ verify(external || kernelTLS().this_processor->rdq.id < lanes.count );
                 #else
                         unsigned preferred = thrd->preferred;
                         const bool external = push_local || (!kernelTLS().this_processor) || preferred == -1u || thrd->curr_cluster != cltr;
+                        const bool external = (hint != UNPARK_LOCAL) || (!kernelTLS().this_processor) || preferred == -1u || thrd->curr_cluster != cltr;
                         /* paranoid */ verifyf(external || preferred < lanes.count, "Invalid preferred queue %u for %u lanes", preferred, lanes.count );
 …
         // Actually pop the list
         struct thread$ * thrd;
+        unsigned long long tsc_before = ts(lane);
         unsigned long long tsv;
         [thrd, tsv] = pop(lane);
 …
         __STATS( stats.success++; )
+        #if defined(USE_WORK_STEALING)
+        #if defined(USE_WORK_STEALING) || defined(USE_CPU_WORK_STEALING)
+                unsigned long long now = rdtscl();
                 lanes.tscs[w].tv = tsv;
+                lanes.tscs[w].ma = moving_average(now > tsc_before ? now - tsc_before : 0, lanes.tscs[w].ma);
         #endif
+        thrd->preferred = w;
+        #if defined(USE_CPU_WORK_STEALING)
+                thrd->preferred = w / READYQ_SHARD_FACTOR;
+        #else
+                thrd->preferred = w;
+        #endif
         // return the popped thread
 …
 //-----------------------------------------------------------------------
+// Get the preferred value for a new thread: taken from the current thread when publicTLS_get( this_thread ) yields one
+unsigned ready_queue_new_preferred() {
+        unsigned pref = 0; // value returned when there is no current thread and USE_CPU_WORK_STEALING is not defined
+        if(struct thread$ * thrd = publicTLS_get( this_thread )) {
+                pref = thrd->preferred; // reuse the current thread's preferred value
+        }
+        else {
+                #if defined(USE_CPU_WORK_STEALING)
+                        pref = __kernel_getcpu(); // no current thread: use the value returned by __kernel_getcpu() instead
+                #endif
+        }
+        #if defined(USE_CPU_WORK_STEALING)
+                /* paranoid */ verify(pref >= 0); // NOTE(review): pref is unsigned, so this comparison is always true
+                /* paranoid */ verify(pref < cpu_info.hthrd_count); // under CPU work stealing the result must be below cpu_info.hthrd_count
+        #endif
+        return pref;
+}
+//-----------------------------------------------------------------------
 // Check that all the intrusive queues in the data structure are still consistent
 static void check( __ready_queue_t & q ) with (q) {
 …
         extern void __enable_interrupts_hard();
         void __kernel_raw_rseq_register  (void) {
+        static void __kernel_raw_rseq_register  (void) {
                 /* paranoid */ verify( __cfaabi_rseq.cpu_id == RSEQ_CPU_ID_UNINITIALIZED );
 …
+        }
         void __kernel_raw_rseq_unregister(void) {
+        static void __kernel_raw_rseq_unregister(void) {
                 /* paranoid */ verify( __cfaabi_rseq.cpu_id >= 0 );

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 949339b for libcfa/src/concurrency/ready_queue.cfa

Legend:

libcfa/src/concurrency/ready_queue.cfa

Download in other formats: