Context Navigation

← Previous Change
Next Change →

Changeset 175f9f4 for libcfa/src/concurrency

Timestamp:

Jan 18, 2022, 1:16:23 PM (5 years ago)

Author:

Thierry Delisle <tdelisle@…>

Branches:

ADT, ast-experimental, enum, forall-pointer-decay, master, pthread-emulation, qualifiedEnum, stuck-waitfor-destruct

Children:

1e8b4b49, adfd125

Parents:

21a5bfb7 (diff), 91a72ef (diff)
Note: this is a merge changeset, the changes displayed below correspond to the merge itself.
Use the (diff) links above to see all the changes relative to each parent.

Message:

Merge branch 'master' of plg.uwaterloo.ca:software/cfa/cfa-cc

Location:

libcfa/src/concurrency

Files:

: 10 edited

clib/cfathread.cfa (modified) (2 diffs)
invoke.h (modified) (2 diffs)
io.cfa (modified) (1 diff)
kernel.cfa (modified) (1 diff)
kernel.hfa (modified) (4 diffs)
kernel/fwd.hfa (modified) (3 diffs)
kernel/startup.cfa (modified) (6 diffs)
locks.hfa (modified) (2 diffs)
ready_queue.cfa (modified) (24 diffs)
thread.cfa (modified) (4 diffs)

Legend:

: Unmodified
: Added
: Removed

libcfa/src/concurrency/clib/cfathread.cfa

-              r21a5bfb7
+              r175f9f4
 #include "thread.hfa"
 #include "time.hfa"
+#include "stdlib.hfa"
 #include "cfathread.h"
 …
                                 eevent.data.u64 = (uint64_t)active_thread();
                                 int id = thread_rand() % poller_cnt;
+                                int id = prng() % poller_cnt;
                                 if(0 != epoll_ctl(poller_fds[id], EPOLL_CTL_ADD, fd, &eevent))
+                                {

libcfa/src/concurrency/invoke.h

-              r21a5bfb7
+              r175f9f4
 // Created On       : Tue Jan 17 12:27:26 2016
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Thu Jan  6 16:37:40 2022
 // Update Count     : 47
+// Last Modified On : Sun Jan  9 19:06:45 2022
+// Update Count     : 48
 //
 …
                 struct processor * last_proc;
+                uint32_t random_state;                                                  // fast random numbers
                 #if defined( __CFA_WITH_VERIFY__ )
                         void * canary;

libcfa/src/concurrency/io.cfa

r21a5bfb7	r175f9f4
552	552	/* paranoid */ verify( proc == __cfaabi_tls.this_processor );
553	553	/* paranoid */ verify( ! __preemption_enabled() );
	554
	555	return true;
554	556	}
555	557	#endif

libcfa/src/concurrency/kernel.cfa

r21a5bfb7	r175f9f4
554	554	/* paranoid */ verify( 0x0D15EA5E0D15EA5Ep == thrd->canary );
555	555
556		~~const bool local = thrd->state != Start;~~
557	556	if (thrd->preempted == __NO_PREEMPTION) thrd->state = Ready;
558	557

libcfa/src/concurrency/kernel.hfa

-              r21a5bfb7
+              r175f9f4
                 unsigned target;
                 unsigned last;
+                unsigned cnt;
+                unsigned long long int cutoff;
+                signed   cpu;
         } rdq;
 …
         volatile unsigned long long tv;
         volatile unsigned long long ma;
+};
+struct __attribute__((aligned(16))) __cache_id_t {
+        volatile unsigned id;
 };
 …
 static inline void ^?{}(__timestamp_t & this) {}
+struct __attribute__((aligned(128))) __ready_queue_caches_t;
+void  ?{}(__ready_queue_caches_t & this);
+void ^?{}(__ready_queue_caches_t & this);
 //TODO adjust cache size to ARCHITECTURE
 // Structure holding the relaxed ready queue
+// Structure holding the ready queue
 struct __ready_queue_t {
         // Data tracking the actual lanes
 …
                 // Array of times
                 __timestamp_t * volatile tscs;
+                __cache_id_t * volatile caches;
                 // Array of stats

libcfa/src/concurrency/kernel/fwd.hfa

-              r21a5bfb7
+              r175f9f4
                 static inline uint64_t __tls_rand() {
+                        return
                         #if defined(__SIZEOF_INT128__)
                                 return __lehmer64( kernelTLS().rand_seed );
+                                lehmer64( kernelTLS().rand_seed );
                         #else
                                 return __xorshift64( kernelTLS().rand_seed );
+                                xorshift_13_7_17( kernelTLS().rand_seed );
                         #endif
+                }
-                #define M  (1_l64u << 48_l64u)
-                #define A  (25214903917_l64u)
-                #define AI (18446708753438544741_l64u)
-                #define C  (11_l64u)
-                #define D  (16_l64u)
                 static inline unsigned __tls_rand_fwd() {
+                        kernelTLS().ready_rng.fwd_seed = (A * kernelTLS().ready_rng.fwd_seed + C) & (M - 1);
+                        return kernelTLS().ready_rng.fwd_seed >> D;
+                        return LCGBI_fwd( kernelTLS().ready_rng.fwd_seed );
+                }
                 static inline unsigned __tls_rand_bck() {
+                        unsigned int r = kernelTLS().ready_rng.bck_seed >> D;
+                        kernelTLS().ready_rng.bck_seed = AI * (kernelTLS().ready_rng.bck_seed - C) & (M - 1);
+                        return r;
+                }
+                #undef M
+                #undef A
+                #undef AI
+                #undef C
+                #undef D
+                        return LCGBI_bck( kernelTLS().ready_rng.bck_seed );
+                }
                 static inline void __tls_rand_advance_bck(void) {
 …
+                }
+        }
         extern void disable_interrupts();
 …
+                        }
+                }
-                extern uint64_t thread_rand();
                 // Semaphore which only supports a single thread

libcfa/src/concurrency/kernel/startup.cfa

-              r21a5bfb7
+              r175f9f4
 #include "kernel_private.hfa"
 #include "startup.hfa"          // STARTUP_PRIORITY_XXX
+#include "limits.hfa"
 #include "math.hfa"
 …
 extern void __wake_proc(processor *);
 extern int cfa_main_returned;                                                   // from interpose.cfa
+extern uint32_t __global_random_seed;
 //-----------------------------------------------------------------------------
 …
         this.context = &storage_mainThreadCtx;
+}
 …
         preferred = ready_queue_new_preferred();
         last_proc = 0p;
+        random_state = __global_random_seed;
         #if defined( __CFA_WITH_VERIFY__ )
                 canary = 0x0D15EA5E0D15EA5Ep;
 …
         this.rdq.its = 0;
         this.rdq.itr = 0;
+        this.rdq.id  = -1u;
+        this.rdq.target = -1u;
+        this.rdq.last = -1u;
+        this.rdq.cutoff = 0ull;
+        this.rdq.id  = MAX;
+        this.rdq.target = MAX;
+        this.rdq.last = MAX;
+        this.rdq.cpu = 0;
+        // this.rdq.cutoff = 0ull;
         do_terminate = false;
         preemption_alarm = 0p;
 …
         uint_fast32_t last_size;
         [this->unique_id, last_size] = ready_mutate_register();
+                this->rdq.cpu = __kernel_getcpu();
                 this->cltr->procs.total += 1u;

libcfa/src/concurrency/locks.hfa

-              r21a5bfb7
+              r175f9f4
 #include "time_t.hfa"
 #include "time.hfa"
-//-----------------------------------------------------------------------------
-// Semaphores
-// '0-nary' semaphore
-// Similar to a counting semaphore except the value of one is never reached
-// as a consequence, a V() that would bring the value to 1 *spins* until
-// a P consumes it
-struct Semaphore0nary {
-        __spinlock_t lock; // needed to protect
-        mpsc_queue(thread$) queue;
-};
-static inline bool P(Semaphore0nary & this, thread$ * thrd) {
-        /* paranoid */ verify(!thrd`next);
-        /* paranoid */ verify(!(&(*thrd)`next));
-        push(this.queue, thrd);
-        return true;
+}
-static inline bool P(Semaphore0nary & this) {
-    thread$ * thrd = active_thread();
-    P(this, thrd);
-    park();
-    return true;
+}
-static inline thread$ * V(Semaphore0nary & this, bool doUnpark = true) {
-        thread$ * next;
-        lock(this.lock __cfaabi_dbg_ctx2);
-                for (;;) {
-                        next = pop(this.queue);
-                        if (next) break;
-                        Pause();
+                }
-        unlock(this.lock);
-        if (doUnpark) unpark(next);
-        return next;
+}
-// Wrapper used on top of any semaphore to avoid potential locking
-struct BinaryBenaphore {
-        volatile ssize_t counter;
-};
-static inline {
-        void ?{}(BinaryBenaphore & this) { this.counter = 0; }
-        void ?{}(BinaryBenaphore & this, zero_t) { this.counter = 0; }
-        void ?{}(BinaryBenaphore & this, one_t ) { this.counter = 1; }
-        // returns true if no blocking needed
-        bool P(BinaryBenaphore & this) {
-                return __atomic_fetch_sub(&this.counter, 1, __ATOMIC_SEQ_CST) > 0;
+        }
-        bool tryP(BinaryBenaphore & this) {
-                ssize_t c = this.counter;
-                /* paranoid */ verify( c > MIN );
-                return (c >= 1) && __atomic_compare_exchange_n(&this.counter, &c, c-1, false, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
+        }
-        // returns true if notify needed
-        bool V(BinaryBenaphore & this) {
-                ssize_t c = 0;
-                for () {
-                        /* paranoid */ verify( this.counter < MAX );
-                        if (__atomic_compare_exchange_n(&this.counter, &c, c+1, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
-                                if (c == 0) return true;
-                                /* paranoid */ verify(c < 0);
-                                return false;
-                        } else {
-                                if (c == 1) return true;
-                                /* paranoid */ verify(c < 1);
-                                Pause();
+                        }
+                }
+        }
+}
-// Binary Semaphore based on the BinaryBenaphore on top of the 0-nary Semaphore
-struct ThreadBenaphore {
-        BinaryBenaphore ben;
-        Semaphore0nary  sem;
-};
-static inline void ?{}(ThreadBenaphore & this) {}
-static inline void ?{}(ThreadBenaphore & this, zero_t) { (this.ben){ 0 }; }
-static inline void ?{}(ThreadBenaphore & this, one_t ) { (this.ben){ 1 }; }
-static inline bool P(ThreadBenaphore & this)              { return P(this.ben) ? false : P(this.sem); }
-static inline bool tryP(ThreadBenaphore & this)           { return tryP(this.ben); }
-static inline bool P(ThreadBenaphore & this, bool wait)   { return wait ? P(this) : tryP(this); }
-static inline thread$ * V(ThreadBenaphore & this, bool doUnpark = true) {
-        if (V(this.ben)) return 0p;
-        return V(this.sem, doUnpark);
+}
 //-----------------------------------------------------------------------------
 …
 static inline void   on_wakeup( owner_lock & this, size_t v ) { on_wakeup ( (blocking_lock &)this, v ); }
 static inline void   on_notify( owner_lock & this, struct thread$ * t ) { on_notify( (blocking_lock &)this, t ); }
-struct fast_lock {
-        thread$ * volatile owner;
-        ThreadBenaphore sem;
-};
-static inline void ?{}(fast_lock & this) __attribute__((deprecated("use linear_backoff_then_block_lock instead")));
-static inline void ?{}(fast_lock & this) { this.owner = 0p; }
-static inline bool $try_lock(fast_lock & this, thread$ * thrd) {
-    thread$ * exp = 0p;
-    return __atomic_compare_exchange_n(&this.owner, &exp, thrd, false, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
+}
-static inline void lock( fast_lock & this ) __attribute__((deprecated("use linear_backoff_then_block_lock instead"), artificial));
-static inline void lock( fast_lock & this ) {
-        thread$ * thrd = active_thread();
-        /* paranoid */verify(thrd != this.owner);
-        for (;;) {
-                if ($try_lock(this, thrd)) return;
-                P(this.sem);
+        }
+}
-static inline bool try_lock( fast_lock & this ) __attribute__((deprecated("use linear_backoff_then_block_lock instead"), artificial));
-static inline bool try_lock ( fast_lock & this ) {
-        thread$ * thrd = active_thread();
-        /* paranoid */ verify(thrd != this.owner);
-        return $try_lock(this, thrd);
+}
-static inline thread$ * unlock( fast_lock & this ) __attribute__((deprecated("use linear_backoff_then_block_lock instead"), artificial));
-static inline thread$ * unlock( fast_lock & this ) {
-        /* paranoid */ verify(active_thread() == this.owner);
-        // open 'owner' before unlocking anyone
-        // so new and unlocked threads don't park incorrectly.
-        // This may require additional fencing on ARM.
-        this.owner = 0p;
-        return V(this.sem);
+}
-static inline size_t on_wait( fast_lock & this ) { unlock(this); return 0; }
-static inline void on_wakeup( fast_lock & this, size_t ) { lock(this); }
-static inline void on_notify( fast_lock &, struct thread$ * t ) { unpark(t); }
 struct mcs_node {

libcfa/src/concurrency/ready_queue.cfa

-              r21a5bfb7
+              r175f9f4
 #define USE_RELAXED_FIFO
+// #define USE_RELAXED_FIFO
 // #define USE_WORK_STEALING
 // #define USE_CPU_WORK_STEALING
+#define USE_AWARE_STEALING
 #include "bits/defs.hfa"
 …
 #include "stdlib.hfa"
+#include "limits.hfa"
 #include "math.hfa"
 …
 #endif
+#if   defined(USE_CPU_WORK_STEALING)
+#if   defined(USE_AWARE_STEALING)
+        #define READYQ_SHARD_FACTOR 2
+        #define SEQUENTIAL_SHARD 2
+#elif defined(USE_CPU_WORK_STEALING)
         #define READYQ_SHARD_FACTOR 2
 #elif defined(USE_RELAXED_FIFO)
 …
         __kernel_rseq_register();
-        __cfadbg_print_safe(ready_queue, "Kernel : Registering proc %p for RW-Lock\n", proc);
         bool * handle = (bool *)&kernelTLS().sched_lock;
 …
+        }
-        __cfadbg_print_safe(ready_queue, "Kernel : Registering proc %p done, id %lu\n", proc, n);
         // Return new spot.
         /* paranoid */ verify(n < ready);
 …
         __atomic_store_n(cell, 0p, __ATOMIC_RELEASE);
-        __cfadbg_print_safe(ready_queue, "Kernel : Unregister proc %p\n", proc);
         __kernel_rseq_unregister();
 …
 //=======================================================================
+// caches handling
+struct __attribute__((aligned(128))) __ready_queue_caches_t { // Counter tracking whether a cache is being looked after; the type is aligned to 128 bytes.
+        // Count States:
+        // - 0  : No one is looking after this cache
+        // - 1  : No one is looking after this cache, BUT it's not empty
+        // - 2+ : At least one processor is looking after this cache
+        volatile unsigned count;
+};
+void  ?{}(__ready_queue_caches_t & this) { this.count = 0; } // constructor: start in count state 0 (no one is looking after this cache)
+void ^?{}(__ready_queue_caches_t & this) {} // destructor: intentionally empty
+static inline void depart(__ready_queue_caches_t & cache) { // Atomically decrement cache.count by one (see the count states documented on __ready_queue_caches_t).
+        /* paranoid */ verify( cache.count > 1); // checked before the decrement: the count must be 2 or more on entry
+        __atomic_fetch_add(&cache.count, -1, __ATOMIC_SEQ_CST); // sequentially consistent atomic decrement
+        /* paranoid */ verify( cache.count != 0); // re-reads the count after the decrement; it must not have dropped to 0
+        /* paranoid */ verify( cache.count < 65536 ); // Sanity bound: assumes no cluster maps 65536 or more kernel threads to a single cache (possible in principle, but highly unusual).
+}
+static inline void arrive(__ready_queue_caches_t & cache) { // Currently a no-op: the body below is entirely commented out and 'cache' is never touched.
+        // for() {
+        //      unsigned expected = cache.count;
+        //      unsigned desired  = 0 == expected ? 2 : expected + 1;
+        // }
+}
+//=======================================================================
 // Cforall Ready Queue used for scheduling
 //=======================================================================
+unsigned long long moving_average(unsigned long long nval, unsigned long long oval) {
+        const unsigned long long tw = 16;
+        const unsigned long long nw = 4;
+        const unsigned long long ow = tw - nw;
+        return ((nw * nval) + (ow * oval)) / tw;
+unsigned long long moving_average(unsigned long long currtsc, unsigned long long instsc, unsigned long long old_avg) {
+        /* paranoid */ verifyf( currtsc < 45000000000000000, "Suspiciously large current time: %'llu (%llx)\n", currtsc, currtsc );
+        /* paranoid */ verifyf( instsc  < 45000000000000000, "Suspiciously large insert time: %'llu (%llx)\n", instsc, instsc );
+        /* paranoid */ verifyf( old_avg < 15000000000000, "Suspiciously large previous average: %'llu (%llx)\n", old_avg, old_avg );
+        const unsigned long long new_val = currtsc > instsc ? currtsc - instsc : 0;
+        const unsigned long long total_weight = 16;
+        const unsigned long long new_weight   = 4;
+        const unsigned long long old_weight = total_weight - new_weight;
+        const unsigned long long ret = ((new_weight * new_val) + (old_weight * old_avg)) / total_weight;
+        return ret;
+}
 …
+                }
         #else
+                lanes.data  = 0p;
+                lanes.tscs  = 0p;
+                lanes.help  = 0p;
+                lanes.count = 0;
+                lanes.data   = 0p;
+                lanes.tscs   = 0p;
+                lanes.caches = 0p;
+                lanes.help   = 0p;
+                lanes.count  = 0;
         #endif
+}
 …
         free(lanes.data);
         free(lanes.tscs);
+        free(lanes.caches);
         free(lanes.help);
+}
 //-----------------------------------------------------------------------
+#if defined(USE_AWARE_STEALING)
+        __attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, unpark_hint hint) with (cltr->ready_queue) { // Push 'thrd' onto one lane of cltr's ready queue; the lane is picked from thrd->preferred, at random, or from the calling processor's own lanes.
+                processor * const proc = kernelTLS().this_processor;
+                const bool external = (!proc) || (cltr != proc->cltr); // no current processor, or the current processor belongs to another cluster
+                const bool remote   = hint == UNPARK_REMOTE;
+                unsigned i; // index of the lane whose lock was acquired; assigned by exactly one of the retry loops below
+                if( external || remote ) {
+                        // Use the lanes the thread last ran on (thrd->preferred), provided they still exist
+                        /* paranoid */ verify(thrd->preferred >= 0); // NOTE(review): always true if 'preferred' is an unsigned type -- confirm its declaration
+                        if(thrd->preferred * READYQ_SHARD_FACTOR < lanes.count) {
+                                /* paranoid */ verify(thrd->preferred * READYQ_SHARD_FACTOR < lanes.count); // same condition as the enclosing if
+                                unsigned start = thrd->preferred * READYQ_SHARD_FACTOR; // first lane of the preferred group
+                                do {
+                                        unsigned r = __tls_rand(); // 64-bit result narrowed to unsigned
+                                        i = start + (r % READYQ_SHARD_FACTOR);
+                                        /* paranoid */ verify( i < lanes.count );
+                                        // Keep picking a random lane of the group until one try-lock succeeds
+                                } while( !__atomic_try_acquire( &lanes.data[i].lock ) );
+                        } else {
+                                do { // preferred group is out of range: pick any lane at random until one try-lock succeeds
+                                        i = __tls_rand() % lanes.count;
+                                } while( !__atomic_try_acquire( &lanes.data[i].lock ) );
+                        }
+                } else {
+                        do {
+                                unsigned r = proc->rdq.its++; // per-processor counter: successive attempts alternate over the processor's lanes
+                                i = proc->rdq.id + (r % READYQ_SHARD_FACTOR);
+                                /* paranoid */ verify( i < lanes.count );
+                                // Move on to the next local lane until one try-lock succeeds
+                        } while( !__atomic_try_acquire( &lanes.data[i].lock ) );
+                }
+                // Lane i is locked here: append the thread to it
+                push(lanes.data[i], thrd);
+                // Release the lane lock
+                __atomic_unlock( &lanes.data[i].lock );
+                #if !defined(__CFA_NO_STATISTICS__)
+                        if(unlikely(external || remote)) __atomic_fetch_add(&cltr->stats->ready.push.extrn.success, 1, __ATOMIC_RELAXED); // cluster-wide counter, updated atomically
+                        else __tls_stats()->ready.push.local.success++; // counter reached through __tls_stats(), plain increment
+                #endif
+        }
+        static inline unsigned long long calc_cutoff(const unsigned long long ctsc, const processor * proc, __ready_queue_t & rdq) { // Returns 1.5x the largest moving_average() value over proc's own lanes, given current timestamp 'ctsc'; pop_fast uses it as the threshold for helping another lane.
+                unsigned start = proc->rdq.id; // index of the first of proc's READYQ_SHARD_FACTOR lanes
+                unsigned long long max = 0;
+                for(i; READYQ_SHARD_FACTOR) {
+                        unsigned long long ptsc = ts(rdq.lanes.data[start + i]);
+                        if(ptsc != -1ull) { // lanes reporting -1ull are skipped (presumably the "empty lane" marker -- confirm against ts())
+                                /* paranoid */ verify( start + i < rdq.lanes.count ); // NOTE(review): this bound is checked only after lanes.data[start + i] was already read above
+                                unsigned long long tsc = moving_average(ctsc, ptsc, rdq.lanes.tscs[start + i].ma);
+                                if(tsc > max) max = tsc;
+                        }
+                }
+                return (max + 2 * max) / 2; // i.e. 3 * max / 2 with integer division; 0 when every lane was skipped
+        }
+        __attribute__((hot)) struct thread$ * pop_fast(struct cluster * cltr) with (cltr->ready_queue) { // Try to pop a thread: first from a previously chosen help target (if it is old enough), then from the calling processor's own lanes; returns 0p if nothing was popped.
+                /* paranoid */ verify( lanes.count > 0 );
+                /* paranoid */ verify( kernelTLS().this_processor );
+                /* paranoid */ verify( kernelTLS().this_processor->rdq.id < lanes.count );
+                processor * const proc = kernelTLS().this_processor;
+                unsigned this = proc->rdq.id; // index of the first of this processor's lanes
+                /* paranoid */ verify( this < lanes.count );
+                __cfadbg_print_safe(ready_queue, "Kernel : pop from %u\n", this);
+                // Look up the cache id associated with the cpu we are currently running on
+                const int cpu = __kernel_getcpu();
+                /* paranoid */ verify(cpu >= 0);
+                /* paranoid */ verify(cpu < cpu_info.hthrd_count);
+                unsigned this_cache = cpu_info.llc_map[cpu].cache;
+                // Super important: don't write the same value over and over again
+                // We want to maximise the chances that this particular value stays in cache
+                if(lanes.caches[this / READYQ_SHARD_FACTOR].id != this_cache)
+                        __atomic_store_n(&lanes.caches[this / READYQ_SHARD_FACTOR].id, this_cache, __ATOMIC_RELAXED);
+                const unsigned long long ctsc = rdtscl();
+                if(proc->rdq.target == MAX) { // MAX means "no help target": maybe select one for a later call
+                        uint64_t chaos = __tls_rand();
+                        unsigned ext = chaos & 0xff; // low byte, 0..255
+                        unsigned other  = (chaos >> 8) % (lanes.count); // random candidate lane
+                        if(ext < 3 || __atomic_load_n(&lanes.caches[other / READYQ_SHARD_FACTOR].id, __ATOMIC_RELAXED) == this_cache) { // accept when the candidate's recorded cache id matches ours, or regardless of cache for 3 of the 256 values of 'ext'
+                                proc->rdq.target = other;
+                        }
+                }
+                else {
+                        const unsigned target = proc->rdq.target;
+                        __cfadbg_print_safe(ready_queue, "Kernel : %u considering helping %u, tcsc %llu\n", this, target, lanes.tscs[target].tv);
+                        /* paranoid */ verify( lanes.tscs[target].tv != MAX );
+                        if(target < lanes.count) { // NOTE(review): lanes.tscs[target] is already indexed by the debug print and verify above, before this bound check
+                                const unsigned long long cutoff = calc_cutoff(ctsc, proc, cltr->ready_queue);
+                                const unsigned long long age = moving_average(ctsc, lanes.tscs[target].tv, lanes.tscs[target].ma);
+                                __cfadbg_print_safe(ready_queue, "Kernel : Help attempt on %u from %u, age %'llu vs cutoff %'llu, %s\n", target, this, age, cutoff, age > cutoff ? "yes" : "no");
+                                if(age > cutoff) { // only help when the target's age exceeds the cutoff computed from our own lanes
+                                        thread$ * t = try_pop(cltr, target __STATS(, __tls_stats()->ready.pop.help));
+                                        if(t) return t;
+                                }
+                        }
+                        proc->rdq.target = MAX; // the target is cleared whether or not a help pop was attempted
+                }
+                for(READYQ_SHARD_FACTOR) { // local lanes: each attempt uses proc->rdq.itr to choose the lane, then advances it
+                        unsigned i = this + (proc->rdq.itr++ % READYQ_SHARD_FACTOR);
+                        if(thread$ * t = try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.local))) return t;
+                }
+                // No help pop or local pop succeeded: return 0p
+                return 0p;
+        }
+        __attribute__((hot)) struct thread$ * pop_slow(struct cluster * cltr) with (cltr->ready_queue) { // Single steal attempt: try_pop on one lane chosen uniformly-by-modulo from all lanes of the cluster.
+                unsigned i = __tls_rand() % lanes.count; // random lane index in [0, lanes.count)
+                return try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.steal)); // result of try_pop is returned as-is; counted under the 'steal' statistics
+        }
+        __attribute__((hot)) struct thread$ * pop_search(struct cluster * cltr) { // Thin wrapper: forwards to search(cltr) and returns its result.
+                return search(cltr);
+        }
+#endif
 #if defined(USE_CPU_WORK_STEALING)
         __attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, unpark_hint hint) with (cltr->ready_queue) {
 …
                 /* paranoid */ verify( kernelTLS().this_processor );
+                processor * const proc = kernelTLS().this_processor;
                 const int cpu = __kernel_getcpu();
                 /* paranoid */ verify(cpu >= 0);
 …
                 /* paranoid */ verifyf((map.start + map.count) * READYQ_SHARD_FACTOR <= lanes.count, "have %zu lanes but map can go up to %u", lanes.count, (map.start + map.count) * READYQ_SHARD_FACTOR);
-                processor * const proc = kernelTLS().this_processor;
                 const int start = map.self * READYQ_SHARD_FACTOR;
                 const unsigned long long ctsc = rdtscl();
                 // Did we already have a help target
                 if(proc->rdq.target == -1u) {
+                if(proc->rdq.target == MAX) {
                         unsigned long long max = 0;
                         for(i; READYQ_SHARD_FACTOR) {
                                 unsigned long long tsc = moving_average(ctsc - ts(lanes.data[start + i]), lanes.tscs[start + i].ma);
+                                unsigned long long tsc = moving_average(ctsc, ts(lanes.data[start + i]), lanes.tscs[start + i].ma);
                                 if(tsc > max) max = tsc;
+                        }
                          proc->rdq.cutoff = (max + 2 * max) / 2;
+                        //  proc->rdq.cutoff = (max + 2 * max) / 2;
                         /* paranoid */ verify(lanes.count < 65536); // The following code assumes max 65536 cores.
                         /* paranoid */ verify(map.count < 65536); // The following code assumes max 65536 cores.
 …
+                        }
                         /* paranoid */ verify(proc->rdq.target != -1u);
+                        /* paranoid */ verify(proc->rdq.target != MAX);
+                }
                 else {
                         unsigned long long max = 0;
                         for(i; READYQ_SHARD_FACTOR) {
                                 unsigned long long tsc = moving_average(ctsc - ts(lanes.data[start + i]), lanes.tscs[start + i].ma);
+                                unsigned long long tsc = moving_average(ctsc, ts(lanes.data[start + i]), lanes.tscs[start + i].ma);
                                 if(tsc > max) max = tsc;
+                        }
 …
+                        {
                                 unsigned target = proc->rdq.target;
                                 proc->rdq.target = -1u;
+                                proc->rdq.target = MAX;
                                 lanes.help[target / READYQ_SHARD_FACTOR].tri++;
                                 if(moving_average(ctsc - lanes.tscs[target].tv, lanes.tscs[target].ma) > cutoff) {
+                                if(moving_average(ctsc, lanes.tscs[target].tv, lanes.tscs[target].ma) > cutoff) {
                                         thread$ * t = try_pop(cltr, target __STATS(, __tls_stats()->ready.pop.help));
                                         proc->rdq.last = target;
                                         if(t) return t;
-                                        else proc->rdq.target = -1u;
+                                }
                                 else proc->rdq.target = -1u;
+                                proc->rdq.target = MAX;
+                        }
                         unsigned last = proc->rdq.last;
                         if(last != -1u && lanes.tscs[last].tv < cutoff && ts(lanes.data[last]) < cutoff) {
+                        if(last != MAX && moving_average(ctsc, lanes.tscs[last].tv, lanes.tscs[last].ma) > cutoff) {
                                 thread$ * t = try_pop(cltr, last __STATS(, __tls_stats()->ready.pop.help));
                                 if(t) return t;
+                        }
                         else {
                                 proc->rdq.last = -1u;
+                                proc->rdq.last = MAX;
+                        }
+                }
 …
                 processor * const proc = kernelTLS().this_processor;
                 unsigned last = proc->rdq.last;
                 if(last != -1u) {
+                if(last != MAX) {
                         struct thread$ * t = try_pop(cltr, last __STATS(, __tls_stats()->ready.pop.steal));
                         if(t) return t;
                         proc->rdq.last = -1u;
+                        proc->rdq.last = MAX;
+                }
 …
                 #else
                         unsigned preferred = thrd->preferred;
                         const bool external = (hint != UNPARK_LOCAL) || (!kernelTLS().this_processor) || preferred == -1u || thrd->curr_cluster != cltr;
+                        const bool external = (hint != UNPARK_LOCAL) || (!kernelTLS().this_processor) || preferred == MAX || thrd->curr_cluster != cltr;
                         /* paranoid */ verifyf(external || preferred < lanes.count, "Invalid preferred queue %u for %u lanes", preferred, lanes.count );
 …
                 processor * proc = kernelTLS().this_processor;
                 if(proc->rdq.target == -1u) {
+                if(proc->rdq.target == MAX) {
                         unsigned long long min = ts(lanes.data[proc->rdq.id]);
                         for(int i = 0; i < READYQ_SHARD_FACTOR; i++) {
 …
                 else {
                         unsigned target = proc->rdq.target;
                         proc->rdq.target = -1u;
+                        proc->rdq.target = MAX;
                         const unsigned long long bias = 0; //2_500_000_000;
                         const unsigned long long cutoff = proc->rdq.cutoff > bias ? proc->rdq.cutoff - bias : proc->rdq.cutoff;
 …
 // try to pop from a lane given by index w
 static inline struct thread$ * try_pop(struct cluster * cltr, unsigned w __STATS(, __stats_readyQ_pop_t & stats)) with (cltr->ready_queue) {
+        /* paranoid */ verify( w < lanes.count );
         __STATS( stats.attempt++; )
 …
         // Actually pop the list
         struct thread$ * thrd;
+        unsigned long long tsc_before = ts(lane);
+        #if defined(USE_AWARE_STEALING) || defined(USE_WORK_STEALING) || defined(USE_CPU_WORK_STEALING)
+                unsigned long long tsc_before = ts(lane);
+        #endif
         unsigned long long tsv;
         [thrd, tsv] = pop(lane);
 …
         __STATS( stats.success++; )
+        #if defined(USE_WORK_STEALING) || defined(USE_CPU_WORK_STEALING)
+                unsigned long long now = rdtscl();
+                lanes.tscs[w].tv = tsv;
+                lanes.tscs[w].ma = moving_average(now > tsc_before ? now - tsc_before : 0, lanes.tscs[w].ma);
+        #if defined(USE_AWARE_STEALING) || defined(USE_WORK_STEALING) || defined(USE_CPU_WORK_STEALING)
+                if (tsv != MAX) {
+                        unsigned long long now = rdtscl();
+                        unsigned long long pma = __atomic_load_n(&lanes.tscs[w].ma, __ATOMIC_RELAXED);
+                        __atomic_store_n(&lanes.tscs[w].tv, tsv, __ATOMIC_RELAXED);
+                        __atomic_store_n(&lanes.tscs[w].ma, moving_average(now, tsc_before, pma), __ATOMIC_RELAXED);
+                }
         #endif
         #if defined(USE_CPU_WORK_STEALING)
+        #if defined(USE_AWARE_STEALING) || defined(USE_CPU_WORK_STEALING)
                 thrd->preferred = w / READYQ_SHARD_FACTOR;
         #else
 …
                 /* paranoid */ verifyf( it, "Unexpected null iterator, at index %u of %u\n", i, count);
                 it->rdq.id = value;
                 it->rdq.target = -1u;
+                it->rdq.target = MAX;
                 value += READYQ_SHARD_FACTOR;
                 it = &(*it)`next;
 …
 static void fix_times( struct cluster * cltr ) with( cltr->ready_queue ) {
         #if defined(USE_WORK_STEALING)
+        #if defined(USE_AWARE_STEALING) || defined(USE_WORK_STEALING)
                 lanes.tscs = alloc(lanes.count, lanes.tscs`realloc);
                 for(i; lanes.count) {
+                        unsigned long long tsc1 = ts(lanes.data[i]);
+                        unsigned long long tsc2 = rdtscl();
+                        lanes.tscs[i].tv = min(tsc1, tsc2);
+                        lanes.tscs[i].tv = rdtscl();
+                        lanes.tscs[i].ma = 0;
+                }
         #endif
 …
                         // Update original
                         lanes.count = ncount;
+                        lanes.caches = alloc( target, lanes.caches`realloc );
+                }
 …
                                 fix(lanes.data[idx]);
+                        }
+                        lanes.caches = alloc( target, lanes.caches`realloc );
+                }
                 fix_times(cltr);
                 reassign_cltr_id(cltr);

libcfa/src/concurrency/thread.cfa

-              r21a5bfb7
+              r175f9f4
 // Created On       : Tue Jan 17 12:27:26 2017
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Wed Dec  4 09:17:49 2019
 // Update Count     : 9
+// Last Modified On : Thu Jan 13 20:11:55 2022
+// Update Count     : 42
 //
 …
 #include "invoke.h"
 uint64_t thread_rand();
+extern uint32_t __global_random_seed;
 //-----------------------------------------------------------------------------
 // Thread ctors and dtors
 void ?{}(thread$ & this, const char * const name, cluster & cl, void * storage, size_t storageSize ) with( this ) {
+void ?{}( thread$ & this, const char * const name, cluster & cl, void * storage, size_t storageSize ) with( this ) {
         context{ 0p, 0p };
         self_cor{ name, storage, storageSize };
 …
         preferred = ready_queue_new_preferred();
         last_proc = 0p;
+        random_state = __global_random_seed;
         #if defined( __CFA_WITH_VERIFY__ )
                 canary = 0x0D15EA5E0D15EA5Ep;
 …
+}
+uint64_t thread_rand() { // Returns one value from __tls_rand(), obtained while interrupts are disabled.
+        disable_interrupts(); // NOTE(review): presumably needed because __tls_rand() uses kernelTLS() state -- confirm
+        uint64_t ret = __tls_rand();
+        enable_interrupts(); // re-enabled before returning, so the value is computed but not returned under the disabled section
+        return ret;
+}
+//-----------------------------------------------------------------------------
+#define GENERATOR LCG
+void set_seed( uint32_t seed ) { // Store 'seed' in both __global_random_seed and the calling thread's random_state.
+        active_thread()->random_state = __global_random_seed = seed;
+        GENERATOR( active_thread()->random_state ); // result discarded; presumably advances the state one step past the raw seed -- confirm against GENERATOR (defined as LCG)
+} // set_seed
+uint32_t prng( void ) { return GENERATOR( active_thread()->random_state ); } // applies GENERATOR to the calling thread's random_state and returns the result; range [0,UINT_MAX]
 // Local Variables: //

Note: See TracChangeset for help on using the changeset viewer.

Download in other formats: