Context Navigation

← Previous Change
Next Change →

ready_queue.cfa

Timestamp:

May 21, 2021, 4:48:10 PM (5 years ago)

Author:

Thierry Delisle <tdelisle@…>

Branches:

ADT, arm-eh, ast-experimental, enum, forall-pointer-decay, jacob/cs343-translation, master, new-ast-unique-expr, pthread-emulation, qualifiedEnum

Children:

f1bce515

Parents:

5407cdc (diff), 7404cdc (diff)
Note: this is a merge changeset; the changes displayed below correspond to the merge itself.
Use the (diff) links above to see all the changes relative to each parent.

Message:

Merge branch 'master' of plg.uwaterloo.ca:software/cfa/cfa-cc

File:

1 edited

libcfa/src/concurrency/ready_queue.cfa (modified) (26 diffs)

Legend:

(leading space): Unmodified
+: Added
-: Removed

libcfa/src/concurrency/ready_queue.cfa

-              r5407cdc
+              r8d66610
 // #define __CFA_DEBUG_PRINT_READY_QUEUE__
-// #define USE_MPSC
 #define USE_RELAXED_FIFO
 …
         this.alloc = 0;
         this.ready = 0;
-        this.lock  = false;
         this.data  = alloc(this.max);
+        /*paranoid*/ verify( 0 == (((uintptr_t)(this.data    )) % 64) );
+        /*paranoid*/ verify( 0 == (((uintptr_t)(this.data + 1)) % 64) );
+        this.write_lock  = false;
         /*paranoid*/ verify(__atomic_is_lock_free(sizeof(this.alloc), &this.alloc));
         /*paranoid*/ verify(__atomic_is_lock_free(sizeof(this.ready), &this.ready));
 …
+}
-void ?{}( __scheduler_lock_id_t & this, __processor_id_t * proc ) {
-        this.handle = proc;
-        this.lock   = false;
-        #ifdef __CFA_WITH_VERIFY__
-                this.owned  = false;
-        #endif
+}
 //=======================================================================
 // Lock-Free registering/unregistering of threads
 void register_proc_id( struct __processor_id_t * proc ) with(*__scheduler_lock) {
+unsigned register_proc_id( void ) with(*__scheduler_lock) {
         __cfadbg_print_safe(ready_queue, "Kernel : Registering proc %p for RW-Lock\n", proc);
+        bool * handle = (bool *)&kernelTLS().sched_lock;
         // Step - 1 : check if there is already space in the data
 …
         // Check among all the ready
         for(uint_fast32_t i = 0; i < s; i++) {
+                __processor_id_t * null = 0p; // Re-write every loop since compare thrashes it
+                if( __atomic_load_n(&data[i].handle, (int)__ATOMIC_RELAXED) == null
+                        && __atomic_compare_exchange_n( &data[i].handle, &null, proc, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+                        /*paranoid*/ verify(i < ready);
+                        /*paranoid*/ verify(0 == (__alignof__(data[i]) % cache_line_size));
+                        /*paranoid*/ verify((((uintptr_t)&data[i]) % cache_line_size) == 0);
+                        proc->id = i;
+                bool * volatile * cell = (bool * volatile *)&data[i]; // Cforall is bugged and the double volatiles causes problems
+                /* paranoid */ verify( handle != *cell );
+                bool * null = 0p; // Re-write every loop since compare thrashes it
+                if( __atomic_load_n(cell, (int)__ATOMIC_RELAXED) == null
+                        && __atomic_compare_exchange_n( cell, &null, handle, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+                        /* paranoid */ verify(i < ready);
+                        /* paranoid */ verify( (kernelTLS().sched_id = i, true) );
+                        return i;
+                }
+        }
 …
         // Step - 3 : Mark space as used and then publish it.
+        __scheduler_lock_id_t * storage = (__scheduler_lock_id_t *)&data[n];
+        (*storage){ proc };
+        data[n] = handle;
         while() {
                 unsigned copy = n;
 …
         // Return new spot.
+        /*paranoid*/ verify(n < ready);
+        /*paranoid*/ verify(__alignof__(data[n]) == (2 * cache_line_size));
+        /*paranoid*/ verify((((uintptr_t)&data[n]) % cache_line_size) == 0);
+        proc->id = n;
+}
+void unregister_proc_id( struct __processor_id_t * proc ) with(*__scheduler_lock) {
+        unsigned id = proc->id;
+        /*paranoid*/ verify(id < ready);
+        /*paranoid*/ verify(proc == __atomic_load_n(&data[id].handle, __ATOMIC_RELAXED));
+        __atomic_store_n(&data[id].handle, 0p, __ATOMIC_RELEASE);
+        /* paranoid */ verify(n < ready);
+        /* paranoid */ verify( (kernelTLS().sched_id = n, true) );
+        return n;
+}
+void unregister_proc_id( unsigned id ) with(*__scheduler_lock) {
+        /* paranoid */ verify(id < ready);
+        /* paranoid */ verify(id == kernelTLS().sched_id);
+        /* paranoid */ verify(data[id] == &kernelTLS().sched_lock);
+        bool * volatile * cell = (bool * volatile *)&data[id]; // Cforall is bugged and the double volatiles causes problems
+        __atomic_store_n(cell, 0p, __ATOMIC_RELEASE);
         __cfadbg_print_safe(ready_queue, "Kernel : Unregister proc %p\n", proc);
 …
 uint_fast32_t ready_mutate_lock( void ) with(*__scheduler_lock) {
         /* paranoid */ verify( ! __preemption_enabled() );
+        /* paranoid */ verify( ! kernelTLS().sched_lock );
         // Step 1 : lock global lock
         // It is needed to avoid processors that register mid Critical-Section
         //   to simply lock their own lock and enter.
         __atomic_acquire( &lock );
+        __atomic_acquire( &write_lock );
         // Step 2 : lock per-proc lock
 …
         uint_fast32_t s = ready;
         for(uint_fast32_t i = 0; i < s; i++) {
+                __atomic_acquire( &data[i].lock );
+                volatile bool * llock = data[i];
+                if(llock) __atomic_acquire( llock );
+        }
 …
         // Alternative solution : return s in write_lock and pass it to write_unlock
         for(uint_fast32_t i = 0; i < last_s; i++) {
                 verify(data[i].lock);
                 __atomic_store_n(&data[i].lock, (bool)false, __ATOMIC_RELEASE);
+                volatile bool * llock = data[i];
+                if(llock) __atomic_store_n(llock, (bool)false, __ATOMIC_RELEASE);
+        }
         // Step 2 : release global lock
         /*paranoid*/ assert(true == lock);
         __atomic_store_n(&lock, (bool)false, __ATOMIC_RELEASE);
+        /*paranoid*/ assert(true == write_lock);
+        __atomic_store_n(&write_lock, (bool)false, __ATOMIC_RELEASE);
         /* paranoid */ verify( ! __preemption_enabled() );
 …
+        }
         __attribute__((hot)) void push(struct cluster * cltr, struct $thread * thrd) with (cltr->ready_queue) {
+        __attribute__((hot)) void push(struct cluster * cltr, struct $thread * thrd, bool push_local) with (cltr->ready_queue) {
                 __cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr);
                 const bool external = (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr);
+                const bool external = !push_local || (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr);
                 /* paranoid */ verify(external || kernelTLS().this_processor->rdq.id < lanes.count );
-                // write timestamp
-                thrd->link.ts = rdtscl();
                 bool local;
 …
                         #endif
-                #if defined(USE_MPSC)
-                        // mpsc always succeeds
-                } while( false );
-                #else
                         // If we can't lock it retry
                 } while( !__atomic_try_acquire( &lanes.data[i].lock ) );
-                #endif
                 // Actually push it
                 push(lanes.data[i], thrd);
+                #if !defined(USE_MPSC)
+                        // Unlock and return
+                        __atomic_unlock( &lanes.data[i].lock );
+                #endif
+                // Unlock and return
+                __atomic_unlock( &lanes.data[i].lock );
                 // Mark the current index in the tls rng instance as having an item
 …
 #endif
 #if defined(USE_WORK_STEALING)
         __attribute__((hot)) void push(struct cluster * cltr, struct $thread * thrd) with (cltr->ready_queue) {
+        __attribute__((hot)) void push(struct cluster * cltr, struct $thread * thrd, bool push_local) with (cltr->ready_queue) {
                 __cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr);
+                const bool external = (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr);
+                // #define USE_PREFERRED
+                #if !defined(USE_PREFERRED)
+                const bool external = !push_local || (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr);
                 /* paranoid */ verify(external || kernelTLS().this_processor->rdq.id < lanes.count );
+                // write timestamp
+                thrd->link.ts = rdtscl();
+                #else
+                        unsigned preferred = thrd->preferred;
+                        const bool external = push_local || (!kernelTLS().this_processor) || preferred == -1u || thrd->curr_cluster != cltr;
+                        /* paranoid */ verifyf(external || preferred < lanes.count, "Invalid preferred queue %u for %u lanes", preferred, lanes.count );
+                        unsigned r = preferred % READYQ_SHARD_FACTOR;
+                        const unsigned start = preferred - r;
+                #endif
                 // Try to pick a lane and lock it
 …
+                        }
                         else {
+                                processor * proc = kernelTLS().this_processor;
+                                unsigned r = proc->rdq.its++;
+                                i =  proc->rdq.id + (r % READYQ_SHARD_FACTOR);
+                                #if !defined(USE_PREFERRED)
+                                        processor * proc = kernelTLS().this_processor;
+                                        unsigned r = proc->rdq.its++;
+                                        i =  proc->rdq.id + (r % READYQ_SHARD_FACTOR);
+                                #else
+                                        i = start + (r++ % READYQ_SHARD_FACTOR);
+                                #endif
+                        }
-                #if defined(USE_MPSC)
-                        // mpsc always succeeds
-                } while( false );
-                #else
                         // If we can't lock it retry
                 } while( !__atomic_try_acquire( &lanes.data[i].lock ) );
-                #endif
                 // Actually push it
                 push(lanes.data[i], thrd);
+                #if !defined(USE_MPSC)
+                        // Unlock and return
+                        __atomic_unlock( &lanes.data[i].lock );
+                #endif
+                // Unlock and return
+                __atomic_unlock( &lanes.data[i].lock );
                 #if !defined(__CFA_NO_STATISTICS__)
 …
                 if(proc->rdq.target == -1u) {
+                        unsigned long long min = ts(lanes.data[proc->rdq.id]);
+                        for(int i = 0; i < READYQ_SHARD_FACTOR; i++) {
+                                unsigned long long tsc = ts(lanes.data[proc->rdq.id + i]);
+                                if(tsc < min) min = tsc;
+                        }
+                        proc->rdq.cutoff = min;
                         proc->rdq.target = __tls_rand() % lanes.count;
-                        unsigned it1  = proc->rdq.itr;
-                        unsigned it2  = proc->rdq.itr + 1;
-                        unsigned idx1 = proc->rdq.id + (it1 % READYQ_SHARD_FACTOR);
-                        unsigned idx2 = proc->rdq.id + (it2 % READYQ_SHARD_FACTOR);
-                        unsigned long long tsc1 = ts(lanes.data[idx1]);
-                        unsigned long long tsc2 = ts(lanes.data[idx2]);
-                        proc->rdq.cutoff = min(tsc1, tsc2);
-                        if(proc->rdq.cutoff == 0) proc->rdq.cutoff = -1ull;
+                }
                 else {
                         unsigned target = proc->rdq.target;
                         proc->rdq.target = -1u;
+                        if(lanes.tscs[target].tv < proc->rdq.cutoff) {
+                        const unsigned long long bias = 0; //2_500_000_000;
+                        const unsigned long long cutoff = proc->rdq.cutoff > bias ? proc->rdq.cutoff - bias : proc->rdq.cutoff;
+                        if(lanes.tscs[target].tv < cutoff && ts(lanes.data[target]) < cutoff) {
                                 $thread * t = try_pop(cltr, target __STATS(, __tls_stats()->ready.pop.help));
                                 if(t) return t;
 …
                 for(READYQ_SHARD_FACTOR) {
                         unsigned i = proc->rdq.id + (--proc->rdq.itr % READYQ_SHARD_FACTOR);
+                        unsigned i = proc->rdq.id + (proc->rdq.itr++ % READYQ_SHARD_FACTOR);
                         if($thread * t = try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.local))) return t;
+                }
 …
         // If list looks empty retry
         if( is_empty(lane) ) {
-                __STATS( stats.espec++; )
                 return 0p;
+        }
 …
         // If we can't get the lock retry
         if( !__atomic_try_acquire(&lane.lock) ) {
-                __STATS( stats.elock++; )
                 return 0p;
+        }
 …
         if( is_empty(lane) ) {
                 __atomic_unlock(&lane.lock);
-                __STATS( stats.eempty++; )
                 return 0p;
+        }
 …
         // Actually pop the list
         struct $thread * thrd;
+        thrd = pop(lane);
+        unsigned long long tsv;
+        [thrd, tsv] = pop(lane);
         /* paranoid */ verify(thrd);
+        /* paranoid */ verify(tsv);
         /* paranoid */ verify(lane.lock);
 …
         #if defined(USE_WORK_STEALING)
                 lanes.tscs[w].tv = thrd->link.ts;
+                lanes.tscs[w].tv = tsv;
         #endif
+        thrd->preferred = w;
         // return the popped thread
 …
 // Check that all the intrusive queues in the data structure are still consistent
 static void check( __ready_queue_t & q ) with (q) {
         #if defined(__CFA_WITH_VERIFY__) && !defined(USE_MPSC)
+        #if defined(__CFA_WITH_VERIFY__)
+                {
                         for( idx ; lanes.count ) {
 …
                                 assert(!lanes.data[idx].lock);
+                                assert(head(sl)->link.prev == 0p );
+                                assert(head(sl)->link.next->link.prev == head(sl) );
+                                assert(tail(sl)->link.next == 0p );
+                                assert(tail(sl)->link.prev->link.next == tail(sl) );
+                                if(is_empty(sl)) {
+                                        assert(tail(sl)->link.prev == head(sl));
+                                        assert(head(sl)->link.next == tail(sl));
+                                } else {
+                                        assert(tail(sl)->link.prev != head(sl));
+                                        assert(head(sl)->link.next != tail(sl));
+                                }
+                                        if(is_empty(sl)) {
+                                                assert( sl.anchor.next == 0p );
+                                                assert( sl.anchor.ts   == 0  );
+                                                assert( mock_head(sl)  == sl.prev );
+                                        } else {
+                                                assert( sl.anchor.next != 0p );
+                                                assert( sl.anchor.ts   != 0  );
+                                                assert( mock_head(sl)  != sl.prev );
+                                        }
+                        }
+                }
 …
 // fixes the list so that the pointers back to anchors aren't left dangling
 static inline void fix(__intrusive_lane_t & ll) {
+        #if !defined(USE_MPSC)
+                // if the list is not empty then follow the pointer and fix its reverse
+                if(!is_empty(ll)) {
+                        head(ll)->link.next->link.prev = head(ll);
+                        tail(ll)->link.prev->link.next = tail(ll);
+                }
+                // Otherwise just reset the list
+                else {
+                        verify(tail(ll)->link.next == 0p);
+                        tail(ll)->link.prev = head(ll);
+                        head(ll)->link.next = tail(ll);
+                        verify(head(ll)->link.prev == 0p);
+                }
+        #endif
+}
+static void assign_list(unsigned & value, dlist(processor, processor) & list, unsigned count) {
+                        if(is_empty(ll)) {
+                                verify(ll.anchor.next == 0p);
+                                ll.prev = mock_head(ll);
+                        }
+}
+static void assign_list(unsigned & value, dlist(processor) & list, unsigned count) {
         processor * it = &list`first;
         for(unsigned i = 0; i < count; i++) {
 …
                 lanes.tscs = alloc(lanes.count, lanes.tscs`realloc);
                 for(i; lanes.count) {
+                        lanes.tscs[i].tv = ts(lanes.data[i]);
+                        unsigned long long tsc = ts(lanes.data[i]);
+                        lanes.tscs[i].tv = tsc != 0 ? tsc : rdtscl();
+                }
         #endif
 …
                         while(!is_empty(lanes.data[idx])) {
                                 struct $thread * thrd;
+                                thrd = pop(lanes.data[idx]);
+                                push(cltr, thrd);
+                                unsigned long long _;
+                                [thrd, _] = pop(lanes.data[idx]);
+                                push(cltr, thrd, true);
                                 // for printing count the number of displaced threads
 …
         /* paranoid */ verify( ready_mutate_islocked() );
+}
+#if !defined(__CFA_NO_STATISTICS__)
+        unsigned cnt(const __ready_queue_t & this, unsigned idx) {
+                /* paranoid */ verify(this.lanes.count > idx);
+                return this.lanes.data[idx].cnt;
+        }
+#endif

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 8d66610 for libcfa/src/concurrency/ready_queue.cfa

Legend:

libcfa/src/concurrency/ready_queue.cfa

Download in other formats: