Context Navigation

Reverse Diff

ready_queue.cfa [b808625:fc59df78]

File:

: 1 edited

libcfa/src/concurrency/ready_queue.cfa (modified) (26 diffs)

Legend:

: Unmodified
: Added
: Removed

libcfa/src/concurrency/ready_queue.cfa

-              rb808625
+              rfc59df78
 // #define __CFA_DEBUG_PRINT_READY_QUEUE__
+// #define USE_MPSC
 #define USE_RELAXED_FIFO
 …
         this.alloc = 0;
         this.ready = 0;
+        this.lock  = false;
         this.data  = alloc(this.max);
+        this.write_lock  = false;
+        /*paranoid*/ verify( 0 == (((uintptr_t)(this.data    )) % 64) );
+        /*paranoid*/ verify( 0 == (((uintptr_t)(this.data + 1)) % 64) );
         /*paranoid*/ verify(__atomic_is_lock_free(sizeof(this.alloc), &this.alloc));
         /*paranoid*/ verify(__atomic_is_lock_free(sizeof(this.ready), &this.ready));
 …
+}
+void ?{}( __scheduler_lock_id_t & this, __processor_id_t * proc ) {
+        this.handle = proc;
+        this.lock   = false;
+        #ifdef __CFA_WITH_VERIFY__
+                this.owned  = false;
+        #endif
+}
 //=======================================================================
 // Lock-Free registering/unregistering of threads
 unsigned register_proc_id( void ) with(*__scheduler_lock) {
+void register_proc_id( struct __processor_id_t * proc ) with(*__scheduler_lock) {
         __cfadbg_print_safe(ready_queue, "Kernel : Registering proc %p for RW-Lock\n", proc);
-        bool * handle = (bool *)&kernelTLS().sched_lock;
         // Step - 1 : check if there is already space in the data
 …
         // Check among all the ready
         for(uint_fast32_t i = 0; i < s; i++) {
+                bool * volatile * cell = (bool * volatile *)&data[i]; // Cforall is bugged and the double volatiles causes problems
+                /* paranoid */ verify( handle != *cell );
+                bool * null = 0p; // Re-write every loop since compare thrashes it
+                if( __atomic_load_n(cell, (int)__ATOMIC_RELAXED) == null
+                        && __atomic_compare_exchange_n( cell, &null, handle, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+                        /* paranoid */ verify(i < ready);
+                        /* paranoid */ verify( (kernelTLS().sched_id = i, true) );
+                        return i;
+                __processor_id_t * null = 0p; // Re-write every loop since compare thrashes it
+                if( __atomic_load_n(&data[i].handle, (int)__ATOMIC_RELAXED) == null
+                        && __atomic_compare_exchange_n( &data[i].handle, &null, proc, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+                        /*paranoid*/ verify(i < ready);
+                        /*paranoid*/ verify(0 == (__alignof__(data[i]) % cache_line_size));
+                        /*paranoid*/ verify((((uintptr_t)&data[i]) % cache_line_size) == 0);
+                        proc->id = i;
+                }
+        }
 …
         // Step - 3 : Mark space as used and then publish it.
+        data[n] = handle;
+        __scheduler_lock_id_t * storage = (__scheduler_lock_id_t *)&data[n];
+        (*storage){ proc };
         while() {
                 unsigned copy = n;
 …
         // Return new spot.
+        /* paranoid */ verify(n < ready);
+        /* paranoid */ verify( (kernelTLS().sched_id = n, true) );
+        return n;
+}
+void unregister_proc_id( unsigned id ) with(*__scheduler_lock) {
+        /* paranoid */ verify(id < ready);
+        /* paranoid */ verify(id == kernelTLS().sched_id);
+        /* paranoid */ verify(data[id] == &kernelTLS().sched_lock);
+        bool * volatile * cell = (bool * volatile *)&data[id]; // Cforall is bugged and the double volatiles causes problems
+        __atomic_store_n(cell, 0p, __ATOMIC_RELEASE);
+        /*paranoid*/ verify(n < ready);
+        /*paranoid*/ verify(__alignof__(data[n]) == (2 * cache_line_size));
+        /*paranoid*/ verify((((uintptr_t)&data[n]) % cache_line_size) == 0);
+        proc->id = n;
+}
+void unregister_proc_id( struct __processor_id_t * proc ) with(*__scheduler_lock) {
+        unsigned id = proc->id;
+        /*paranoid*/ verify(id < ready);
+        /*paranoid*/ verify(proc == __atomic_load_n(&data[id].handle, __ATOMIC_RELAXED));
+        __atomic_store_n(&data[id].handle, 0p, __ATOMIC_RELEASE);
         __cfadbg_print_safe(ready_queue, "Kernel : Unregister proc %p\n", proc);
 …
 uint_fast32_t ready_mutate_lock( void ) with(*__scheduler_lock) {
         /* paranoid */ verify( ! __preemption_enabled() );
-        /* paranoid */ verify( ! kernelTLS().sched_lock );
         // Step 1 : lock global lock
         // It is needed to avoid processors that register mid Critical-Section
         //   to simply lock their own lock and enter.
         __atomic_acquire( &write_lock );
+        __atomic_acquire( &lock );
         // Step 2 : lock per-proc lock
 …
         uint_fast32_t s = ready;
         for(uint_fast32_t i = 0; i < s; i++) {
+                volatile bool * llock = data[i];
+                if(llock) __atomic_acquire( llock );
+                __atomic_acquire( &data[i].lock );
+        }
 …
         // Alternative solution : return s in write_lock and pass it to write_unlock
         for(uint_fast32_t i = 0; i < last_s; i++) {
                 volatile bool * llock = data[i];
                 if(llock) __atomic_store_n(llock, (bool)false, __ATOMIC_RELEASE);
+                verify(data[i].lock);
+                __atomic_store_n(&data[i].lock, (bool)false, __ATOMIC_RELEASE);
+        }
         // Step 2 : release global lock
         /*paranoid*/ assert(true == write_lock);
         __atomic_store_n(&write_lock, (bool)false, __ATOMIC_RELEASE);
+        /*paranoid*/ assert(true == lock);
+        __atomic_store_n(&lock, (bool)false, __ATOMIC_RELEASE);
         /* paranoid */ verify( ! __preemption_enabled() );
 …
+        }
         __attribute__((hot)) void push(struct cluster * cltr, struct $thread * thrd, bool push_local) with (cltr->ready_queue) {
+        __attribute__((hot)) void push(struct cluster * cltr, struct $thread * thrd) with (cltr->ready_queue) {
                 __cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr);
                 const bool external = !push_local || (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr);
+                const bool external = (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr);
                 /* paranoid */ verify(external || kernelTLS().this_processor->rdq.id < lanes.count );
+                // write timestamp
+                thrd->link.ts = rdtscl();
                 bool local;
 …
                         #endif
+                #if defined(USE_MPSC)
+                        // mpsc always succeeds
+                } while( false );
+                #else
                         // If we can't lock it retry
                 } while( !__atomic_try_acquire( &lanes.data[i].lock ) );
+                #endif
                 // Actually push it
                 push(lanes.data[i], thrd);
+                // Unlock and return
+                __atomic_unlock( &lanes.data[i].lock );
+                #if !defined(USE_MPSC)
+                        // Unlock and return
+                        __atomic_unlock( &lanes.data[i].lock );
+                #endif
                 // Mark the current index in the tls rng instance as having an item
 …
 #endif
 #if defined(USE_WORK_STEALING)
         __attribute__((hot)) void push(struct cluster * cltr, struct $thread * thrd, bool push_local) with (cltr->ready_queue) {
+        __attribute__((hot)) void push(struct cluster * cltr, struct $thread * thrd) with (cltr->ready_queue) {
                 __cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr);
+                // #define USE_PREFERRED
+                #if !defined(USE_PREFERRED)
+                const bool external = !push_local || (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr);
+                const bool external = (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr);
                 /* paranoid */ verify(external || kernelTLS().this_processor->rdq.id < lanes.count );
+                #else
+                        unsigned preferred = thrd->preferred;
+                        const bool external = push_local || (!kernelTLS().this_processor) || preferred == -1u || thrd->curr_cluster != cltr;
+                        /* paranoid */ verifyf(external || preferred < lanes.count, "Invalid preferred queue %u for %u lanes", preferred, lanes.count );
+                        unsigned r = preferred % READYQ_SHARD_FACTOR;
+                        const unsigned start = preferred - r;
+                #endif
+                // write timestamp
+                thrd->link.ts = rdtscl();
                 // Try to pick a lane and lock it
 …
+                        }
                         else {
+                                #if !defined(USE_PREFERRED)
+                                        processor * proc = kernelTLS().this_processor;
+                                        unsigned r = proc->rdq.its++;
+                                        i =  proc->rdq.id + (r % READYQ_SHARD_FACTOR);
+                                #else
+                                        i = start + (r++ % READYQ_SHARD_FACTOR);
+                                #endif
+                                processor * proc = kernelTLS().this_processor;
+                                unsigned r = proc->rdq.its++;
+                                i =  proc->rdq.id + (r % READYQ_SHARD_FACTOR);
+                        }
+                #if defined(USE_MPSC)
+                        // mpsc always succeeds
+                } while( false );
+                #else
                         // If we can't lock it retry
                 } while( !__atomic_try_acquire( &lanes.data[i].lock ) );
+                #endif
                 // Actually push it
                 push(lanes.data[i], thrd);
+                // Unlock and return
+                __atomic_unlock( &lanes.data[i].lock );
+                #if !defined(USE_MPSC)
+                        // Unlock and return
+                        __atomic_unlock( &lanes.data[i].lock );
+                #endif
                 #if !defined(__CFA_NO_STATISTICS__)
 …
                 if(proc->rdq.target == -1u) {
-                        unsigned long long min = ts(lanes.data[proc->rdq.id]);
-                        for(int i = 0; i < READYQ_SHARD_FACTOR; i++) {
-                                unsigned long long tsc = ts(lanes.data[proc->rdq.id + i]);
-                                if(tsc < min) min = tsc;
+                        }
-                        proc->rdq.cutoff = min;
                         proc->rdq.target = __tls_rand() % lanes.count;
+                        unsigned it1  = proc->rdq.itr;
+                        unsigned it2  = proc->rdq.itr + 1;
+                        unsigned idx1 = proc->rdq.id + (it1 % READYQ_SHARD_FACTOR);
+                        unsigned idx2 = proc->rdq.id + (it2 % READYQ_SHARD_FACTOR);
+                        unsigned long long tsc1 = ts(lanes.data[idx1]);
+                        unsigned long long tsc2 = ts(lanes.data[idx2]);
+                        proc->rdq.cutoff = min(tsc1, tsc2);
+                        if(proc->rdq.cutoff == 0) proc->rdq.cutoff = -1ull;
+                }
                 else {
                         unsigned target = proc->rdq.target;
                         proc->rdq.target = -1u;
+                        const unsigned long long bias = 0; //2_500_000_000;
+                        const unsigned long long cutoff = proc->rdq.cutoff > bias ? proc->rdq.cutoff - bias : proc->rdq.cutoff;
+                        if(lanes.tscs[target].tv < cutoff && ts(lanes.data[target]) < cutoff) {
+                        if(lanes.tscs[target].tv < proc->rdq.cutoff) {
                                 $thread * t = try_pop(cltr, target __STATS(, __tls_stats()->ready.pop.help));
                                 if(t) return t;
 …
                 for(READYQ_SHARD_FACTOR) {
                         unsigned i = proc->rdq.id + (proc->rdq.itr++ % READYQ_SHARD_FACTOR);
+                        unsigned i = proc->rdq.id + (--proc->rdq.itr % READYQ_SHARD_FACTOR);
                         if($thread * t = try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.local))) return t;
+                }
 …
         // If list looks empty retry
         if( is_empty(lane) ) {
+                __STATS( stats.espec++; )
                 return 0p;
+        }
 …
         // If we can't get the lock retry
         if( !__atomic_try_acquire(&lane.lock) ) {
+                __STATS( stats.elock++; )
                 return 0p;
+        }
 …
         if( is_empty(lane) ) {
                 __atomic_unlock(&lane.lock);
+                __STATS( stats.eempty++; )
                 return 0p;
+        }
 …
         // Actually pop the list
         struct $thread * thrd;
+        unsigned long long tsv;
+        [thrd, tsv] = pop(lane);
+        thrd = pop(lane);
         /* paranoid */ verify(thrd);
-        /* paranoid */ verify(tsv);
         /* paranoid */ verify(lane.lock);
 …
         #if defined(USE_WORK_STEALING)
                 lanes.tscs[w].tv = tsv;
+                lanes.tscs[w].tv = thrd->link.ts;
         #endif
-        thrd->preferred = w;
         // return the popped thread
 …
 // Check that all the intrusive queues in the data structure are still consistent
 static void check( __ready_queue_t & q ) with (q) {
         #if defined(__CFA_WITH_VERIFY__)
+        #if defined(__CFA_WITH_VERIFY__) && !defined(USE_MPSC)
+                {
                         for( idx ; lanes.count ) {
 …
                                 assert(!lanes.data[idx].lock);
+                                        if(is_empty(sl)) {
+                                                assert( sl.anchor.next == 0p );
+                                                assert( sl.anchor.ts   == 0  );
+                                                assert( mock_head(sl)  == sl.prev );
+                                        } else {
+                                                assert( sl.anchor.next != 0p );
+                                                assert( sl.anchor.ts   != 0  );
+                                                assert( mock_head(sl)  != sl.prev );
+                                        }
+                                assert(head(sl)->link.prev == 0p );
+                                assert(head(sl)->link.next->link.prev == head(sl) );
+                                assert(tail(sl)->link.next == 0p );
+                                assert(tail(sl)->link.prev->link.next == tail(sl) );
+                                if(is_empty(sl)) {
+                                        assert(tail(sl)->link.prev == head(sl));
+                                        assert(head(sl)->link.next == tail(sl));
+                                } else {
+                                        assert(tail(sl)->link.prev != head(sl));
+                                        assert(head(sl)->link.next != tail(sl));
+                                }
+                        }
+                }
 …
 // fixes the list so that the pointers back to anchors aren't left dangling
 static inline void fix(__intrusive_lane_t & ll) {
+                        if(is_empty(ll)) {
+                                verify(ll.anchor.next == 0p);
+                                ll.prev = mock_head(ll);
+                        }
+}
+static void assign_list(unsigned & value, dlist(processor) & list, unsigned count) {
+        #if !defined(USE_MPSC)
+                // if the list is not empty then follow the pointer and fix its reverse
+                if(!is_empty(ll)) {
+                        head(ll)->link.next->link.prev = head(ll);
+                        tail(ll)->link.prev->link.next = tail(ll);
+                }
+                // Otherwise just reset the list
+                else {
+                        verify(tail(ll)->link.next == 0p);
+                        tail(ll)->link.prev = head(ll);
+                        head(ll)->link.next = tail(ll);
+                        verify(head(ll)->link.prev == 0p);
+                }
+        #endif
+}
+static void assign_list(unsigned & value, dlist(processor, processor) & list, unsigned count) {
         processor * it = &list`first;
         for(unsigned i = 0; i < count; i++) {
 …
                 lanes.tscs = alloc(lanes.count, lanes.tscs`realloc);
                 for(i; lanes.count) {
+                        unsigned long long tsc = ts(lanes.data[i]);
+                        lanes.tscs[i].tv = tsc != 0 ? tsc : rdtscl();
+                        lanes.tscs[i].tv = ts(lanes.data[i]);
+                }
         #endif
 …
                         while(!is_empty(lanes.data[idx])) {
                                 struct $thread * thrd;
+                                unsigned long long _;
+                                [thrd, _] = pop(lanes.data[idx]);
+                                push(cltr, thrd, true);
+                                thrd = pop(lanes.data[idx]);
+                                push(cltr, thrd);
                                 // for printing count the number of displaced threads
 …
         /* paranoid */ verify( ready_mutate_islocked() );
+}
-#if !defined(__CFA_NO_STATISTICS__)
-        unsigned cnt(const __ready_queue_t & this, unsigned idx) {
-                /* paranoid */ verify(this.lanes.count > idx);
-                return this.lanes.data[idx].cnt;
+        }
-#endif

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changes in libcfa/src/concurrency/ready_queue.cfa [b808625:fc59df78]

Legend:

libcfa/src/concurrency/ready_queue.cfa

Download in other formats: