Context Navigation

← Previous Changeset
Next Changeset →

Changeset 1eb239e4

Timestamp:

Aug 10, 2020, 2:43:02 PM (5 years ago)

Author:

Thierry Delisle <tdelisle@…>

Branches:

ADT, arm-eh, ast-experimental, enum, forall-pointer-decay, jacob/cs343-translation, master, new-ast, new-ast-unique-expr, pthread-emulation, qualifiedEnum

Children:

Parents:

Message:

Removed snzi and replaced it with a fast/slow path

Location:

libcfa/src/concurrency

Files:

: 7 edited

alarm.hfa (modified) (1 diff)
io/setup.cfa (modified) (1 diff)
kernel.cfa (modified) (8 diffs)
kernel.hfa (modified) (5 diffs)
kernel/startup.cfa (modified) (5 diffs)
kernel_private.hfa (modified) (2 diffs)
ready_queue.cfa (modified) (14 diffs)

Legend:

: Unmodified
: Added
: Removed

TabularUnified libcfa/src/concurrency/alarm.hfa ¶

r6c144d8	r1eb239e4
23	23	#include "time.hfa"
24	24
25		#include ~~<containers/list.hfa>~~
	25	#include "containers/list.hfa"
26	26
27	27	struct $thread;

TabularUnified libcfa/src/concurrency/io/setup.cfa ¶

r6c144d8	r1eb239e4
228	228	if( cluster_context ) {
229	229	cluster & cltr = *thrd.curr_cluster;
230		/* paranoid */ verify( cltr.~~nprocessors~~ == 0 \|\| &cltr == mainCluster );
	230	/* paranoid */ verify( cltr.idles.total == 0 \|\| &cltr == mainCluster );
231	231	/* paranoid */ verify( !ready_mutate_islocked() );
232	232

TabularUnified libcfa/src/concurrency/kernel.cfa ¶

-                      r6c144d8
+                      r1eb239e4
 // Kernel Scheduling logic
 static $thread * __next_thread(cluster * this);
 static bool __has_next_thread(cluster * this);
+static $thread * __next_thread_slow(cluster * this);
 static void __run_thread(processor * this, $thread * dst);
+static bool __wake_one(struct __processor_id_t * id, cluster * cltr);
+static void __halt(processor * this);
+bool __wake_proc(processor *);
+static void __wake_one(struct __processor_id_t * id, cluster * cltr);
+static void push  (__cluster_idles & idles, processor & proc);
+static void remove(__cluster_idles & idles, processor & proc);
+static [unsigned idle, unsigned total, * processor] query( & __cluster_idles idles );
 //=============================================================================================
 …
                 $thread * readyThread = 0p;
+                for( unsigned int spin_count = 0;; spin_count++ ) {
+                MAIN_LOOP:
+                for() {
                         // Try to get the next thread
                         readyThread = __next_thread( this->cltr );
+                        // Check if we actually found a thread
+                        if( readyThread ) {
+                                /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+                                /* paranoid */ verifyf( readyThread->state == Ready || readyThread->preempted != __NO_PREEMPTION, "state : %d, preempted %d\n", readyThread->state, readyThread->preempted);
+                                /* paranoid */ verifyf( readyThread->link.next == 0p, "Expected null got %p", readyThread->link.next );
+                                __builtin_prefetch( readyThread->context.SP );
+                                // We found a thread run it
+                                __run_thread(this, readyThread);
+                                /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+                        if( !readyThread ) {
+                                readyThread = __next_thread_slow( this->cltr );
+                        }
+                        if(__atomic_load_n(&this->do_terminate, __ATOMIC_SEQ_CST)) break;
+                        HALT:
                         if( !readyThread ) {
+                                // Block until a thread is ready
+                                __halt(this);
+                                // Don't block if we are done
+                                if( __atomic_load_n(&this->do_terminate, __ATOMIC_SEQ_CST) ) break MAIN_LOOP;
+                                #if !defined(__CFA_NO_STATISTICS__)
+                                        __tls_stats()->ready.sleep.halts++;
+                                #endif
+                                // Push self to idle stack
+                                push(this->cltr->idles, * this);
+                                // Confirm the ready-queue is empty
+                                readyThread = __next_thread_slow( this->cltr );
+                                if( readyThread ) {
+                                        // A thread was found, cancel the halt
+                                        remove(this->cltr->idles, * this);
+                                        #if !defined(__CFA_NO_STATISTICS__)
+                                                __tls_stats()->ready.sleep.cancels++;
+                                        #endif
+                                        // continue the main loop
+                                        break HALT;
+                                }
+                                #if !defined(__CFA_NO_STATISTICS__)
+                                        if(this->print_halts) {
+                                                __cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 0\n", this->id, rdtscl());
+                                        }
+                                #endif
+                                wait( this->idle );
+                                #if !defined(__CFA_NO_STATISTICS__)
+                                        if(this->print_halts) {
+                                                __cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 1\n", this->id, rdtscl());
+                                        }
+                                #endif
+                                // We were woken up, remove self from idle
+                                remove(this->cltr->idles, * this);
+                                // DON'T just proceed, start looking again
+                                continue MAIN_LOOP;
+                        }
+                        /* paranoid */ verify( readyThread );
+                        // We found a thread run it
+                        __run_thread(this, readyThread);
+                        // Are we done?
+                        if( __atomic_load_n(&this->do_terminate, __ATOMIC_SEQ_CST) ) break MAIN_LOOP;
+                }
 …
 // from the processor coroutine to the target thread
 static void __run_thread(processor * this, $thread * thrd_dst) {
+        /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+        /* paranoid */ verifyf( thrd_dst->state == Ready || thrd_dst->preempted != __NO_PREEMPTION, "state : %d, preempted %d\n", thrd_dst->state, thrd_dst->preempted);
+        /* paranoid */ verifyf( thrd_dst->link.next == 0p, "Expected null got %p", thrd_dst->link.next );
+        __builtin_prefetch( thrd_dst->context.SP );
         $coroutine * proc_cor = get_coroutine(this->runner);
 …
         proc_cor->state = Active;
         kernelTLS.this_thread = 0p;
+        /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+}
 …
         ready_schedule_lock  ( id );
                 push( thrd->curr_cluster, thrd );
+                #if !defined(__CFA_NO_STATISTICS__)
+                        bool woke =
+                #endif
+                        __wake_one(id, thrd->curr_cluster);
+                #if !defined(__CFA_NO_STATISTICS__)
+                        if(woke) __tls_stats()->ready.sleep.wakes++;
+                #endif
+                __wake_one(id, thrd->curr_cluster);
         ready_schedule_unlock( id );
 …
 // KERNEL ONLY
 static $thread * __next_thread(cluster * this) with( *this ) {
+static inline $thread * __next_thread(cluster * this) with( *this ) {
         /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
         ready_schedule_lock  ( (__processor_id_t*)kernelTLS.this_processor );
                 $thread * head = pop( this );
+                $thread * thrd = pop( this );
         ready_schedule_unlock( (__processor_id_t*)kernelTLS.this_processor );
         /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
         return head;
+        return thrd;
+}
 // KERNEL ONLY
 static bool __has_next_thread(cluster * this) with( *this ) {
+static inline $thread * __next_thread_slow(cluster * this) with( *this ) {
         /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
         ready_schedule_lock  ( (__processor_id_t*)kernelTLS.this_processor );
                 bool not_empty = query( this );
+                $thread * thrd = pop_slow( this );
         ready_schedule_unlock( (__processor_id_t*)kernelTLS.this_processor );
         /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
         return not_empty;
+        return thrd;
+}
 …
 //=============================================================================================
 // Wake a thread from the front if there are any
+static bool __wake_one(struct __processor_id_t * id, cluster * this) {
+static void __wake_one(struct __processor_id_t * id, cluster * this) {
+        /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
         /* paranoid */ verify( ready_schedule_islocked( id ) );
         // Check if there is a sleeping processor
+        processor * p = pop(this->idles);
+        processor * p;
+        unsigned idle;
+        unsigned total;
+        [idle, total, p] = query(this->idles);
         // If no one is sleeping, we are done
         if( 0p == p ) return false;
+        if( idle == 0 ) return;
         // We found a processor, wake it up
         post( p->idle );
+        return true;
+        #if !defined(__CFA_NO_STATISTICS__)
+                __tls_stats()->ready.sleep.wakes++;
+        #endif
+        /* paranoid */ verify( ready_schedule_islocked( id ) );
+        /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+        return;
+}
 // Unconditionally wake a thread
 bool __wake_proc(processor * this) {
+void __wake_proc(processor * this) {
         __cfadbg_print_safe(runtime_core, "Kernel : waking Processor %p\n", this);
 …
                 bool ret = post( this->idle );
         enable_interrupts( __cfaabi_dbg_ctx );
+        return ret;
+}
+static void __halt(processor * this) with( *this ) { // Put this processor on its cluster's idle list and block on its `idle` member until something posts it
+        if( do_terminate ) return; // a terminating processor never goes to sleep
+        #if !defined(__CFA_NO_STATISTICS__)
+                __tls_stats()->ready.sleep.halts++;
+        #endif
+        // Push self to queue
+        push(cltr->idles, *this);
+        // Make sure we don't miss a thread
+        if( __has_next_thread(cltr) ) {
+                // A thread was posted, make sure a processor is woken up
+                struct __processor_id_t *id = (struct __processor_id_t *) this; // the processor pointer is reused as the id handed to the ready-queue lock
+                ready_schedule_lock  ( id );
+                        __wake_one( id, cltr ); // __wake_one verifies that the schedule lock is held, hence the lock/unlock pair around it
+                ready_schedule_unlock( id );
+                #if !defined(__CFA_NO_STATISTICS__)
+                        __tls_stats()->ready.sleep.cancels++;
+                #endif
+        } // NOTE(review): this path does not return early, it still reaches wait( idle ) below; presumably the __wake_one above can post this very processor since it was just pushed - confirm
+        #if !defined(__CFA_NO_STATISTICS__)
+                if(this->print_halts) {
+                        __cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 0\n", this->id, rdtscl()); // trailing 0 marks the timestamp taken just before blocking
+                }
+        #endif
+        wait( idle ); // blocks here until `idle` is posted
+        #if !defined(__CFA_NO_STATISTICS__)
+                if(this->print_halts) {
+                        __cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 1\n", this->id, rdtscl()); // trailing 1 marks the timestamp taken right after waking
+                }
+        #endif
+}
+static void push  (__cluster_idles & this, processor & proc) { // Register proc as idle: bump the idle count and insert proc at the front of the idle list, all while holding the idles lock
+        /* paranoid */ verify( ! kernelTLS.preemption_state.enabled ); // checks that preemption is disabled on entry
+        lock( this );
+                this.idle++;
+                /* paranoid */ verify( this.idle <= this.total ); // there can never be more idle processors than processors
+                insert_first(this.list, proc);
+        unlock( this );
+        /* paranoid */ verify( ! kernelTLS.preemption_state.enabled ); // ... and that it is still disabled on exit
+}
+// Take proc off the idle list of `this` and decrement the idle count.
+// Both updates happen while holding the idles lock, so lock-free readers
+// (query) never observe the count and the list out of step.
+// Preemption is expected to be disabled by the caller (checked on entry and exit).
+static void remove(__cluster_idles & this, processor & proc) {
+        /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+        lock( this );
+                // `idle` is unsigned: test for underflow BEFORE decrementing,
+                // a `>= 0` test after the decrement is always true and catches nothing
+                /* paranoid */ verify( this.idle > 0 );
+                this.idle--;
+                remove(proc);
+        unlock( this );
+        /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+}
+static [unsigned idle, unsigned total, * processor] query( & __cluster_idles this ) { // Snapshot (idle count, total count, first idle processor) without taking the idles lock
+        for() { // retry until a consistent snapshot is obtained
+                uint64_t l = __atomic_load_n(&this.lock, __ATOMIC_SEQ_CST);
+                if( 1 == (l % 2) ) { Pause(); continue; } // odd value: lock() currently holds the lock, the fields may be mid-update
+                unsigned idle    = this.idle;
+                unsigned total   = this.total;
+                processor * proc = &this.list`first; // NOTE(review): what this yields for an empty list is not visible here - callers should consult `idle` before using proc
+                if(l != __atomic_load_n(&this.lock, __ATOMIC_SEQ_CST)) { Pause(); continue; } // lock value changed while reading: discard the snapshot and retry
+                return [idle, total, proc];
+        }
+}

TabularUnified libcfa/src/concurrency/kernel.hfa ¶

-                      r6c144d8
+                      r1eb239e4
 #include "coroutine.hfa"
 #include "containers/stackLockFree.hfa"
+#include "containers/list.hfa"
 extern "C" {
 …
         // Link lists fields
         Link(processor) link;
+        DLISTED_MGD_IMPL_IN(processor)
         #if !defined(__CFA_NO_STATISTICS__)
 …
 static inline void  ?{}(processor & this, const char name[]) { this{name, *mainCluster }; }
+static inline Link(processor) * ?`next( processor * this ) { return &this->link; }
+DLISTED_MGD_IMPL_OUT(processor)
 //-----------------------------------------------------------------------------
 …
 void ^?{}(__ready_queue_t & this);
+// Idle Sleep
+struct __cluster_idles {
+        // Spin lock protecting the fields below; lock() makes it odd, unlock() makes it even again, query() reads it to detect concurrent updates
+        volatile uint64_t lock;
+        // Total number of processors (updated under the lock in kernel/startup.cfa)
+        unsigned total;
+        // Number of processors currently on the idle list (push increments it, remove decrements it)
+        unsigned idle;
+        // List of idle processors; push inserts at the front
+        dlist(processor, processor) list;
+};
 //-----------------------------------------------------------------------------
 // Cluster
 …
         // List of idle processors
+        StackLF(processor) idles;
+        volatile unsigned int nprocessors;
+        __cluster_idles idles;
         // List of threads

TabularUnified libcfa/src/concurrency/kernel/startup.cfa ¶

-                      r6c144d8
+                      r1eb239e4
 //-----------------------------------------------------------------------------
 // Other Forward Declarations
 extern bool __wake_proc(processor *);
+extern void __wake_proc(processor *);
 //-----------------------------------------------------------------------------
 …
         #endif
+        int target = __atomic_add_fetch( &cltr->nprocessors, 1u, __ATOMIC_SEQ_CST );
+        lock( this.cltr->idles );
+                int target = this.cltr->idles.total += 1u;
+        unlock( this.cltr->idles );
         id = doregister((__processor_id_t*)&this);
 …
 // Not a ctor, it just preps the destruction but should not destroy members
 static void deinit(processor & this) {
+        int target = __atomic_sub_fetch( &this.cltr->nprocessors, 1u, __ATOMIC_SEQ_CST );
+        lock( this.cltr->idles );
+                int target = this.cltr->idles.total -= 1u;
+        unlock( this.cltr->idles );
         // Lock the RWlock so no-one pushes/pops while we are changing the queue
 …
                 // Adjust the ready queue size
                 ready_queue_shrink( this.cltr, target );
-                // Make sure we aren't on the idle queue
-                unsafe_remove( this.cltr->idles, &this );
         // Unlock the RWlock
 …
 //-----------------------------------------------------------------------------
 // Cluster
+static void ?{}(__cluster_idles & this) { // Constructor: start unlocked, with no processors and no idle processors
+        this.lock  = 0; // even value, i.e. the state lock() accepts as free
+        this.idle  = 0;
+        this.total = 0;
+        (this.list){}; // explicit constructor call on the idle list, with no arguments
+}
 void ?{}(cluster & this, const char name[], Duration preemption_rate, unsigned num_io, const io_context_params & io_params) with( this ) {
         this.name = name;
         this.preemption_rate = preemption_rate;
-        this.nprocessors = 0;
         ready_queue{};

TabularUnified libcfa/src/concurrency/kernel_private.hfa ¶

-                      r6c144d8
+                      r1eb239e4
 void     unregister( struct __processor_id_t * proc );
+//-----------------------------------------------------------------------
+// Cluster idle lock/unlock
+static inline void lock(__cluster_idles & this) { // Spin until the idles lock is acquired; the lock word is even when free and odd while held
+        for() {
+                uint64_t l = this.lock;
+                if(
+                        (0 == (l % 2)) // only attempt the exchange when the observed value is even (free)
+                        && __atomic_compare_exchange_n(&this.lock, &l, l + 1, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST) // strong CAS from l to l + 1 (odd); fails if the word changed since it was read
+                ) return;
+                Pause(); // lock busy or CAS lost: back off and re-read
+        }
+}
+static inline void unlock(__cluster_idles & this) { // Release the idles lock acquired by lock()
+        /* paranoid */ verify( 1 == (this.lock % 2) ); // the lock word must be odd, i.e. currently held
+        __atomic_fetch_add( &this.lock, 1, __ATOMIC_SEQ_CST ); // odd -> even: frees the lock; the value also differs from the one before lock(), which is what query() checks for
+}
 //=======================================================================
 // Reader-writer lock implementation
 …
 // pop thread from the ready queue of a cluster
 // returns 0p if empty
+// May return 0p spuriously
 __attribute__((hot)) struct $thread * pop(struct cluster * cltr);
+//-----------------------------------------------------------------------
+// pop thread from the ready queue of a cluster
+// returns 0p if empty
+// guaranteed to find any threads added before this call
+__attribute__((hot)) struct $thread * pop_slow(struct cluster * cltr);
 //-----------------------------------------------------------------------

TabularUnified libcfa/src/concurrency/ready_queue.cfa ¶

-                      r6c144d8
+                      r1eb239e4
 // #define __CFA_DEBUG_PRINT_READY_QUEUE__
+// #define USE_SNZI
 #include "bits/defs.hfa"
 #include "kernel_private.hfa"
 …
 void ^?{}(__ready_queue_t & this) with (this) {
         verify( 1 == lanes.count );
+        verify( !query( snzi ) );
+        #ifdef USE_SNZI
+                verify( !query( snzi ) );
+        #endif
         free(lanes.data);
+}
 …
 //-----------------------------------------------------------------------
 __attribute__((hot)) bool query(struct cluster * cltr) {
+        return query(cltr->ready_queue.snzi);
+        #ifdef USE_SNZI
+                return query(cltr->ready_queue.snzi);
+        #endif
+        return true;
+}
 …
         bool lane_first = push(lanes.data[i], thrd);
+        // If this lane used to be empty we need to do more
+        if(lane_first) {
+                // Check if the entire queue used to be empty
+                first = !query(snzi);
+                // Update the snzi
+                arrive( snzi, i );
+        }
+        #ifdef USE_SNZI
+                // If this lane used to be empty we need to do more
+                if(lane_first) {
+                        // Check if the entire queue used to be empty
+                        first = !query(snzi);
+                        // Update the snzi
+                        arrive( snzi, i );
+                }
+        #endif
         // Unlock and return
 …
 __attribute__((hot)) $thread * pop(struct cluster * cltr) with (cltr->ready_queue) {
         /* paranoid */ verify( lanes.count > 0 );
+        unsigned count = __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
         #if defined(BIAS)
                 // Don't bother trying locally too much
 …
         // As long as the list is not empty, try finding a lane that isn't empty and pop from it
+        while( query(snzi) ) {
+        #ifdef USE_SNZI
+                while( query(snzi) ) {
+        #else
+                for(25) {
+        #endif
                 // Pick two lists at random
                 unsigned i,j;
 …
                 #endif
                 i %= __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
                 j %= __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
+                i %= count;
+                j %= count;
                 // try popping from the 2 picked lists
 …
+}
+__attribute__((hot)) struct $thread * pop_slow(struct cluster * cltr) with (cltr->ready_queue) { // Slow-path pop: try every lane exactly once, starting from a random lane, and return the first thread obtained (0p if none)
+        /* paranoid */ verify( lanes.count > 0 );
+        unsigned count = __atomic_load_n( &lanes.count, __ATOMIC_RELAXED ); // read the lane count once so every modulo below uses the same value
+        unsigned offset = __tls_rand(); // random starting lane
+        for(i; count) { // one iteration per lane
+                unsigned idx = (offset + i) % count; // wraps around so each lane index is visited once
+                struct $thread * thrd = try_pop(cltr, idx); // presumably pops from lane idx or yields 0p - try_pop is defined outside this view
+                if(thrd) {
+                        return thrd;
+                }
+        }
+        // All lanes were empty, return 0p
+        return 0p;
+}
 //-----------------------------------------------------------------------
 // Given 2 indexes, pick the list with the oldest push and try to pop from it
 …
         /* paranoid */ verify(lane.lock);
+        // If this was the last element in the lane
+        if(emptied) {
+                depart( snzi, w );
+        }
+        #ifdef USE_SNZI
+                // If this was the last element in the lane
+                if(emptied) {
+                        depart( snzi, w );
+                }
+        #endif
         // Unlock and return
 …
                                 removed = true;
+                                if(emptied) {
+                                        depart( snzi, i );
+                                }
+                                #ifdef USE_SNZI
+                                        if(emptied) {
+                                                depart( snzi, i );
+                                        }
+                                #endif
+                        }
                 __atomic_unlock(&lane.lock);
 …
         // grow the ready queue
         with( cltr->ready_queue ) {
+                ^(snzi){};
+                #ifdef USE_SNZI
+                        ^(snzi){};
+                #endif
                 // Find new count
 …
                 lanes.count = ncount;
+                // Re-create the snzi
+                snzi{ log2( lanes.count / 8 ) };
+                for( idx; (size_t)lanes.count ) {
+                        if( !is_empty(lanes.data[idx]) ) {
+                                arrive(snzi, idx);
+                        }
+                }
+                #ifdef USE_SNZI
+                        // Re-create the snzi
+                        snzi{ log2( lanes.count / 8 ) };
+                        for( idx; (size_t)lanes.count ) {
+                                if( !is_empty(lanes.data[idx]) ) {
+                                        arrive(snzi, idx);
+                                }
+                        }
+                #endif
+        }
 …
         with( cltr->ready_queue ) {
+                ^(snzi){};
+                #ifdef USE_SNZI
+                        ^(snzi){};
+                #endif
                 // Remember old count
 …
+                }
+                // Re-create the snzi
+                snzi{ log2( lanes.count / 8 ) };
+                for( idx; (size_t)lanes.count ) {
+                        if( !is_empty(lanes.data[idx]) ) {
+                                arrive(snzi, idx);
+                        }
+                }
+                #ifdef USE_SNZI
+                        // Re-create the snzi
+                        snzi{ log2( lanes.count / 8 ) };
+                        for( idx; (size_t)lanes.count ) {
+                                if( !is_empty(lanes.data[idx]) ) {
+                                        arrive(snzi, idx);
+                                }
+                        }
+                #endif
+        }

Note: See TracChangeset for help on using the changeset viewer.

Download in other formats: