Timestamp: Apr 29, 2021, 4:26:25 PM (3 years ago)
Author: Thierry Delisle <tdelisle@…>
Branches: ADT, arm-eh, ast-experimental, enum, forall-pointer-decay, jacob/cs343-translation, master, new-ast-unique-expr, pthread-emulation, qualifiedEnum
Children: 3eb55f98
Parents: b2fc7ad9
Message: Changed RW lock to avoid hitting the global array on schedule.
Location: libcfa/src/concurrency
Files: 7 edited
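
Background for the diffs below: the scheduler's ready-queue reader-writer lock previously kept one lock cell per processor in a global array (__scheduler_lock_id_t, indexed through kernelTLS().this_proc_id->id), so every reader-side acquire on the schedule path read and wrote that shared array. After this change each processor spins on a thread-local sched_lock flag, and the global array only holds pointers to those flags for the (rare) writer to walk. What follows is a minimal, stand-alone C sketch of that before/after shape, not the CFA sources; MAX_PROCS, cells, my_id, handles and the bare spin loops are assumptions of the sketch.

    #include <stdbool.h>
    #include <stddef.h>

    #define MAX_PROCS 128                               /* hypothetical capacity */

    volatile bool write_lock;                           /* writer gate, present in both versions */

    /* Before: one lock cell per processor in a shared, global array. */
    volatile bool cells[MAX_PROCS];
    __thread unsigned my_id;                            /* index assigned at registration */

    void reader_lock_old(void) {
            while (__atomic_load_n(&write_lock, __ATOMIC_RELAXED)) ;              /* wait out writers */
            while (__atomic_exchange_n(&cells[my_id], true, __ATOMIC_ACQUIRE)) ;  /* hits the global array */
    }

    /* After: each processor spins on its own thread-local flag; the global
     * array only stores pointers to those flags, for the writer to walk. */
    __thread volatile bool sched_lock;
    volatile bool * volatile handles[MAX_PROCS];

    void reader_lock_new(void) {
            while (__atomic_load_n(&write_lock, __ATOMIC_RELAXED)) ;              /* wait out writers */
            while (__atomic_exchange_n(&sched_lock, true, __ATOMIC_ACQUIRE)) ;    /* thread-local only */
    }

    void writer_lock_all(void) {
            while (__atomic_exchange_n(&write_lock, true, __ATOMIC_ACQUIRE)) ;    /* take the gate */
            for (size_t i = 0; i < MAX_PROCS; i++) {                              /* then every published reader flag */
                    volatile bool * h = handles[i];
                    if (h) while (__atomic_exchange_n(h, true, __ATOMIC_ACQUIRE)) ;
            }
    }

    int main(void) {
            handles[0] = &sched_lock;                   /* registration publishes the TLS flag */
            reader_lock_new();                          /* schedule-path fast case: no shared-array access */
            __atomic_store_n(&sched_lock, false, __ATOMIC_RELEASE);
            return 0;
    }

The point of the "after" shape is that all shared-array traffic moves to the writer and the register/unregister path, while the hot schedule path only touches memory owned by the current processor.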

  • libcfa/src/concurrency/kernel.cfa

    rb2fc7ad9 rc993b15  
    163163        #if !defined(__CFA_NO_STATISTICS__)
    164164                if( this->print_halts ) {
    165                         __cfaabi_bits_print_safe( STDOUT_FILENO, "Processor : %d - %s (%p)\n", this->id, this->name, (void*)this);
     165                        __cfaabi_bits_print_safe( STDOUT_FILENO, "Processor : %d - %s (%p)\n", this->unique_id, this->name, (void*)this);
    166166                }
    167167        #endif
     
    223223                                #if !defined(__CFA_NO_STATISTICS__)
    224224                                        if(this->print_halts) {
    225                                                 __cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 0\n", this->id, rdtscl());
     225                                                __cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 0\n", this->unique_id, rdtscl());
    226226                                        }
    227227                                #endif
     
    236236                                #if !defined(__CFA_NO_STATISTICS__)
    237237                                        if(this->print_halts) {
    238                                                 __cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 1\n", this->id, rdtscl());
     238                                                __cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 1\n", this->unique_id, rdtscl());
    239239                                        }
    240240                                #endif
     
    390390
    391391        post( this->terminated );
    392 
    393392
    394393        if(this == mainProcessor) {
     
    553552static void __schedule_thread( $thread * thrd ) {
    554553        /* paranoid */ verify( ! __preemption_enabled() );
    555         /* paranoid */ verify( kernelTLS().this_proc_id );
    556554        /* paranoid */ verify( ready_schedule_islocked());
    557555        /* paranoid */ verify( thrd );
     
    611609static inline $thread * __next_thread(cluster * this) with( *this ) {
    612610        /* paranoid */ verify( ! __preemption_enabled() );
    613         /* paranoid */ verify( kernelTLS().this_proc_id );
    614611
    615612        ready_schedule_lock();
     
    617614        ready_schedule_unlock();
    618615
    619         /* paranoid */ verify( kernelTLS().this_proc_id );
    620616        /* paranoid */ verify( ! __preemption_enabled() );
    621617        return thrd;
     
    625621static inline $thread * __next_thread_slow(cluster * this) with( *this ) {
    626622        /* paranoid */ verify( ! __preemption_enabled() );
    627         /* paranoid */ verify( kernelTLS().this_proc_id );
    628623
    629624        ready_schedule_lock();
     
    638633        ready_schedule_unlock();
    639634
    640         /* paranoid */ verify( kernelTLS().this_proc_id );
    641635        /* paranoid */ verify( ! __preemption_enabled() );
    642636        return thrd;
  • libcfa/src/concurrency/kernel.hfa

    rb2fc7ad9 rc993b15  
    4949
    5050// Processor id, required for scheduling threads
    51 struct __processor_id_t {
    52         unsigned id:24;
    53 
    54         #if !defined(__CFA_NO_STATISTICS__)
    55                 struct __stats_t * stats;
    56         #endif
    57 };
     51
    5852
    5953coroutine processorCtx_t {
     
    6357// Wrapper around kernel threads
    6458struct __attribute__((aligned(128))) processor {
    65         // Main state
    66         inline __processor_id_t;
    67 
    6859        // Cluster from which to get threads
    6960        struct cluster * cltr;
     
    8980        // Handle to pthreads
    9081        pthread_t kernel_thread;
     82
     83        // Unique id for the processor (not per cluster)
     84        unsigned unique_id;
    9185
    9286        struct {
  • libcfa/src/concurrency/kernel/fwd.hfa

    rb2fc7ad9 rc993b15  
    3838                        struct $thread          * volatile this_thread;
    3939                        struct processor        * volatile this_processor;
    40                         struct __processor_id_t * volatile this_proc_id;
    41                         struct __stats_t        * volatile this_stats;
     40                        volatile bool sched_lock;
    4241
    4342                        struct {
     
    5655                                uint64_t bck_seed;
    5756                        } ready_rng;
     57
     58                        struct __stats_t        * volatile this_stats;
     59
     60
     61                        #ifdef __CFA_WITH_VERIFY__
     62                                // Debug, check if the rwlock is owned for reading
     63                                bool in_sched_lock;
     64                                unsigned sched_id;
     65                        #endif
    5866                } __cfaabi_tls __attribute__ ((tls_model ( "initial-exec" )));
    5967
  • libcfa/src/concurrency/kernel/startup.cfa

    rb2fc7ad9 rc993b15  
    7777static void doregister( struct cluster & cltr );
    7878static void unregister( struct cluster & cltr );
     79static void register_tls( processor * this );
     80static void unregister_tls( processor * this );
    7981static void ?{}( $coroutine & this, current_stack_info_t * info);
    8082static void ?{}( $thread & this, current_stack_info_t * info);
     
    123125        NULL,                                                                                           // cannot use 0p
    124126        NULL,
     127        false,
     128        { 1, false, false },
     129        0,
     130        { 0, 0 },
    125131        NULL,
    126         NULL,
    127         { 1, false, false },
     132        #ifdef __CFA_WITH_VERIFY__
     133                false,
     134                0,
     135        #endif
    128136};
    129137
     
    210218        (*mainProcessor){};
    211219
     220        register_tls( mainProcessor );
     221
    212222        //initialize the global state variables
    213223        __cfaabi_tls.this_processor = mainProcessor;
    214         __cfaabi_tls.this_proc_id   = (__processor_id_t*)mainProcessor;
    215224        __cfaabi_tls.this_thread    = mainThread;
    216225
     
    273282        #endif
    274283
     284        unregister_tls( mainProcessor );
     285
    275286        // Destroy the main processor and its context in reverse order of construction
    276287        // These were manually constructed so we need manually destroy them
     
    316327        processor * proc = (processor *) arg;
    317328        __cfaabi_tls.this_processor = proc;
    318         __cfaabi_tls.this_proc_id   = (__processor_id_t*)proc;
    319329        __cfaabi_tls.this_thread    = 0p;
    320330        __cfaabi_tls.preemption_state.[enabled, disable_count] = [false, 1];
     331
     332        register_tls( proc );
     333
    321334        // SKULLDUGGERY: We want to create a context for the processor coroutine
    322335        // which is needed for the 2-step context switch. However, there is no reason
     
    355368                #endif
    356369        #endif
     370
     371        unregister_tls( proc );
    357372
    358373        return 0p;
     
    496511        #endif
    497512
    498         // Register and Lock the RWlock so no-one pushes/pops while we are changing the queue
    499         uint_fast32_t last_size = ready_mutate_register((__processor_id_t*)&this);
    500                 this.cltr->procs.total += 1u;
    501                 insert_last(this.cltr->procs.actives, this);
    502 
    503                 // Adjust the ready queue size
    504                 ready_queue_grow( cltr );
    505 
    506         // Unlock the RWlock
    507         ready_mutate_unlock( last_size );
    508 
    509513        __cfadbg_print_safe(runtime_core, "Kernel : core %p created\n", &this);
    510514}
     
    512516// Not a ctor, it just preps the destruction but should not destroy members
    513517static void deinit(processor & this) {
    514         // Lock the RWlock so no-one pushes/pops while we are changing the queue
    515         uint_fast32_t last_size = ready_mutate_lock();
    516                 this.cltr->procs.total -= 1u;
    517                 remove(this);
    518 
    519                 // Adjust the ready queue size
    520                 ready_queue_shrink( this.cltr );
    521 
    522         // Unlock the RWlock and unregister: we don't need the read_lock any more
    523         ready_mutate_unregister((__processor_id_t*)&this, last_size );
    524 
    525518        close(this.idle);
    526519}
     
    656649        cltr->nthreads -= 1;
    657650        unlock(cltr->thread_list_lock);
     651}
     652
     653static void register_tls( processor * this ) {
     654        // Register and Lock the RWlock so no-one pushes/pops while we are changing the queue
     655        uint_fast32_t last_size;
     656        [this->unique_id, last_size] = ready_mutate_register();
     657
     658                this->cltr->procs.total += 1u;
     659                insert_last(this->cltr->procs.actives, *this);
     660
     661                // Adjust the ready queue size
     662                ready_queue_grow( this->cltr );
     663
     664        // Unlock the RWlock
     665        ready_mutate_unlock( last_size );
     666}
     667
     668
     669static void unregister_tls( processor * this ) {
     670        // Lock the RWlock so no-one pushes/pops while we are changing the queue
     671        uint_fast32_t last_size = ready_mutate_lock();
     672                this->cltr->procs.total -= 1u;
     673                remove(*this);
     674
     675                // clear the cluster so nothing gets pushed to local queues
     676                cluster * cltr = this->cltr;
     677                this->cltr = 0p;
     678
     679                // Adjust the ready queue size
     680                ready_queue_shrink( cltr );
     681
     682        // Unlock the RWlock and unregister: we don't need the read_lock any more
     683        ready_mutate_unregister( this->unique_id, last_size );
    658684}
    659685
  • libcfa/src/concurrency/kernel_private.hfa

    rb2fc7ad9 rc993b15  
    2525// Scheduler
    2626
    27 struct __attribute__((aligned(128))) __scheduler_lock_id_t;
    2827
    2928extern "C" {
     
    8079// Lock-Free registering/unregistering of threads
    8180// Register a processor to a given cluster and get its unique id in return
    82 void register_proc_id( struct __processor_id_t * );
     81unsigned register_proc_id( void );
    8382
    8483// Unregister a processor from a given cluster using its id, getting back the original pointer
    85 void unregister_proc_id( struct __processor_id_t * proc );
     84void unregister_proc_id( unsigned );
    8685
    8786//=======================================================================
     
    112111}
    113112
    114 // Cells use by the reader writer lock
    115 // while not generic it only relies on a opaque pointer
    116 struct __attribute__((aligned(128))) __scheduler_lock_id_t {
    117         // Spin lock used as the underlying lock
    118         volatile bool lock;
    119 
    120         // Handle pointing to the proc owning this cell
    121         // Used for allocating cells and debugging
    122         __processor_id_t * volatile handle;
    123 
    124         #ifdef __CFA_WITH_VERIFY__
    125                 // Debug, check if this is owned for reading
    126                 bool owned;
    127         #endif
    128 };
    129 
    130 static_assert( sizeof(struct __scheduler_lock_id_t) <= __alignof(struct __scheduler_lock_id_t));
     113
     114
     115
    131116
    132117//-----------------------------------------------------------------------
     
    147132
    148133        // writer lock
    149         volatile bool lock;
     134        volatile bool write_lock;
    150135
    151136        // data pointer
    152         __scheduler_lock_id_t * data;
     137        volatile bool * volatile * data;
    153138};
    154139
     
    163148static inline void ready_schedule_lock(void) with(*__scheduler_lock) {
    164149        /* paranoid */ verify( ! __preemption_enabled() );
    165         /* paranoid */ verify( kernelTLS().this_proc_id );
    166 
    167         unsigned iproc = kernelTLS().this_proc_id->id;
    168         /*paranoid*/ verify(data[iproc].handle == kernelTLS().this_proc_id);
    169         /*paranoid*/ verify(iproc < ready);
     150        /* paranoid */ verify( ! kernelTLS().in_sched_lock );
     151        /* paranoid */ verify( data[kernelTLS().sched_id] == &kernelTLS().sched_lock );
     152        /* paranoid */ verify( !kernelTLS().this_processor || kernelTLS().this_processor->unique_id == kernelTLS().sched_id );
    170153
    171154        // Step 1 : make sure no writer are in the middle of the critical section
    172         while(__atomic_load_n(&lock, (int)__ATOMIC_RELAXED))
     155        while(__atomic_load_n(&write_lock, (int)__ATOMIC_RELAXED))
    173156                Pause();
    174157
     
    179162
    180163        // Step 2 : acquire our local lock
    181         __atomic_acquire( &data[iproc].lock );
    182         /*paranoid*/ verify(data[iproc].lock);
     164        __atomic_acquire( &kernelTLS().sched_lock );
     165        /*paranoid*/ verify(kernelTLS().sched_lock);
    183166
    184167        #ifdef __CFA_WITH_VERIFY__
    185168                // Debug, check if this is owned for reading
    186                 data[iproc].owned = true;
     169                kernelTLS().in_sched_lock = true;
    187170        #endif
    188171}
     
    190173static inline void ready_schedule_unlock(void) with(*__scheduler_lock) {
    191174        /* paranoid */ verify( ! __preemption_enabled() );
    192         /* paranoid */ verify( kernelTLS().this_proc_id );
    193 
    194         unsigned iproc = kernelTLS().this_proc_id->id;
    195         /*paranoid*/ verify(data[iproc].handle == kernelTLS().this_proc_id);
    196         /*paranoid*/ verify(iproc < ready);
    197         /*paranoid*/ verify(data[iproc].lock);
    198         /*paranoid*/ verify(data[iproc].owned);
     175        /* paranoid */ verify( data[kernelTLS().sched_id] == &kernelTLS().sched_lock );
     176        /* paranoid */ verify( !kernelTLS().this_processor || kernelTLS().this_processor->unique_id == kernelTLS().sched_id );
     177        /* paranoid */ verify( kernelTLS().sched_lock );
     178        /* paranoid */ verify( kernelTLS().in_sched_lock );
    199179        #ifdef __CFA_WITH_VERIFY__
    200180                // Debug, check if this is owned for reading
    201                 data[iproc].owned = false;
     181                kernelTLS().in_sched_lock = false;
    202182        #endif
    203         __atomic_unlock(&data[iproc].lock);
     183        __atomic_unlock(&kernelTLS().sched_lock);
    204184}
    205185
     
    207187        static inline bool ready_schedule_islocked(void) {
    208188                /* paranoid */ verify( ! __preemption_enabled() );
    209                 /*paranoid*/ verify( kernelTLS().this_proc_id );
    210                 __processor_id_t * proc = kernelTLS().this_proc_id;
    211                 return __scheduler_lock->data[proc->id].owned;
     189                /* paranoid */ verify( (!kernelTLS().in_sched_lock) || kernelTLS().sched_lock );
     190                return kernelTLS().sched_lock;
    212191        }
    213192
    214193        static inline bool ready_mutate_islocked() {
    215                 return __scheduler_lock->lock;
     194                return __scheduler_lock->write_lock;
    216195        }
    217196#endif
     
    228207// Register a processor to a given cluster and get its unique id in return
    229208// For convenience, also acquires the lock
    230 static inline uint_fast32_t ready_mutate_register( struct __processor_id_t * proc ) {
    231         register_proc_id( proc );
    232         return ready_mutate_lock();
     209static inline [unsigned, uint_fast32_t] ready_mutate_register() {
     210        unsigned id = register_proc_id();
     211        uint_fast32_t last = ready_mutate_lock();
     212        return [id, last];
    233213}
    234214
    235215// Unregister a processor from a given cluster using its id, getting back the original pointer
    236216// assumes the lock is acquired
    237 static inline void ready_mutate_unregister( struct __processor_id_t * proc, uint_fast32_t last_s ) {
     217static inline void ready_mutate_unregister( unsigned id, uint_fast32_t last_s ) {
    238218        ready_mutate_unlock( last_s );
    239         unregister_proc_id( proc );
     219        unregister_proc_id( id );
    240220}
    241221
  • libcfa/src/concurrency/preemption.cfa

    rb2fc7ad9 rc993b15  
    687687// Waits on SIGALRM and send SIGUSR1 to whom ever needs it
    688688static void * alarm_loop( __attribute__((unused)) void * args ) {
    689         __processor_id_t id;
    690         register_proc_id(&id);
    691         __cfaabi_tls.this_proc_id = &id;
    692 
     689        unsigned id = register_proc_id();
    693690
    694691        // Block sigalrms to control when they arrive
     
    749746EXIT:
    750747        __cfaabi_dbg_print_safe( "Kernel : Preemption thread stopping\n" );
    751         register_proc_id(&id);
     748        unregister_proc_id(id);
    752749
    753750        return 0p;
  • libcfa/src/concurrency/ready_queue.cfa

    rb2fc7ad9 rc993b15  
    9393        this.alloc = 0;
    9494        this.ready = 0;
    95         this.lock  = false;
    9695        this.data  = alloc(this.max);
    97 
    98         /*paranoid*/ verify( 0 == (((uintptr_t)(this.data    )) % 64) );
    99         /*paranoid*/ verify( 0 == (((uintptr_t)(this.data + 1)) % 64) );
     96        this.write_lock  = false;
     97
    10098        /*paranoid*/ verify(__atomic_is_lock_free(sizeof(this.alloc), &this.alloc));
    10199        /*paranoid*/ verify(__atomic_is_lock_free(sizeof(this.ready), &this.ready));
     
    106104}
    107105
    108 void ?{}( __scheduler_lock_id_t & this, __processor_id_t * proc ) {
    109         this.handle = proc;
    110         this.lock   = false;
    111         #ifdef __CFA_WITH_VERIFY__
    112                 this.owned  = false;
    113         #endif
    114 }
    115106
    116107//=======================================================================
    117108// Lock-Free registering/unregistering of threads
    118 void register_proc_id( struct __processor_id_t * proc ) with(*__scheduler_lock) {
     109unsigned register_proc_id( void ) with(*__scheduler_lock) {
    119110        __cfadbg_print_safe(ready_queue, "Kernel : Registering proc %p for RW-Lock\n", proc);
     111        bool * handle = (bool *)&kernelTLS().sched_lock;
    120112
    121113        // Step - 1 : check if there is already space in the data
     
    124116        // Check among all the ready
    125117        for(uint_fast32_t i = 0; i < s; i++) {
    126                 __processor_id_t * null = 0p; // Re-write every loop since compare thrashes it
    127                 if( __atomic_load_n(&data[i].handle, (int)__ATOMIC_RELAXED) == null
    128                         && __atomic_compare_exchange_n( &data[i].handle, &null, proc, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
    129                         /*paranoid*/ verify(i < ready);
    130                         /*paranoid*/ verify(0 == (__alignof__(data[i]) % cache_line_size));
    131                         /*paranoid*/ verify((((uintptr_t)&data[i]) % cache_line_size) == 0);
    132                         proc->id = i;
    133                         return;
     118                bool * volatile * cell = (bool * volatile *)&data[i]; // Cforall is bugged and the double volatiles causes problems
     119                /* paranoid */ verify( handle != *cell );
     120
     121                bool * null = 0p; // Re-write every loop since compare thrashes it
     122                if( __atomic_load_n(cell, (int)__ATOMIC_RELAXED) == null
     123                        && __atomic_compare_exchange_n( cell, &null, handle, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
     124                        /* paranoid */ verify(i < ready);
     125                        /* paranoid */ verify( (kernelTLS().sched_id = i, true) );
     126                        return i;
    134127                }
    135128        }
     
    142135
    143136        // Step - 3 : Mark space as used and then publish it.
    144         __scheduler_lock_id_t * storage = (__scheduler_lock_id_t *)&data[n];
    145         (*storage){ proc };
     137        data[n] = handle;
    146138        while() {
    147139                unsigned copy = n;
     
    155147
    156148        // Return new spot.
    157         /*paranoid*/ verify(n < ready);
    158         /*paranoid*/ verify(__alignof__(data[n]) == (2 * cache_line_size));
    159         /*paranoid*/ verify((((uintptr_t)&data[n]) % cache_line_size) == 0);
    160         proc->id = n;
    161 }
    162 
    163 void unregister_proc_id( struct __processor_id_t * proc ) with(*__scheduler_lock) {
    164         unsigned id = proc->id;
    165         /*paranoid*/ verify(id < ready);
    166         /*paranoid*/ verify(proc == __atomic_load_n(&data[id].handle, __ATOMIC_RELAXED));
    167         __atomic_store_n(&data[id].handle, 0p, __ATOMIC_RELEASE);
     149        /* paranoid */ verify(n < ready);
     150        /* paranoid */ verify( (kernelTLS().sched_id = n, true) );
     151        return n;
     152}
     153
     154void unregister_proc_id( unsigned id ) with(*__scheduler_lock) {
     155        /* paranoid */ verify(id < ready);
     156        /* paranoid */ verify(id == kernelTLS().sched_id);
     157        /* paranoid */ verify(data[id] == &kernelTLS().sched_lock);
     158
     159        bool * volatile * cell = (bool * volatile *)&data[id]; // Cforall is bugged and the double volatiles causes problems
     160
     161        __atomic_store_n(cell, 0p, __ATOMIC_RELEASE);
    168162
    169163        __cfadbg_print_safe(ready_queue, "Kernel : Unregister proc %p\n", proc);
     
    175169uint_fast32_t ready_mutate_lock( void ) with(*__scheduler_lock) {
    176170        /* paranoid */ verify( ! __preemption_enabled() );
     171        /* paranoid */ verify( ! kernelTLS().sched_lock );
    177172
    178173        // Step 1 : lock global lock
    179174        // It is needed to avoid processors that register mid Critical-Section
    180175        //   to simply lock their own lock and enter.
    181         __atomic_acquire( &lock );
     176        __atomic_acquire( &write_lock );
    182177
    183178        // Step 2 : lock per-proc lock
     
    187182        uint_fast32_t s = ready;
    188183        for(uint_fast32_t i = 0; i < s; i++) {
    189                 __atomic_acquire( &data[i].lock );
     184                volatile bool * llock = data[i];
     185                if(llock) __atomic_acquire( llock );
    190186        }
    191187
     
    204200        // Alternative solution : return s in write_lock and pass it to write_unlock
    205201        for(uint_fast32_t i = 0; i < last_s; i++) {
    206                 verify(data[i].lock);
    207                 __atomic_store_n(&data[i].lock, (bool)false, __ATOMIC_RELEASE);
     202                volatile bool * llock = data[i];
     203                if(llock) __atomic_store_n(llock, (bool)false, __ATOMIC_RELEASE);
    208204        }
    209205
    210206        // Step 2 : release global lock
    211         /*paranoid*/ assert(true == lock);
    212         __atomic_store_n(&lock, (bool)false, __ATOMIC_RELEASE);
     207        /*paranoid*/ assert(true == write_lock);
     208        __atomic_store_n(&write_lock, (bool)false, __ATOMIC_RELEASE);
    213209
    214210        /* paranoid */ verify( ! __preemption_enabled() );
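
For context on the registration half shown above: register_proc_id() now returns an index and publishes the address of the caller's thread-local sched_lock into the data array, instead of storing a __processor_id_t pointer into dedicated lock cells. Below is a simplified, stand-alone C sketch of that lock-free protocol (reuse a freed slot by CAS, otherwise claim a new index and publish it by advancing ready); the fixed CAP, the *_sketch names and the GCC builtins are assumptions of the sketch, not the CFA implementation.

    #include <stdbool.h>
    #include <stddef.h>

    enum { CAP = 128 };                                 /* assumed fixed capacity */

    volatile bool * volatile data[CAP];                 /* published reader-flag handles */
    volatile unsigned alloc;                            /* next never-used index         */
    volatile unsigned ready;                            /* published prefix of data[]    */
    __thread volatile bool sched_lock;                  /* this thread's reader flag     */

    unsigned register_proc_id_sketch(void) {
            volatile bool * handle = &sched_lock;
            unsigned s = __atomic_load_n(&ready, __ATOMIC_ACQUIRE);

            /* Step 1: try to reuse a slot freed by an earlier unregister. */
            for (unsigned i = 0; i < s; i++) {
                    volatile bool * expected = NULL;    /* reset each iteration, CAS overwrites it */
                    if (__atomic_load_n(&data[i], __ATOMIC_RELAXED) == NULL
                        && __atomic_compare_exchange_n(&data[i], &expected, handle,
                                                       false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
                            return i;
            }

            /* Step 2: no free slot, claim a brand-new index. */
            unsigned n = __atomic_fetch_add(&alloc, 1, __ATOMIC_SEQ_CST);
            data[n] = handle;

            /* Step 3: publish it by advancing ready once all earlier slots are visible. */
            unsigned expect = n;
            while (!__atomic_compare_exchange_n(&ready, &expect, n + 1,
                                                false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
                    expect = n;                         /* another registration is still publishing; retry */
            return n;
    }

    void unregister_proc_id_sketch(unsigned id) {
            /* Clearing the slot lets a later registration reuse it; the writer
             * tolerates NULL entries when it walks data[]. */
            __atomic_store_n(&data[id], (volatile bool *)NULL, __ATOMIC_RELEASE);
    }

    int main(void) {
            unsigned id = register_proc_id_sketch();
            unregister_proc_id_sketch(id);
            return 0;
    }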