Files: 1 edited
libcfa/src/concurrency/ready_queue.cfa (modified) (11 diffs)
Legend:
- unmodified lines are shown with a leading space
- removed lines are prefixed with "-"
- added lines are prefixed with "+"
libcfa/src/concurrency/ready_queue.cfa

--- libcfa/src/concurrency/ready_queue.cfa (r6ba6846)
+++ libcfa/src/concurrency/ready_queue.cfa (rb808625)

@@ -15 +15 @@

 #define __cforall_thread__
-#define _GNU_SOURCE
-
 // #define __CFA_DEBUG_PRINT_READY_QUEUE__

@@ -22 +20 @@
 #define USE_RELAXED_FIFO
 // #define USE_WORK_STEALING
-// #define USE_CPU_WORK_STEALING

 #include "bits/defs.hfa"
-#include "device/cpu.hfa"
 #include "kernel_private.hfa"

+#define _GNU_SOURCE
 #include "stdlib.hfa"
 #include "math.hfa"

-#include <errno.h>
 #include <unistd.h>
-
-extern "C" {
-	#include <sys/syscall.h>  // __NR_xxx
-}

 #include "ready_subqueue.hfa"

@@ -54 +46 @@
 #endif

-#if defined(USE_CPU_WORK_STEALING)
-	#define READYQ_SHARD_FACTOR 2
-#elif defined(USE_RELAXED_FIFO)
+#if defined(USE_RELAXED_FIFO)
 	#define BIAS 4
 	#define READYQ_SHARD_FACTOR 4

@@ -95 +85 @@
 }

-#if defined(CFA_HAVE_LINUX_LIBRSEQ)
-	// No forward declaration needed
-	#define __kernel_rseq_register rseq_register_current_thread
-	#define __kernel_rseq_unregister rseq_unregister_current_thread
-#elif defined(CFA_HAVE_LINUX_RSEQ_H)
-	void __kernel_raw_rseq_register (void);
-	void __kernel_raw_rseq_unregister(void);
-
-	#define __kernel_rseq_register __kernel_raw_rseq_register
-	#define __kernel_rseq_unregister __kernel_raw_rseq_unregister
-#else
-	// No forward declaration needed
-	// No initialization needed
-	static inline void noop(void) {}
-
-	#define __kernel_rseq_register noop
-	#define __kernel_rseq_unregister noop
-#endif
-
 //=======================================================================
 // Cluster wide reader-writer lock

@@ -136 +107 @@
 // Lock-Free registering/unregistering of threads
 unsigned register_proc_id( void ) with(*__scheduler_lock) {
-	__kernel_rseq_register();
-
 	__cfadbg_print_safe(ready_queue, "Kernel : Registering proc %p for RW-Lock\n", proc);
 	bool * handle = (bool *)&kernelTLS().sched_lock;

@@ -192 +161 @@

 	__cfadbg_print_safe(ready_queue, "Kernel : Unregister proc %p\n", proc);
-
-	__kernel_rseq_unregister();
 }

@@ -247 +214 @@
 //=======================================================================
 void ?{}(__ready_queue_t & this) with (this) {
-	#if defined(USE_CPU_WORK_STEALING)
-		lanes.count = cpu_info.hthrd_count * READYQ_SHARD_FACTOR;
-		lanes.data = alloc( lanes.count );
-		lanes.tscs = alloc( lanes.count );
-
-		for( idx; (size_t)lanes.count ) {
-			(lanes.data[idx]){};
-			lanes.tscs[idx].tv = rdtscl();
-		}
-	#else
-		lanes.data = 0p;
-		lanes.tscs = 0p;
-		lanes.count = 0;
-	#endif
+	lanes.data = 0p;
+	lanes.tscs = 0p;
+	lanes.count = 0;
 }

 void ^?{}(__ready_queue_t & this) with (this) {
-	#if !defined(USE_CPU_WORK_STEALING)
-		verify( SEQUENTIAL_SHARD == lanes.count );
-	#endif
-
+	verify( SEQUENTIAL_SHARD == lanes.count );
 	free(lanes.data);
 	free(lanes.tscs);

@@ -273 +226 @@

 //-----------------------------------------------------------------------
-#if defined(USE_CPU_WORK_STEALING)
-	__attribute__((hot)) void push(struct cluster * cltr, struct $thread * thrd, bool push_local) with (cltr->ready_queue) {
-		__cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr);
-
-		processor * const proc = kernelTLS().this_processor;
-		const bool external = !push_local || (!proc) || (cltr != proc->cltr);
-
-		const int cpu = __kernel_getcpu();
-		/* paranoid */ verify(cpu >= 0);
-		/* paranoid */ verify(cpu < cpu_info.hthrd_count);
-		/* paranoid */ verify(cpu * READYQ_SHARD_FACTOR < lanes.count);
-
-		const cpu_map_entry_t & map = cpu_info.llc_map[cpu];
-		/* paranoid */ verify(map.start * READYQ_SHARD_FACTOR < lanes.count);
-		/* paranoid */ verify(map.self * READYQ_SHARD_FACTOR < lanes.count);
-		/* paranoid */ verifyf((map.start + map.count) * READYQ_SHARD_FACTOR <= lanes.count, "have %zu lanes but map can go up to %u", lanes.count, (map.start + map.count) * READYQ_SHARD_FACTOR);
-
-		const int start = map.self * READYQ_SHARD_FACTOR;
-		unsigned i;
-		do {
-			unsigned r;
-			if(unlikely(external)) { r = __tls_rand(); }
-			else { r = proc->rdq.its++; }
-			i = start + (r % READYQ_SHARD_FACTOR);
-			// If we can't lock it retry
-		} while( !__atomic_try_acquire( &lanes.data[i].lock ) );
-
-		// Actually push it
-		push(lanes.data[i], thrd);
-
-		// Unlock and return
-		__atomic_unlock( &lanes.data[i].lock );
-
-		#if !defined(__CFA_NO_STATISTICS__)
-			if(unlikely(external)) __atomic_fetch_add(&cltr->stats->ready.push.extrn.success, 1, __ATOMIC_RELAXED);
-			else __tls_stats()->ready.push.local.success++;
-		#endif
-
-		__cfadbg_print_safe(ready_queue, "Kernel : Pushed %p on cluster %p (idx: %u, mask %llu, first %d)\n", thrd, cltr, i, used.mask[0], lane_first);
-
-	}
-
-	// Pop from the ready queue from a given cluster
-	__attribute__((hot)) $thread * pop_fast(struct cluster * cltr) with (cltr->ready_queue) {
-		/* paranoid */ verify( lanes.count > 0 );
-		/* paranoid */ verify( kernelTLS().this_processor );
-
-		const int cpu = __kernel_getcpu();
-		/* paranoid */ verify(cpu >= 0);
-		/* paranoid */ verify(cpu < cpu_info.hthrd_count);
-		/* paranoid */ verify(cpu * READYQ_SHARD_FACTOR < lanes.count);
-
-		const cpu_map_entry_t & map = cpu_info.llc_map[cpu];
-		/* paranoid */ verify(map.start * READYQ_SHARD_FACTOR < lanes.count);
-		/* paranoid */ verify(map.self * READYQ_SHARD_FACTOR < lanes.count);
-		/* paranoid */ verifyf((map.start + map.count) * READYQ_SHARD_FACTOR <= lanes.count, "have %zu lanes but map can go up to %u", lanes.count, (map.start + map.count) * READYQ_SHARD_FACTOR);
-
-		processor * const proc = kernelTLS().this_processor;
-		const int start = map.self * READYQ_SHARD_FACTOR;
-
-		// Did we already have a help target
-		if(proc->rdq.target == -1u) {
-			// if We don't have a
-			unsigned long long min = ts(lanes.data[start]);
-			for(i; READYQ_SHARD_FACTOR) {
-				unsigned long long tsc = ts(lanes.data[start + i]);
-				if(tsc < min) min = tsc;
-			}
-			proc->rdq.cutoff = min;
-
-			/* paranoid */ verify(lanes.count < 65536); // The following code assumes max 65536 cores.
-			/* paranoid */ verify(map.count < 65536); // The following code assumes max 65536 cores.
-			uint64_t chaos = __tls_rand();
-			uint64_t high_chaos = (chaos >> 32);
-			uint64_t mid_chaos = (chaos >> 16) & 0xffff;
-			uint64_t low_chaos = chaos & 0xffff;
-
-			unsigned me = map.self;
-			unsigned cpu_chaos = map.start + (mid_chaos % map.count);
-			bool global = cpu_chaos == me;
-
-			if(global) {
-				proc->rdq.target = high_chaos % lanes.count;
-			} else {
-				proc->rdq.target = (cpu_chaos * READYQ_SHARD_FACTOR) + (low_chaos % READYQ_SHARD_FACTOR);
-				/* paranoid */ verify(proc->rdq.target >= (map.start * READYQ_SHARD_FACTOR));
-				/* paranoid */ verify(proc->rdq.target < ((map.start + map.count) * READYQ_SHARD_FACTOR));
-			}
-
-			/* paranoid */ verify(proc->rdq.target != -1u);
-		}
-		else {
-			const unsigned long long bias = 0; //2_500_000_000;
-			const unsigned long long cutoff = proc->rdq.cutoff > bias ? proc->rdq.cutoff - bias : proc->rdq.cutoff;
-			{
-				unsigned target = proc->rdq.target;
-				proc->rdq.target = -1u;
-				if(lanes.tscs[target].tv < cutoff && ts(lanes.data[target]) < cutoff) {
-					$thread * t = try_pop(cltr, target __STATS(, __tls_stats()->ready.pop.help));
-					proc->rdq.last = target;
-					if(t) return t;
-				}
-			}
-
-			unsigned last = proc->rdq.last;
-			if(last != -1u && lanes.tscs[last].tv < cutoff && ts(lanes.data[last]) < cutoff) {
-				$thread * t = try_pop(cltr, last __STATS(, __tls_stats()->ready.pop.help));
-				if(t) return t;
-			}
-			else {
-				proc->rdq.last = -1u;
-			}
-		}
-
-		for(READYQ_SHARD_FACTOR) {
-			unsigned i = start + (proc->rdq.itr++ % READYQ_SHARD_FACTOR);
-			if($thread * t = try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.local))) return t;
-		}
-
-		// All lanes where empty return 0p
-		return 0p;
-	}
-
-	__attribute__((hot)) struct $thread * pop_slow(struct cluster * cltr) with (cltr->ready_queue) {
-		processor * const proc = kernelTLS().this_processor;
-		unsigned last = proc->rdq.last;
-		if(last != -1u) {
-			struct $thread * t = try_pop(cltr, last __STATS(, __tls_stats()->ready.pop.steal));
-			if(t) return t;
-			proc->rdq.last = -1u;
-		}
-
-		unsigned i = __tls_rand() % lanes.count;
-		return try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.steal));
-	}
-	__attribute__((hot)) struct $thread * pop_search(struct cluster * cltr) {
-		return search(cltr);
-	}
-#endif
 #if defined(USE_RELAXED_FIFO)
 	//-----------------------------------------------------------------------

@@ -705 +519 @@
 	if(is_empty(sl)) {
 		assert( sl.anchor.next == 0p );
-		assert( sl.anchor.ts == -1llu);
+		assert( sl.anchor.ts == 0 );
 		assert( mock_head(sl) == sl.prev );
 	} else {
 		assert( sl.anchor.next != 0p );
-		assert( sl.anchor.ts != -1llu);
+		assert( sl.anchor.ts != 0 );
 		assert( mock_head(sl) != sl.prev );
 	}

@@ -759 +573 @@
 		lanes.tscs = alloc(lanes.count, lanes.tscs`realloc);
 		for(i; lanes.count) {
-			unsigned long long tsc1 = ts(lanes.data[i]);
-			unsigned long long tsc2 = rdtscl();
-			lanes.tscs[i].tv = min(tsc1, tsc2);
+			unsigned long long tsc = ts(lanes.data[i]);
+			lanes.tscs[i].tv = tsc != 0 ? tsc : rdtscl();
 		}
 	#endif
 }

-#if defined(USE_CPU_WORK_STEALING)
-	// ready_queue size is fixed in this case
-	void ready_queue_grow(struct cluster * cltr) {}
-	void ready_queue_shrink(struct cluster * cltr) {}
-#else
-	// Grow the ready queue
-	void ready_queue_grow(struct cluster * cltr) {
-		size_t ncount;
-		int target = cltr->procs.total;
-
-		/* paranoid */ verify( ready_mutate_islocked() );
-		__cfadbg_print_safe(ready_queue, "Kernel : Growing ready queue\n");
-
-		// Make sure that everything is consistent
-		/* paranoid */ check( cltr->ready_queue );
-
-		// grow the ready queue
-		with( cltr->ready_queue ) {
-			// Find new count
-			// Make sure we always have atleast 1 list
-			if(target >= 2) {
-				ncount = target * READYQ_SHARD_FACTOR;
-			} else {
-				ncount = SEQUENTIAL_SHARD;
-			}
-
-			// Allocate new array (uses realloc and memcpies the data)
-			lanes.data = alloc( ncount, lanes.data`realloc );
-
-			// Fix the moved data
-			for( idx; (size_t)lanes.count ) {
-				fix(lanes.data[idx]);
-			}
-
-			// Construct new data
-			for( idx; (size_t)lanes.count ~ ncount) {
-				(lanes.data[idx]){};
-			}
-
-			// Update original
-			lanes.count = ncount;
-		}
-
-		fix_times(cltr);
-
-		reassign_cltr_id(cltr);
-
-		// Make sure that everything is consistent
-		/* paranoid */ check( cltr->ready_queue );
-
-		__cfadbg_print_safe(ready_queue, "Kernel : Growing ready queue done\n");
-
-		/* paranoid */ verify( ready_mutate_islocked() );
-	}
-
-	// Shrink the ready queue
-	void ready_queue_shrink(struct cluster * cltr) {
-		/* paranoid */ verify( ready_mutate_islocked() );
-		__cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue\n");
-
-		// Make sure that everything is consistent
-		/* paranoid */ check( cltr->ready_queue );
-
-		int target = cltr->procs.total;
-
-		with( cltr->ready_queue ) {
-			// Remember old count
-			size_t ocount = lanes.count;
-
-			// Find new count
-			// Make sure we always have atleast 1 list
-			lanes.count = target >= 2 ? target * READYQ_SHARD_FACTOR: SEQUENTIAL_SHARD;
-			/* paranoid */ verify( ocount >= lanes.count );
-			/* paranoid */ verify( lanes.count == target * READYQ_SHARD_FACTOR || target < 2 );
-
-			// for printing count the number of displaced threads
-			#if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_READY_QUEUE__)
-				__attribute__((unused)) size_t displaced = 0;
-			#endif
-
-			// redistribute old data
-			for( idx; (size_t)lanes.count ~ ocount) {
-				// Lock is not strictly needed but makes checking invariants much easier
-				__attribute__((unused)) bool locked = __atomic_try_acquire(&lanes.data[idx].lock);
-				verify(locked);
-
-				// As long as we can pop from this lane to push the threads somewhere else in the queue
-				while(!is_empty(lanes.data[idx])) {
-					struct $thread * thrd;
-					unsigned long long _;
-					[thrd, _] = pop(lanes.data[idx]);
-
-					push(cltr, thrd, true);
-
-					// for printing count the number of displaced threads
-					#if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_READY_QUEUE__)
-						displaced++;
-					#endif
-				}
-
-				// Unlock the lane
-				__atomic_unlock(&lanes.data[idx].lock);
-
-				// TODO print the queue statistics here
-
-				^(lanes.data[idx]){};
-			}
-
-			__cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue displaced %zu threads\n", displaced);
-
-			// Allocate new array (uses realloc and memcpies the data)
-			lanes.data = alloc( lanes.count, lanes.data`realloc );
-
-			// Fix the moved data
-			for( idx; (size_t)lanes.count ) {
-				fix(lanes.data[idx]);
-			}
-		}
-
-		fix_times(cltr);
-
-		reassign_cltr_id(cltr);
-
-		// Make sure that everything is consistent
-		/* paranoid */ check( cltr->ready_queue );
-
-		__cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue done\n");
-		/* paranoid */ verify( ready_mutate_islocked() );
-	}
-#endif
+// Grow the ready queue
+void ready_queue_grow(struct cluster * cltr) {
+	size_t ncount;
+	int target = cltr->procs.total;
+
+	/* paranoid */ verify( ready_mutate_islocked() );
+	__cfadbg_print_safe(ready_queue, "Kernel : Growing ready queue\n");
+
+	// Make sure that everything is consistent
+	/* paranoid */ check( cltr->ready_queue );
+
+	// grow the ready queue
+	with( cltr->ready_queue ) {
+		// Find new count
+		// Make sure we always have atleast 1 list
+		if(target >= 2) {
+			ncount = target * READYQ_SHARD_FACTOR;
+		} else {
+			ncount = SEQUENTIAL_SHARD;
+		}
+
+		// Allocate new array (uses realloc and memcpies the data)
+		lanes.data = alloc( ncount, lanes.data`realloc );
+
+		// Fix the moved data
+		for( idx; (size_t)lanes.count ) {
+			fix(lanes.data[idx]);
+		}
+
+		// Construct new data
+		for( idx; (size_t)lanes.count ~ ncount) {
+			(lanes.data[idx]){};
+		}
+
+		// Update original
+		lanes.count = ncount;
+	}
+
+	fix_times(cltr);
+
+	reassign_cltr_id(cltr);
+
+	// Make sure that everything is consistent
+	/* paranoid */ check( cltr->ready_queue );
+
+	__cfadbg_print_safe(ready_queue, "Kernel : Growing ready queue done\n");
+
+	/* paranoid */ verify( ready_mutate_islocked() );
+}
+
+// Shrink the ready queue
+void ready_queue_shrink(struct cluster * cltr) {
+	/* paranoid */ verify( ready_mutate_islocked() );
+	__cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue\n");
+
+	// Make sure that everything is consistent
+	/* paranoid */ check( cltr->ready_queue );
+
+	int target = cltr->procs.total;
+
+	with( cltr->ready_queue ) {
+		// Remember old count
+		size_t ocount = lanes.count;
+
+		// Find new count
+		// Make sure we always have atleast 1 list
+		lanes.count = target >= 2 ? target * READYQ_SHARD_FACTOR: SEQUENTIAL_SHARD;
+		/* paranoid */ verify( ocount >= lanes.count );
+		/* paranoid */ verify( lanes.count == target * READYQ_SHARD_FACTOR || target < 2 );
+
+		// for printing count the number of displaced threads
+		#if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_READY_QUEUE__)
+			__attribute__((unused)) size_t displaced = 0;
+		#endif
+
+		// redistribute old data
+		for( idx; (size_t)lanes.count ~ ocount) {
+			// Lock is not strictly needed but makes checking invariants much easier
+			__attribute__((unused)) bool locked = __atomic_try_acquire(&lanes.data[idx].lock);
+			verify(locked);
+
+			// As long as we can pop from this lane to push the threads somewhere else in the queue
+			while(!is_empty(lanes.data[idx])) {
+				struct $thread * thrd;
+				unsigned long long _;
+				[thrd, _] = pop(lanes.data[idx]);
+
+				push(cltr, thrd, true);
+
+				// for printing count the number of displaced threads
+				#if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_READY_QUEUE__)
+					displaced++;
+				#endif
+			}
+
+			// Unlock the lane
+			__atomic_unlock(&lanes.data[idx].lock);
+
+			// TODO print the queue statistics here
+
+			^(lanes.data[idx]){};
+		}
+
+		__cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue displaced %zu threads\n", displaced);
+
+		// Allocate new array (uses realloc and memcpies the data)
+		lanes.data = alloc( lanes.count, lanes.data`realloc );
+
+		// Fix the moved data
+		for( idx; (size_t)lanes.count ) {
+			fix(lanes.data[idx]);
+		}
+	}
+
+	fix_times(cltr);
+
+	reassign_cltr_id(cltr);
+
+	// Make sure that everything is consistent
+	/* paranoid */ check( cltr->ready_queue );
+
+	__cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue done\n");
+	/* paranoid */ verify( ready_mutate_islocked() );
+}

 #if !defined(__CFA_NO_STATISTICS__)

@@ -903 +710 @@
 }
 #endif
-
-
-#if defined(CFA_HAVE_LINUX_LIBRSEQ)
-	// No definition needed
-#elif defined(CFA_HAVE_LINUX_RSEQ_H)
-
-	#if defined( __x86_64 ) || defined( __i386 )
-		#define RSEQ_SIG 0x53053053
-	#elif defined( __ARM_ARCH )
-		#ifdef __ARMEB__
-			#define RSEQ_SIG 0xf3def5e7 /* udf #24035 ; 0x5de3 (ARMv6+) */
-		#else
-			#define RSEQ_SIG 0xe7f5def3 /* udf #24035 ; 0x5de3 */
-		#endif
-	#endif
-
-	extern void __disable_interrupts_hard();
-	extern void __enable_interrupts_hard();
-
-	void __kernel_raw_rseq_register (void) {
-		/* paranoid */ verify( __cfaabi_rseq.cpu_id == RSEQ_CPU_ID_UNINITIALIZED );
-
-		// int ret = syscall(__NR_rseq, &__cfaabi_rseq, sizeof(struct rseq), 0, (sigset_t *)0p, _NSIG / 8);
-		int ret = syscall(__NR_rseq, &__cfaabi_rseq, sizeof(struct rseq), 0, RSEQ_SIG);
-		if(ret != 0) {
-			int e = errno;
-			switch(e) {
-			case EINVAL: abort("KERNEL ERROR: rseq register invalid argument");
-			case ENOSYS: abort("KERNEL ERROR: rseq register no supported");
-			case EFAULT: abort("KERNEL ERROR: rseq register with invalid argument");
-			case EBUSY : abort("KERNEL ERROR: rseq register already registered");
-			case EPERM : abort("KERNEL ERROR: rseq register sig argument on unregistration does not match the signature received on registration");
-			default: abort("KERNEL ERROR: rseq register unexpected return %d", e);
-			}
-		}
-	}
-
-	void __kernel_raw_rseq_unregister(void) {
-		/* paranoid */ verify( __cfaabi_rseq.cpu_id >= 0 );
-
-		// int ret = syscall(__NR_rseq, &__cfaabi_rseq, sizeof(struct rseq), RSEQ_FLAG_UNREGISTER, (sigset_t *)0p, _NSIG / 8);
-		int ret = syscall(__NR_rseq, &__cfaabi_rseq, sizeof(struct rseq), RSEQ_FLAG_UNREGISTER, RSEQ_SIG);
-		if(ret != 0) {
-			int e = errno;
-			switch(e) {
-			case EINVAL: abort("KERNEL ERROR: rseq unregister invalid argument");
-			case ENOSYS: abort("KERNEL ERROR: rseq unregister no supported");
-			case EFAULT: abort("KERNEL ERROR: rseq unregister with invalid argument");
-			case EBUSY : abort("KERNEL ERROR: rseq unregister already registered");
-			case EPERM : abort("KERNEL ERROR: rseq unregister sig argument on unregistration does not match the signature received on registration");
-			default: abort("KERNEL ERROR: rseq unregisteunexpected return %d", e);
-			}
-		}
-	}
-#else
-	// No definition needed
-#endif
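A side note on the last two hunks: the empty-lane sentinel stored in the subqueue anchor changes from -1llu to 0, which is why the assertions flip and why fix_times() can now keep a lane's own timestamp and only fall back to rdtscl() for empty lanes, instead of taking min(ts, rdtscl()) as before. The sketch below is a minimal, hypothetical illustration of that convention in plain C, not the project's Cforall code; the names lane_t, lane_tsc_t and now_ts() are invented for the example, now_ts() stands in for rdtscl(), and this fix_times() takes a plain array rather than a cluster.

// Hypothetical sketch (plain C, not the actual ready_queue.cfa code): the
// "0 means empty" timestamp convention the hunks above move to.
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

// Stand-in for rdtscl(): any monotonically increasing, never-zero clock works.
static uint64_t now_ts(void) {
	struct timespec ts;
	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec + 1;
}

// One ready-queue lane, reduced to the only field this sketch needs.
typedef struct {
	uint64_t anchor_ts;   // timestamp of the head element; 0 when the lane is empty
} lane_t;

typedef struct {
	uint64_t tv;          // cached timestamp consulted by the stealing heuristics
} lane_tsc_t;

// Mirrors the new fix_times() logic: keep a lane's own timestamp when it has
// one, otherwise stamp the cache with "now" so an empty lane is not mistaken
// for stale (or freshly updated) work.
static void fix_times(lane_t * lanes, lane_tsc_t * tscs, size_t count) {
	for (size_t i = 0; i < count; i++) {
		uint64_t tsc = lanes[i].anchor_ts;
		tscs[i].tv = (tsc != 0) ? tsc : now_ts();
	}
}

int main(void) {
	lane_t lanes[4] = { { 0 }, { 0 }, { 0 }, { 0 } };
	lane_tsc_t tscs[4];

	lanes[1].anchor_ts = now_ts();   // two lanes pretend to hold work
	lanes[3].anchor_ts = now_ts();

	fix_times(lanes, tscs, 4);

	for (size_t i = 0; i < 4; i++)
		printf("lane %zu: anchor_ts=%llu cached=%llu\n", i,
		       (unsigned long long)lanes[i].anchor_ts,
		       (unsigned long long)tscs[i].tv);
	return 0;
}

Reserving 0 for "empty" lets the cache-rebuild loop test the lane directly, where the old -1llu (maximum) sentinel forced the extra min() against the current clock to avoid treating an empty lane as the newest work available.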