- Timestamp: Dec 10, 2019, 4:23:09 PM (5 years ago)
- Branches: ADT, arm-eh, ast-experimental, enum, forall-pointer-decay, jacob/cs343-translation, master, new-ast, new-ast-unique-expr, pthread-emulation, qualifiedEnum
- Children: f80f840
- Parents: 0f9ceacb
- Location: libcfa/src
- Files: 8 edited
libcfa/src/bits/defs.hfa
```diff
 }
 
-#define __CFA_NO_BIT_TEST_AND_SET__
+// #define __CFA_NO_BIT_TEST_AND_SET__
 
 static inline bool bts(volatile unsigned long long int * target, unsigned long long int bit ) {
…
 	asm volatile(
 		"LOCK btsq %[bit], %[target]\n\t"
-		: "=@ccc" (result)
+		: "=@ccc" (result)
 		: [target] "m" (*target), [bit] "r" (bit)
 	);
```
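The `bts` helper issues an x86 `LOCK btsq` and reads the carry flag through the `"=@ccc"` output constraint; the now commented-out `__CFA_NO_BIT_TEST_AND_SET__` guard presumably selects a portable fallback path instead of the inline assembly. A minimal sketch of what such a fallback could look like with GCC/Clang atomic builtins (function names here are illustrative, not the library's):

```c
#include <stdbool.h>

// Portable bit test-and-set / test-and-reset on a 64-bit word.
// Returns the previous value of the bit, like the x86 bts/btr instructions.
static inline bool bts_fallback(volatile unsigned long long *target, unsigned long long bit) {
	unsigned long long mask = 1ull << bit;
	unsigned long long old = __atomic_fetch_or(target, mask, __ATOMIC_SEQ_CST);
	return (old & mask) != 0;
}

static inline bool btr_fallback(volatile unsigned long long *target, unsigned long long bit) {
	unsigned long long mask = 1ull << bit;
	unsigned long long old = __atomic_fetch_and(target, ~mask, __ATOMIC_SEQ_CST);
	return (old & mask) != 0;
}
```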
libcfa/src/concurrency/invoke.h
```diff
 };
 
+// Link lists fields
+// instrusive link field for threads
+struct __thread_desc_link {
+	struct thread_desc * next;
+	struct thread_desc * prev;
+	unsigned long long ts;
+};
+
 struct thread_desc {
 	// Core threading fields
…
 	// Link lists fields
 	// instrusive link field for threads
-	struct thread_desc * next;
-	struct thread_desc * prev;
-	unsigned long long ts;
+	struct __thread_desc_link link;
 
 	struct {
…
 extern "Cforall" {
 	static inline thread_desc *& get_next( thread_desc & this ) {
-		return this.next;
+		return this.link.next;
```
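The per-thread link fields are factored into a `__thread_desc_link` so that the ready-queue sentinels can embed exactly the same layout as a real thread. The usual trick with such intrusive links is to recover the owning structure from a pointer to the embedded link via `offsetof`, which is what the `head()`/`tail()` helpers in ready_queue.cfa do below. A small self-contained C illustration of that idea (the types and names here are stand-ins, not the runtime's):

```c
#include <stddef.h>
#include <stdio.h>

struct link { struct node *next, *prev; unsigned long long ts; };
struct node { int payload; struct link link; };   // link embedded in its owner

// Recover the owner from a pointer to its embedded link field.
static struct node *owner_of(struct link *l) {
	return (struct node *)((char *)l - offsetof(struct node, link));
}

int main(void) {
	struct node n = { .payload = 42 };
	struct link *l = &n.link;
	printf("%d\n", owner_of(l)->payload);   // prints 42
	return 0;
}
```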
libcfa/src/concurrency/kernel.cfa
```diff
 	self_mon.recursion = 1;
 	self_mon_p = &self_mon;
-	next = NULL;
+	link.next = 0p;
+	link.prev = 0p;
 
 	node.next = NULL;
…
 
 	// register the processor unless it's the main thread which is handled in the boot sequence
-	if(this != mainProcessor)
+	if(this != mainProcessor) {
 		this->id = doregister(this->cltr, this);
+		ready_queue_grow( this->cltr );
+	}
+
 
 	{
…
 	V( this->terminated );
 
+
 	// unregister the processor unless it's the main thread which is handled in the boot sequence
-	if(this != mainProcessor)
+	if(this != mainProcessor) {
+		ready_queue_shrink( this->cltr );
 		unregister(this->cltr, this);
+	}
 
 	__cfaabi_dbg_print_safe("Kernel : core %p terminated\n", this);
+
+	stats_tls_tally(this->cltr);
 }
 
…
 	verify( ! kernelTLS.preemption_state.enabled );
 
-	verifyf( thrd->next == NULL, "Expected null got %p", thrd->next );
+	verifyf( thrd->link.next == NULL, "Expected null got %p", thrd->link.next );
+
+
+	ready_schedule_lock(thrd->curr_cluster, kernelTLS.this_processor);
+	bool was_empty = push( thrd->curr_cluster, thrd );
+	ready_schedule_unlock(thrd->curr_cluster, kernelTLS.this_processor);
 
 	with( *thrd->curr_cluster ) {
-		ready_schedule_lock(*thrd->curr_cluster, kernelTLS.this_processor);
-		__atomic_acquire(&ready_queue.lock);
-		thrd->ts = rdtscl();
-		bool was_empty = push( ready_queue, thrd );
-		__atomic_unlock(&ready_queue.lock);
-		ready_schedule_unlock(*thrd->curr_cluster, kernelTLS.this_processor);
-
 		if(was_empty) {
 			lock (proc_list_lock __cfaabi_dbg_ctx2);
…
 			wake_fast(idle);
 		}
-
 	}
 
…
 	verify( ! kernelTLS.preemption_state.enabled );
 
-	ready_schedule_lock(*this, kernelTLS.this_processor);
-	__atomic_acquire(&ready_queue.lock);
-	thread_desc * head;
-	__attribute__((unused)) bool _;
-	[head, _] = pop( ready_queue );
-	__atomic_unlock(&ready_queue.lock);
-	ready_schedule_unlock(*this, kernelTLS.this_processor);
+	ready_schedule_lock(this, kernelTLS.this_processor);
+	thread_desc * head = pop( this );
+	ready_schedule_unlock(this, kernelTLS.this_processor);
 
 	verify( ! kernelTLS.preemption_state.enabled );
…
 	// Destroy the main processor and its context in reverse order of construction
 	// These were manually constructed so we need manually destroy them
-	^(mainProcessor->runner){};
+	void ^?{}(processor & this) with( this ) {
+		//don't join the main thread here, that wouldn't make any sense
+		__cfaabi_dbg_print_safe("Kernel : destroyed main processor context %p\n", &runner);
+	}
+
 	^(*mainProcessor){};
 
```
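The scheduling path now pushes onto the cluster-wide ready queue and keeps the "was the queue empty?" result so that an idle processor is only woken on an empty-to-non-empty transition. A toy C sketch of that design choice, with a simple atomic counter standing in for the real queue (all names here are illustrative):

```c
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_size_t ready_count;   // stand-in for the cluster ready queue

// Returns true only when this push made the queue non-empty,
// so only the first producer pays the cost of waking an idle processor.
static bool push_ready(void) {
	return atomic_fetch_add(&ready_count, 1) == 0;
}

static void wake_one_idle(void) { puts("waking an idle processor"); }

static void schedule(void) {
	if (push_ready()) {
		wake_one_idle();
	}
}

int main(void) {
	schedule();   // wakes an idle processor
	schedule();   // queue already non-empty, no wake
	return 0;
}
```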
libcfa/src/concurrency/kernel.hfa
```diff
 	// spin lock protecting the queue
 	volatile bool lock;
+	unsigned int last_id;
 
 	// anchor for the head and the tail of the queue
 	struct __sentinel_t {
-		struct thread_desc * next;
-		struct thread_desc * prev;
-		unsigned long long ts;
+		// Link lists fields
+		// instrusive link field for threads
+		// must be exactly as in thread_desc
+		__thread_desc_link link;
 	} before, after;
 
 	// Optional statistic counters
-	#ifndef __CFA_NO_SCHED_STATS__
+	#if !defined(__CFA_NO_SCHED_STATS__)
 		struct __attribute__((aligned(64))) {
 			// difference between number of push and pops
…
 void ^?{}(__intrusive_ready_queue_t & this);
 
+typedef unsigned long long __cfa_readyQ_mask_t;
+
+// enum {
+// 	__cfa_ready_queue_mask_size = (64 - sizeof(size_t)) / sizeof(size_t),
+// 	__cfa_max_ready_queues = __cfa_ready_queue_mask_size * 8 * sizeof(size_t)
+// };
+
+#define __cfa_readyQ_mask_size ((64 - sizeof(size_t)) / sizeof(__cfa_readyQ_mask_t))
+#define __cfa_max_readyQs (__cfa_readyQ_mask_size * 8 * sizeof(__cfa_readyQ_mask_t))
+
+//TODO adjust cache size to ARCHITECTURE
+struct __attribute__((aligned(128))) __ready_queue_t {
+	struct {
+		volatile size_t count;
+		volatile __cfa_readyQ_mask_t mask[ __cfa_readyQ_mask_size ];
+	} empty;
+
+	struct __attribute__((aligned(64))) {
+		__intrusive_ready_queue_t * volatile data;
+		volatile size_t count;
+	} list;
+
+	#if !defined(__CFA_NO_STATISTICS__)
+		__attribute__((aligned(64))) struct {
+			struct {
+				struct {
+					volatile size_t attempt;
+					volatile size_t success;
+				} push;
+				struct {
+					volatile size_t maskrds;
+					volatile size_t attempt;
+					volatile size_t success;
+				} pop;
+			} pick;
+			struct {
+				volatile size_t value;
+				volatile size_t count;
+			} full;
+		} global_stats;
+
+	#endif
+};
+
+void  ?{}(__ready_queue_t & this);
+void ^?{}(__ready_queue_t & this);
+
 //-----------------------------------------------------------------------------
 // Cluster
…
 
 	// Ready queue for threads
-	__intrusive_ready_queue_t ready_queue;
+	__ready_queue_t ready_queue;
 
 	// Name of the cluster
```
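With `__cfa_readyQ_mask_t` a 64-bit word and the whole `empty` block budgeted against a 64-byte unit alongside the `count` field, the macros above work out to 7 mask words and therefore 7 × 64 = 448 addressable ready lists on a typical LP64 target. A quick C check of that arithmetic (the names and the `static_assert` only restate the macro definitions under the LP64 assumption):

```c
#include <assert.h>
#include <stdio.h>

typedef unsigned long long mask_t;   // stands in for __cfa_readyQ_mask_t

#define MASK_WORDS ((64 - sizeof(size_t)) / sizeof(mask_t))  // mask words fitting beside the count
#define MAX_QUEUES (MASK_WORDS * 8 * sizeof(mask_t))         // one bit per ready list

int main(void) {
	// On an LP64 target: (64 - 8) / 8 = 7 words, 7 * 64 = 448 queues.
	static_assert(sizeof(size_t) == 8 && sizeof(mask_t) == 8, "LP64 assumption");
	printf("words = %zu, max queues = %zu\n", MASK_WORDS, MAX_QUEUES);
	return 0;
}
```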
libcfa/src/concurrency/kernel_private.hfa
```diff
 
 static inline bool __atomic_try_acquire(volatile bool * ll) {
-	return __atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST);
+	return !__atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST);
 }
 
…
 // Reader side : acquire when using the ready queue to schedule but not
 //  creating/destroying queues
-static inline void ready_schedule_lock( struct cluster & cltr, struct processor * proc) with(cltr.ready_lock) {
+static inline void ready_schedule_lock( struct cluster * cltr, struct processor * proc) with(cltr->ready_lock) {
 	unsigned iproc = proc->id;
 	/*paranoid*/ verify(data[iproc].handle == proc);
…
 }
 
-static inline void ready_schedule_unlock( struct cluster & cltr, struct processor * proc) with(cltr.ready_lock) {
+static inline void ready_schedule_unlock( struct cluster * cltr, struct processor * proc) with(cltr->ready_lock) {
 	unsigned iproc = proc->id;
 	/*paranoid*/ verify(data[iproc].handle == proc);
…
 void ready_mutate_unlock( struct cluster & cltr, uint_fast32_t );
 
-bool push(__intrusive_ready_queue_t & this, thread_desc * node);
-[thread_desc *, bool] pop(__intrusive_ready_queue_t & this);
+//=======================================================================
+// Ready-Queue API
+
+__attribute__((hot)) bool push(struct cluster * cltr, struct thread_desc * thrd);
+__attribute__((hot)) thread_desc * pop(struct cluster * cltr);
+void ready_queue_grow  (struct cluster * cltr);
+void ready_queue_shrink(struct cluster * cltr);
+
+#if !defined(__CFA_NO_STATISTICS__)
+	void stats_tls_tally(struct cluster * cltr);
+#else
+	static inline void stats_tls_tally(struct cluster * cltr) {}
+#endif
 
 // Local Variables: //
```
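The one-character fix to `__atomic_try_acquire` matters: `__atomic_exchange_n` returns the *previous* value of the flag, so the lock was actually acquired only when that previous value was false. A self-contained C sketch of the same try-lock shape (the names are illustrative):

```c
#include <stdbool.h>
#include <stdio.h>

static volatile bool lock_flag = false;

// Returns true only if we took the lock, i.e. the old value was false.
static bool try_acquire(volatile bool *ll) {
	return !__atomic_exchange_n(ll, true, __ATOMIC_SEQ_CST);
}

static void release(volatile bool *ll) {
	__atomic_store_n(ll, false, __ATOMIC_RELEASE);
}

int main(void) {
	printf("first try:  %d\n", try_acquire(&lock_flag));   // 1: lock was free
	printf("second try: %d\n", try_acquire(&lock_flag));   // 0: already held
	release(&lock_flag);
	return 0;
}
```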
libcfa/src/concurrency/monitor.cfa
```diff
 	for( thread_desc ** thrd_it = &entry_queue.head;
 		*thrd_it;
-		thrd_it = &(*thrd_it)->next
+		thrd_it = &(*thrd_it)->link.next
 	) {
 		// For each acceptable check if it matches
```
libcfa/src/concurrency/ready_queue.cfa
```diff
 }
 
+static inline unsigned rand_bit(unsigned rnum, size_t mask) {
+	verify(sizeof(mask) == 8);
+	unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0;
+	#if !defined(__BMI2__)
+		uint64_t v = mask;       // Input value to find position with rank r.
+		unsigned int r = bit + 1;// Input: bit's desired rank [1-64].
+		unsigned int s;          // Output: Resulting position of bit with rank r [1-64]
+		uint64_t a, b, c, d;     // Intermediate temporaries for bit count.
+		unsigned int t;          // Bit count temporary.
+
+		// Do a normal parallel bit count for a 64-bit integer,
+		// but store all intermediate steps.
+		a = v - ((v >> 1) & ~0UL/3);
+		b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
+		c = (b + (b >> 4)) & ~0UL/0x11;
+		d = (c + (c >> 8)) & ~0UL/0x101;
+
+
+		t = (d >> 32) + (d >> 48);
+		// Now do branchless select!
+		s = 64;
+		s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
+		t = (d >> (s - 16)) & 0xff;
+		s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
+		t = (c >> (s - 8)) & 0xf;
+		s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
+		t = (b >> (s - 4)) & 0x7;
+		s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
+		t = (a >> (s - 2)) & 0x3;
+		s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
+		t = (v >> (s - 1)) & 0x1;
+		s -= ((t - r) & 256) >> 8;
+		return s - 1;
+	#else
+		uint64_t picked = _pdep_u64(1ul << bit, mask);
+		return picked ? __builtin_ctzl(picked) : 0;
+	#endif
+}
+
+static inline __cfa_readyQ_mask_t readyQ_mask_full() { return (8 * sizeof(__cfa_readyQ_mask_t)) - 1; }
+static inline __cfa_readyQ_mask_t readyQ_mask_shit_length() { return (8 * sizeof(__cfa_readyQ_mask_t)) - __builtin_clzl(readyQ_mask_full()); }
+
+static inline [__cfa_readyQ_mask_t, __cfa_readyQ_mask_t] extract(__cfa_readyQ_mask_t idx) {
+	__cfa_readyQ_mask_t word = idx >> readyQ_mask_shit_length();
+	__cfa_readyQ_mask_t bit  = idx & readyQ_mask_full();
+	return [bit, word];
+}
+
 //=======================================================================
 // Cluster wide reader-writer lock
…
 // Intrusive Queue used by ready queue
 //=======================================================================
-static const size_t fields_offset = offsetof( thread_desc, next );
-
 // Get the head pointer (one before the first element) from the anchor
 static inline thread_desc * head(const __intrusive_ready_queue_t & this) {
 	thread_desc * rhead = (thread_desc *)(
-		(uintptr_t)( &this.before ) - fields_offset
+		(uintptr_t)( &this.before ) - offsetof( thread_desc, link )
 	);
 	/* paranoid */ verify(rhead);
…
 static inline thread_desc * tail(const __intrusive_ready_queue_t & this) {
 	thread_desc * rtail = (thread_desc *)(
-		(uintptr_t)( &this.after ) - fields_offset
+		(uintptr_t)( &this.after ) - offsetof( thread_desc, link )
 	);
 	/* paranoid */ verify(rtail);
…
 // Ctor
 void ?{}( __intrusive_ready_queue_t & this ) {
-	this.before.prev = 0p;
-	this.before.next = tail(this);
-
-	this.after .prev = head(this);
-	this.after .next = 0p;
+	this.lock = false;
+	this.last_id = -1u;
+
+	this.before.link.prev = 0p;
+	this.before.link.next = tail(this);
+	this.before.link.ts   = 0;
+
+	this.after .link.prev = head(this);
+	this.after .link.next = 0p;
+	this.after .link.ts   = 0;
+
+	#if !defined(__CFA_NO_SCHED_STATS__)
+		this.stat.diff = 0;
+		this.stat.push = 0;
+		this.stat.pop  = 0;
+	#endif
 
 	// We add a boat-load of assertions here because the anchor code is very fragile
-	/* paranoid */ verify(((uintptr_t)( head(this) ) + fields_offset) == (uintptr_t)(&this.before));
-	/* paranoid */ verify(((uintptr_t)( tail(this) ) + fields_offset) == (uintptr_t)(&this.after ));
-	/* paranoid */ verify(head(this)->prev == 0p );
-	/* paranoid */ verify(head(this)->next == tail(this) );
-	/* paranoid */ verify(tail(this)->next == 0p );
-	/* paranoid */ verify(tail(this)->prev == head(this) );
-	/* paranoid */ verify(&head(this)->prev == &this.before.prev );
-	/* paranoid */ verify(&head(this)->next == &this.before.next );
-	/* paranoid */ verify(&tail(this)->prev == &this.after.prev );
-	/* paranoid */ verify(&tail(this)->next == &this.after.next );
+	/* paranoid */ verify(((uintptr_t)( head(this) ) + offsetof( thread_desc, link )) == (uintptr_t)(&this.before));
+	/* paranoid */ verify(((uintptr_t)( tail(this) ) + offsetof( thread_desc, link )) == (uintptr_t)(&this.after ));
+	/* paranoid */ verify(head(this)->link.prev == 0p );
+	/* paranoid */ verify(head(this)->link.next == tail(this) );
+	/* paranoid */ verify(tail(this)->link.next == 0p );
+	/* paranoid */ verify(tail(this)->link.prev == head(this) );
+	/* paranoid */ verify(&head(this)->link.prev == &this.before.link.prev );
+	/* paranoid */ verify(&head(this)->link.next == &this.before.link.next );
+	/* paranoid */ verify(&tail(this)->link.prev == &this.after .link.prev );
+	/* paranoid */ verify(&tail(this)->link.next == &this.after .link.next );
 	/* paranoid */ verify(sizeof(__intrusive_ready_queue_t) == 128);
 	/* paranoid */ verify(sizeof(this) == 128);
…
 	/* paranoid */ verify(__alignof__(this) == 128);
 	/* paranoid */ verifyf(((intptr_t)(&this) % 128) == 0, "Expected address to be aligned %p %% 128 == %zd", &this, ((intptr_t)(&this) % 128));
+
+	/* paranoid */ verifyf(readyQ_mask_shit_length() == 6 , "%zu", readyQ_mask_shit_length());
+	/* paranoid */ verifyf(readyQ_mask_full() == 63, "%zu", readyQ_mask_full());
 }
 
…
 void ^?{}( __intrusive_ready_queue_t & this ) {
 	// Make sure the list is empty
-	/* paranoid */ verify(head(this)->prev == 0p );
-	/* paranoid */ verify(head(this)->next == tail(this) );
-	/* paranoid */ verify(tail(this)->next == 0p );
-	/* paranoid */ verify(tail(this)->prev == head(this) );
+	/* paranoid */ verify(head(this)->link.prev == 0p );
+	/* paranoid */ verify(head(this)->link.next == tail(this) );
+	/* paranoid */ verify(tail(this)->link.next == 0p );
+	/* paranoid */ verify(tail(this)->link.prev == head(this) );
 }
 
…
 bool push(__intrusive_ready_queue_t & this, thread_desc * node) {
 	verify(this.lock);
-	verify(node->ts != 0);
-	verify(node->next == 0p);
-	verify(node->prev == 0p);
-
+	verify(node->link.ts != 0);
+	verify(node->link.next == 0p);
+	verify(node->link.prev == 0p);
+
+	if(this.before.link.ts == 0l) {
+		verify(tail(this)->link.next == 0p);
+		verify(tail(this)->link.prev == head(this));
+		verify(head(this)->link.next == tail(this));
+		verify(head(this)->link.prev == 0p);
+	}
 
 	// Get the relevant nodes locally
 	thread_desc * tail = tail(this);
-	thread_desc * prev = tail->prev;
+	thread_desc * prev = tail->link.prev;
 
 	// Do the push
-	node->next = tail;
-	node->prev = prev;
-	prev->next = node;
-	tail->prev = node;
+	node->link.next = tail;
+	node->link.prev = prev;
+	prev->link.next = node;
+	tail->link.prev = node;
 
 	// Update stats
…
 	#endif
 
+	verify(node->link.next == tail(this));
+
 	// Check if the queue used to be empty
-	if(this.before.ts == 0l) {
-		this.before.ts = node->ts;
-		verify(node->prev == head(this));
+	if(this.before.link.ts == 0l) {
+		this.before.link.ts = node->link.ts;
+		verify(node->link.prev == head(this));
 		return true;
 	}
…
 [thread_desc *, bool] pop(__intrusive_ready_queue_t & this) {
 	verify(this.lock);
+	verify(this.before.link.ts != 0ul);
 	thread_desc * head = head(this);
 	thread_desc * tail = tail(this);
 
-	thread_desc * node = head->next;
-	thread_desc * next = node->next;
-	if(node == tail) return [0p, false];
+	thread_desc * node = head->link.next;
+	thread_desc * next = node->link.next;
+	if(node == tail) {
+		verify(false);
+		verify(this.before.link.ts == 0ul);
+		verify(tail(this)->link.next == 0p);
+		verify(tail(this)->link.prev == head(this));
+		verify(head(this)->link.next == tail(this));
+		verify(head(this)->link.prev == 0p);
+		return [0p, false];
+	}
 
 	/* paranoid */ verify(node);
 
-	head->next = next;
-	next->prev = head;
+	head->link.next = next;
+	next->link.prev = head;
 
 	#ifndef __CFA_NO_SCHED_STATS__
…
 
 	if(next == tail) {
-		this.before.ts = 0ul;
-		node->[next, prev] = 0p;
+		this.before.link.ts = 0ul;
+		verify(tail(this)->link.next == 0p);
+		verify(tail(this)->link.prev == head(this));
+		verify(head(this)->link.next == tail(this));
+		verify(head(this)->link.prev == 0p);
+		node->link.[next, prev] = 0p;
 		return [node, true];
 	}
 	else {
-		verify(next->ts != 0);
-		this.before.ts = next->ts;
-		verify(this.before.ts != 0);
-		node->[next, prev] = 0p;
+		verify(next->link.ts != 0);
+		this.before.link.ts = next->link.ts;
+		verify(this.before.link.ts != 0);
+		node->link.[next, prev] = 0p;
 		return [node, false];
 	}
…
 
 static inline unsigned long long ts(__intrusive_ready_queue_t & this) {
-	return this.before.ts;
-}
+	return this.before.link.ts;
+}
+
+//=======================================================================
+// Cforall Reqdy Queue used by ready queue
+//=======================================================================
+
+static __attribute__((aligned(128))) thread_local struct {
+	struct {
+		struct {
+			size_t attempt;
+			size_t success;
+		} push;
+		struct {
+			size_t maskrds;
+			size_t attempt;
+			size_t success;
+		} pop;
+	} pick;
+	struct {
+		size_t value;
+		size_t count;
+	} full;
+} tls = {
+	/* pick */{
+		/* push */{ 0, 0 },
+		/* pop  */{ 0, 0, 0 },
+	},
+	/* full */{ 0, 0 }
+};
+
+//-----------------------------------------------------------------------
+
+void ?{}(__ready_queue_t & this) with (this) {
+	empty.count = 0;
+	for( i ; __cfa_readyQ_mask_size ) {
+		empty.mask[i] = 0;
+	}
+
+	list.data = alloc(4);
+	for( i; 4 ) {
+		(list.data[i]){};
+	}
+	list.count = 4;
+
+	#if !defined(__CFA_NO_STATISTICS__)
+		global_stats.pick.push.attempt = 0;
+		global_stats.pick.push.success = 0;
+		global_stats.pick.pop .maskrds = 0;
+		global_stats.pick.pop .attempt = 0;
+		global_stats.pick.pop .success = 0;
+
+		global_stats.full.value = 0;
+		global_stats.full.count = 0;
+	#endif
+}
+
+void ^?{}(__ready_queue_t & this) with (this) {
+	verify( 4 == list .count );
+	verify( 0 == empty.count );
+
+	for( i; 4 ) {
+		^(list.data[i]){};
+	}
+	free(list.data);
+
+
+	#if defined(__CFA_WITH_VERIFY__)
+		for( i ; __cfa_readyQ_mask_size ) {
+			assert( 0 == empty.mask[i] );
+		}
+	#endif
+}
+
+//-----------------------------------------------------------------------
+
+__attribute__((hot)) bool push(struct cluster * cltr, struct thread_desc * thrd) with (cltr->ready_queue) {
+	thrd->link.ts = rdtscl();
+
+	while(true) {
+		// Pick a random list
+		unsigned i = tls_rand() % list.count;
+
+		#if !defined(__CFA_NO_STATISTICS__)
+			tls.pick.push.attempt++;
+		#endif
+
+		// If we can't lock it retry
+		if( !__atomic_try_acquire( &list.data[i].lock ) ) continue;
+		verify(list.data[i].last_id == -1u);
+		list.data[i].last_id = kernelTLS.this_processor->id;
+
+		__attribute__((unused)) size_t num = __atomic_load_n( &empty.count, __ATOMIC_RELAXED );
+		bool first = false;
+
+		verify( list.data[i].last_id == kernelTLS.this_processor->id );
+		verify( list.data[i].lock );
+		// Actually push it
+		if(push(list.data[i], thrd)) {
+			size_t ret = __atomic_fetch_add( &empty.count, 1z, __ATOMIC_SEQ_CST);
+			first = (ret == 0);
+
+			__cfa_readyQ_mask_t word;
+			__cfa_readyQ_mask_t bit;
+			[bit, word] = extract(i);
+			verifyf((empty.mask[word] & (1ull << bit)) == 0, "Before set %llu:%llu (%u), %llx & %llx", word, bit, i, empty.mask[word], (1ull << bit));
+			__attribute__((unused)) bool ret = bts(&empty.mask[word], bit);
+			verify(!(bool)ret);
+			verifyf((empty.mask[word] & (1ull << bit)) != 0, "After set %llu:%llu (%u), %llx & %llx", word, bit, i, empty.mask[word], (1ull << bit));
+		}
+		verify(empty.count <= (int)list.count);
+		verify( list.data[i].last_id == kernelTLS.this_processor->id );
+		verify( list.data[i].lock );
+
+		// Unlock and return
+		list.data[i].last_id = -1u;
+		__atomic_unlock( &list.data[i].lock );
+
+		#if !defined(__CFA_NO_STATISTICS__)
+			tls.pick.push.success++;
+			tls.full.value += num;
+			tls.full.count += 1;
+		#endif
+		return first;
+	}
+}
+
+//-----------------------------------------------------------------------
+
+static struct thread_desc * try_pop(struct cluster * cltr, unsigned i, unsigned j) with (cltr->ready_queue) {
+	#if !defined(__CFA_NO_STATISTICS__)
+		tls.pick.pop.attempt++;
+	#endif
+
+	// Pick the bet list
+	int w = i;
+	if( __builtin_expect(ts(list.data[j]) != 0, true) ) {
+		w = (ts(list.data[i]) < ts(list.data[j])) ? i : j;
+	}
+
+	__intrusive_ready_queue_t & list = list.data[w];
+	// If list looks empty retry
+	if( ts(list) == 0 ) return 0p;
+
+	// If we can't get the lock retry
+	if( !__atomic_try_acquire(&list.lock) ) return 0p;
+	verify(list.last_id == -1u);
+	list.last_id = kernelTLS.this_processor->id;
+
+	verify(list.last_id == kernelTLS.this_processor->id);
+
+	__attribute__((unused)) int num = __atomic_load_n( &empty.count, __ATOMIC_RELAXED );
+
+
+	// If list is empty, unlock and retry
+	if( ts(list) == 0 ) {
+		list.last_id = -1u;
+		__atomic_unlock(&list.lock);
+		return 0p;
+	}
+	{
+		__cfa_readyQ_mask_t word;
+		__cfa_readyQ_mask_t bit;
+		[bit, word] = extract(w);
+		verify((empty.mask[word] & (1ull << bit)) != 0);
+	}
+
+	verify(list.last_id == kernelTLS.this_processor->id);
+	verify(list.lock);
+
+	// Actually pop the list
+	struct thread_desc * thrd;
+	bool emptied;
+	[thrd, emptied] = pop(list);
+	verify(thrd);
+
+	verify(list.last_id == kernelTLS.this_processor->id);
+	verify(list.lock);
+
+	if(emptied) {
+		__atomic_fetch_sub( &empty.count, 1z, __ATOMIC_SEQ_CST);
+
+		__cfa_readyQ_mask_t word;
+		__cfa_readyQ_mask_t bit;
+		[bit, word] = extract(w);
+		verify((empty.mask[word] & (1ull << bit)) != 0);
+		__attribute__((unused)) bool ret = btr(&empty.mask[word], bit);
+		verify(ret);
+		verify((empty.mask[word] & (1ull << bit)) == 0);
+	}
+
+	verify(list.lock);
+
+	// Unlock and return
+	list.last_id = -1u;
+	__atomic_unlock(&list.lock);
+	verify(empty.count >= 0);
+
+	#if !defined(__CFA_NO_STATISTICS__)
+		tls.pick.pop.success++;
+		tls.full.value += num;
+		tls.full.count += 1;
+	#endif
+
+	return thrd;
+}
+
+__attribute__((hot)) thread_desc * pop(struct cluster * cltr) with (cltr->ready_queue) {
+	verify( list.count > 0 );
+	while( __atomic_load_n( &empty.count, __ATOMIC_RELAXED ) != 0) {
+		#if !defined(__CFA_READQ_NO_BITMASK__)
+			tls.pick.pop.maskrds++;
+			unsigned i, j;
+			{
+				#if !defined(__CFA_NO_SCHED_STATS__)
+					tls.pick.pop.maskrds++;
+				#endif
+
+				// Pick two lists at random
+				unsigned num = ((__atomic_load_n( &list.count, __ATOMIC_RELAXED ) - 1) >> 6) + 1;
+
+				unsigned ri = tls_rand();
+				unsigned rj = tls_rand();
+
+				unsigned wdxi = (ri >> 6u) % num;
+				unsigned wdxj = (rj >> 6u) % num;
+
+				size_t maski = __atomic_load_n( &empty.mask[wdxi], __ATOMIC_RELAXED );
+				size_t maskj = __atomic_load_n( &empty.mask[wdxj], __ATOMIC_RELAXED );
+
+				if(maski == 0 && maskj == 0) continue;
+
+				unsigned bi = rand_bit(ri, maski);
+				unsigned bj = rand_bit(rj, maskj);
+
+				verifyf(bi < 64, "%zu %u", maski, bi);
+				verifyf(bj < 64, "%zu %u", maskj, bj);
+
+				i = bi | (wdxi << 6);
+				j = bj | (wdxj << 6);
+
+				verifyf(i < list.count, "%u", wdxi << 6);
+				verifyf(j < list.count, "%u", wdxj << 6);
+			}
+
+			struct thread_desc * thrd = try_pop(cltr, i, j);
+			if(thrd) return thrd;
+		#else
+			// Pick two lists at random
+			int i = tls_rand() % __atomic_load_n( &list.count, __ATOMIC_RELAXED );
+			int j = tls_rand() % __atomic_load_n( &list.count, __ATOMIC_RELAXED );
+
+			struct thread_desc * thrd = try_pop(cltr, i, j);
+			if(thrd) return thrd;
+		#endif
+	}
+
+	return 0p;
+}
+
+//-----------------------------------------------------------------------
+
+static void check( __ready_queue_t & q ) with (q) {
+	#if defined(__CFA_WITH_VERIFY__)
+		{
+			int idx = 0;
+			for( w ; __cfa_readyQ_mask_size ) {
+				for( b ; 8 * sizeof(__cfa_readyQ_mask_t) ) {
+					bool is_empty = idx < list.count ? (ts(list.data[idx]) == 0) : true;
+					bool should_be_empty = 0 == (empty.mask[w] & (1z << b));
+					assertf(should_be_empty == is_empty, "Inconsistent list %d, mask expect : %d, actual is got %d", idx, should_be_empty, (bool)is_empty);
+					assert(__cfa_max_readyQs > idx);
+					idx++;
+				}
+			}
+		}
+
+		{
+			for( idx ; list.count ) {
+				__intrusive_ready_queue_t & sl = list.data[idx];
+				assert(!list.data[idx].lock);
+
+				assert(head(sl)->link.prev == 0p );
+				assert(head(sl)->link.next->link.prev == head(sl) );
+				assert(tail(sl)->link.next == 0p );
+				assert(tail(sl)->link.prev->link.next == tail(sl) );
+
+				if(sl.before.link.ts == 0l) {
+					assert(tail(sl)->link.next == 0p);
+					assert(tail(sl)->link.prev == head(sl));
+					assert(head(sl)->link.next == tail(sl));
+					assert(head(sl)->link.prev == 0p);
+				}
+			}
+		}
+	#endif
+}
+
+// Call this function of the intrusive list was moved using memcpy
+// fixes the list so that the pointers back to anchors aren't left
+// dangling
+static inline void fix(__intrusive_ready_queue_t & ll) {
+	// if the list is not empty then follow he pointer
+	// and fix its reverse
+	if(ll.before.link.ts != 0l) {
+		head(ll)->link.next->link.prev = head(ll);
+		tail(ll)->link.prev->link.next = tail(ll);
+	}
+	// Otherwise just reset the list
+	else {
+		tail(ll)->link.next = 0p;
+		tail(ll)->link.prev = head(ll);
+		head(ll)->link.next = tail(ll);
+		head(ll)->link.prev = 0p;
+	}
+}
+
+void ready_queue_grow (struct cluster * cltr) {
+	uint_fast32_t last_size = ready_mutate_lock( *cltr );
+	check( cltr->ready_queue );
+
+	with( cltr->ready_queue ) {
+		size_t ncount = list.count;
+
+		// Check that we have some space left
+		if(ncount + 4 >= __cfa_max_readyQs) abort("Program attempted to create more than maximum number of Ready Queues (%zu)", __cfa_max_readyQs);
+
+		ncount += 4;
+
+		// Allocate new array
+		list.data = alloc(list.data, ncount);
+
+		// Fix the moved data
+		for( idx; (size_t)list.count ) {
+			fix(list.data[idx]);
+		}
+
+		// Construct new data
+		for( idx; (size_t)list.count ~ ncount) {
+			(list.data[idx]){};
+		}
+
+		// Update original
+		list.count = ncount;
+		// fields in empty don't need to change
+	}
+
+	// Make sure that everything is consistent
+	check( cltr->ready_queue );
+	ready_mutate_unlock( *cltr, last_size );
+}
+
+void ready_queue_shrink(struct cluster * cltr) {
+	uint_fast32_t last_size = ready_mutate_lock( *cltr );
+	with( cltr->ready_queue ) {
+		size_t ocount = list.count;
+		// Check that we have some space left
+		if(ocount < 8) abort("Program attempted to destroy more Ready Queues than were created");
+
+		list.count -= 4;
+
+		// redistribute old data
+		verify(ocount > list.count);
+		for( idx; (size_t)list.count ~ ocount) {
+			// This is not strictly needed but makes checking invariants much easier
+			bool locked = __atomic_try_acquire(&list.data[idx].lock);
+			verify(locked);
+			while(0 != ts(list.data[idx])) {
+				struct thread_desc * thrd;
+				__attribute__((unused)) bool _;
+				[thrd, _] = pop(list.data[idx]);
+				verify(thrd);
+				push(cltr, thrd);
+			}
+
+			__atomic_unlock(&list.data[idx].lock);
+
+			// TODO print the queue statistics here
+
+			^(list.data[idx]){};
+		}
+
+		// clear the now unused masks
+		{
+			__cfa_readyQ_mask_t fword, fbit, lword, lbit;
+			[fbit, fword] = extract(ocount);
+			[lbit, lword] = extract(list.count);
+
+			// For now assume that all queues where coverd by the same bitmask
+			// This is highly probable as long as grow and shrink use groups of 4
+			// exclusively
+			verify(fword == lword);
+			__cfa_readyQ_mask_t clears = ~0;
+
+			for( b ; fbit ~ lbit ) {
+				clears ^= 1 << b;
+			}
+
+			empty.mask[fword] &= clears;
+		}
+
+		// Allocate new array
+		list.data = alloc(list.data, list.count);
+
+		// Fix the moved data
+		for( idx; (size_t)list.count ) {
+			fix(list.data[idx]);
+		}
+	}
+
+	// Make sure that everything is consistent
+	check( cltr->ready_queue );
+	ready_mutate_unlock( *cltr, last_size );
+}
+
+//-----------------------------------------------------------------------
+
+#if !defined(__CFA_NO_STATISTICS__)
+	void stats_tls_tally(struct cluster * cltr) with (cltr->ready_queue) {
+		__atomic_fetch_add( &global_stats.pick.push.attempt, tls.pick.push.attempt, __ATOMIC_SEQ_CST );
+		__atomic_fetch_add( &global_stats.pick.push.success, tls.pick.push.success, __ATOMIC_SEQ_CST );
+		__atomic_fetch_add( &global_stats.pick.pop .maskrds, tls.pick.pop .maskrds, __ATOMIC_SEQ_CST );
+		__atomic_fetch_add( &global_stats.pick.pop .attempt, tls.pick.pop .attempt, __ATOMIC_SEQ_CST );
+		__atomic_fetch_add( &global_stats.pick.pop .success, tls.pick.pop .success, __ATOMIC_SEQ_CST );
+
+		__atomic_fetch_add( &global_stats.full.value, tls.full.value, __ATOMIC_SEQ_CST );
+		__atomic_fetch_add( &global_stats.full.count, tls.full.count, __ATOMIC_SEQ_CST );
+	}
+#endif
```
libcfa/src/concurrency/thread.cfa
```diff
 	self_mon_p = &self_mon;
 	curr_cluster = &cl;
-	next = 0p;
-	prev = 0p;
+	link.next = 0p;
+	link.prev = 0p;
 
 	node.next = NULL;
```