Changeset 13c5e19
- Timestamp:
- Jun 23, 2020, 4:42:58 PM (5 years ago)
- Branches:
- ADT, arm-eh, ast-experimental, enum, forall-pointer-decay, jacob/cs343-translation, master, new-ast, new-ast-unique-expr, pthread-emulation, qualifiedEnum
- Children:
- de917da3
- Parents:
- b232745
- Location:
- libcfa/src
- Files:
- 3 added
- 8 edited
libcfa/src/concurrency/invoke.h
- The per-kernel-thread random seed in the kernelTLS block (declared __attribute__ ((tls_model ( "initial-exec" ))), just after preemption_state) is widened from uint32_t rand_seed to __uint128_t rand_seed, making room for the state of the 128-bit generator introduced in kernel_private.hfa below.
libcfa/src/concurrency/io.cfa
- The fast-poller bookkeeping struct gains a __processor_id_t id field alongside its stack and pthread_t kthrd.
- In the user-thread poller shutdown path, the paranoid checks on the processor lists (procs.head / idles.head) are replaced by verify( this.nprocessors == 0 || &this == mainCluster ) and verify( !ready_mutate_islocked() ).
- Cleaning up a thread that was preempted onto the ready queue is now done under ready_schedule_lock / ready_schedule_unlock( (struct __processor_id_t *)active_processor() ): instead of manually resetting this.ready_queue.head, the code verifies thrd.link.next != 0p, calls the new remove_head( &this, &thrd ), verifies the removal, clears thrd.link.next and thrd.link.prev, and resets thrd.state, thrd.ticket and thrd.preempted before pretending the thread was blocked all along.
- After io_uring_enter, the check on the submit-queue head becomes a verifyf with an "Expected %u got %u" message instead of a bare verify.
- Statistic counters drop the intermediate .stats level (for example io.submit_q.stats.submit_avg.rdy becomes io.submit_q.submit_avg.rdy), matching the reorganised stats structures.
- __unpark now takes the unparking processor id: the slow poller passes &ring.poller.slow.id both when waking the thread of a completed request from kernel context and when waking the fast poller.
- The slow poller thread sets up a local __stats_t (when statistics are enabled) as its kernelTLS.this_stats, and registers/unregisters its __processor_id_t with doregister / unregister around its main loop.
- The fast poller's call to __drain_io and its statistics update, the search loop in the submission-slot allocator (whose unused LOOKING: label is dropped), and the submission path around io_uring_enter are all bracketed with disable_interrupts() / enable_interrupts( __cfaabi_dbg_ctx ).
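The recurring pattern in io.cfa is that every path updating the per-processor counters reached through __tls_stats() is now wrapped in disable_interrupts() / enable_interrupts(), presumably so the thread cannot be preempted (and migrated to another processor) mid-update. A minimal C sketch of that bracketing; the stand-in functions and the thread-local statistics block below are hypothetical, the real primitives live in the CFA kernel and take a debug-context argument:

#include <stdint.h>

// Hypothetical stand-ins: in the CFA runtime these are kernel primitives and
// the per-processor statistics block reached through __tls_stats().
static _Thread_local struct { uint64_t rdy, csm, avl, cnt; } tls_submit_avg;

static void disable_interrupts( void ) { /* mask the preemption signal */ }
static void enable_interrupts( void )  { /* unmask it, deliver pending preemptions */ }

// Sketch of the bracketing added around the io_uring submission bookkeeping:
// while interrupts are off the counters being bumped stay those of the
// processor the thread is currently running on.
static void record_submit( uint32_t to_submit, int consumed, uint32_t avail ) {
	disable_interrupts();
	tls_submit_avg.rdy += to_submit;
	tls_submit_avg.csm += consumed;
	tls_submit_avg.avl += avail;
	tls_submit_avg.cnt += 1;
	enable_interrupts();
}

int main( void ) {
	record_submit( 4, 4, 28 );
	return 0;
}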
libcfa/src/concurrency/kernel.cfa
- The processor constructor ?{}(processor &, const char [], cluster &) renames its cluster parameter to _cltr (avoiding the clash with the cltr member opened by with(this)) and, after creating the kernel thread, does __atomic_fetch_add( &cltr->nprocessors, 1u, __ATOMIC_SEQ_CST ); the matching destructor does __atomic_fetch_sub after freeing the stack. The main processor's constructor and destructor get the same increment and decrement.
- The cluster constructor initialises this.nprocessors = 0.
- The processor's kernel routine no longer calls doregister(this->cltr, this) / unregister(this->cltr, this) around its run loop, and the already commented-out doregister / unregister( cluster *, processor * ) definitions are deleted: the atomic counter replaces that registration list.
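The locked list of registered processors is thus replaced by a single atomically maintained count. A small self-contained C sketch of the same idea using the identical GCC builtins; the cluster struct here is a hypothetical stand-in containing only the field that matters:

#include <pthread.h>
#include <stdio.h>

// Hypothetical stand-in for the cluster: only the processor count matters here.
struct cluster { volatile unsigned int nprocessors; };

static struct cluster the_cluster = { 0 };

// Each "processor" bumps the count when it comes up and drops it when it goes
// away, mirroring the __atomic_fetch_add / __atomic_fetch_sub in the changeset.
static void * processor_main( void * arg ) {
	(void)arg;
	__atomic_fetch_add( &the_cluster.nprocessors, 1u, __ATOMIC_SEQ_CST );
	/* ... run the scheduler loop ... */
	__atomic_fetch_sub( &the_cluster.nprocessors, 1u, __ATOMIC_SEQ_CST );
	return 0;
}

int main( void ) {
	pthread_t t[4];
	for ( int i = 0; i < 4; i += 1 ) pthread_create( &t[i], 0, processor_main, 0 );
	for ( int i = 0; i < 4; i += 1 ) pthread_join( t[i], 0 );
	// Once every processor has unregistered, checks such as the one added in
	// io.cfa ( this.nprocessors == 0 ) hold again.
	printf( "processors left: %u\n",
	        __atomic_load_n( &the_cluster.nprocessors, __ATOMIC_SEQ_CST ) );
	return 0;
}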
libcfa/src/concurrency/kernel.hfa
- In the cluster declaration, next to the list of idle processors (StackLF(processor) idles), the processor count becomes volatile: unsigned int nprocessors is now volatile unsigned int nprocessors.
libcfa/src/concurrency/kernel_private.hfa
- Adds #include "bits/random.hfa".
- __tls_rand() now returns uint64_t and is implemented as return __lehmer64( kernelTLS.rand_seed ); the old 32-bit xorshift body (shifts by 6, 21 and 7) is left commented out above it.
- The declarations of doregister / unregister( struct cluster *, struct processor * ) are removed.
- A new declaration is added: bool remove_head(struct cluster * cltr, struct $thread * thrd), which removes a thread from a cluster's ready queue and reports whether it was found.
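The body of __lehmer64 is not shown in this changeset (it presumably comes with the new bits/random.hfa), but the __uint128_t seed added in invoke.h matches the usual 128-bit Lehmer/MCG generator popularised by Lemire. A minimal C sketch of that common formulation, for reference only; the constant and structure are that generator's, not necessarily the exact code in bits/random.hfa:

#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>

// Common 128-bit Lehmer generator (multiplicative congruential, modulus 2^128).
// One multiply per draw; the high 64 bits of the state are returned because
// they have much better statistical quality than the low bits.
static inline uint64_t lehmer64( unsigned __int128 * state ) {
	*state *= 0xda942042e4dd58b5ULL;      // the multiplier used in Lemire's lehmer64
	return (uint64_t)( *state >> 64 );
}

int main( void ) {
	// Seed must be non-zero; odd seeds give the maximum period.
	unsigned __int128 seed = 0x853c49e6748fea9bULL;
	for ( int i = 0; i < 4; i += 1 ) {
		printf( "%016" PRIx64 "\n", lehmer64( &seed ) );
	}
	return 0;
}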
libcfa/src/concurrency/ready_queue.cfa
- Adds #include "snzi.hfa" and #include "ready_subqueue.hfa"; the in-file definitions of the intrusive lane (__intrusive_lane_t with its head/tail anchors, push, pop, is_empty and ts helpers and their paranoid assertions) and of the scalable non-zero indicator (__snzi_val_t, __snzi_node_t, arrive/depart/query and the __snzi_t constructor/destructor) are deleted from this file, presumably now living in two of the three newly added headers.
- BIAS is lowered from 64 to 8.
- In push, the biased branch triggers on 0 != rlow (instead of 0 != (rlow % BIAS)), computes the preferred lane as pid + (rhigh % 4) with pid = kernelTLS.this_processor->id * 4, and counts the event in __tls_stats()->ready.pick.push.local.
- pop is moved ahead of the try_pop helpers and reworked: it draws one 64-bit random value, caps the use of the preferred lanes with local_tries = 8, derives the two candidate lanes from the low and high halves of rhigh (pid + (rhigh % 4) and pid + ((rhigh >> 32ull) % 4)), counts preferred picks in ready.pick.pop.local, and otherwise falls back to two fully random lanes.
- try_pop(cltr, i, j) becomes static inline, keeps only the choice of the older of the two lanes, and delegates the actual dequeue to a new try_pop(cltr, w) overload for the chosen lane.
- A new remove_head(cluster *, $thread *) scans the lanes; under each lane's lock it checks whether the given thread is at the head, pops it if so (verifying the popped thread and calling depart(snzi, i) when the lane empties), and returns whether anything was removed.
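A compact sketch of the biased lane choice that push and pop now share, with BIAS = 8: with probability (BIAS-1)/BIAS the random draw is mapped onto the four lanes owned by the current processor, otherwise onto any lane. The processor id, lane count and toy PRNG below are stand-ins for kernelTLS.this_processor->id, lanes.count and __tls_rand():

#include <stdint.h>
#include <stdio.h>

#define BIAS 8   // the changeset lowers this from 64 to 8

// Map one 64-bit random draw to a lane index: (BIAS-1)/BIAS of the time use one
// of the 4 lanes owned by this processor, 1/BIAS of the time any lane at all.
static unsigned pick_lane( uint64_t r, unsigned proc_id, unsigned lane_count ) {
	unsigned rlow  = (unsigned)( r % BIAS );
	uint64_t rhigh = r / BIAS;
	if ( rlow != 0 ) {
		unsigned pid = proc_id * 4;               // each processor owns 4 consecutive lanes
		return ( pid + (unsigned)( rhigh % 4 ) ) % lane_count;
	}
	return (unsigned)( rhigh % lane_count );      // unbiased fallback
}

int main( void ) {
	uint64_t r = 0x9e3779b97f4a7c15ULL;           // toy LCG state, not __tls_rand()
	unsigned local = 0, total = 100000;
	for ( unsigned n = 0; n < total; n += 1 ) {
		r = r * 6364136223846793005ULL + 1442695040888963407ULL;
		unsigned lane = pick_lane( r, /* proc_id */ 2, /* lane_count */ 32 );
		if ( lane >= 8 && lane < 12 ) local += 1; // processor 2 owns lanes 8..11
	}
	printf( "fraction on local lanes: %.3f (expected about %.3f)\n",
	        (double)local / total, ( BIAS - 1.0 ) / BIAS );
	return 0;
}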
libcfa/src/concurrency/stats.cfa
- __init_stats zeroes the new ready.pick.pop.local counter, and __tally_stats additionally accumulates ready.pick.push.local and ready.pick.pop.local into the cluster totals.
- __print_stats now opens with with( *stats ), reaches the I/O counters through their full io.submit_q.… / io.complete_q.… paths (submit_avg, look_avg, alloc_avg, completed_avg), adds the local-queue counts to the push/pop probe lines ("%'15lu attempts, %'15lu local"), and switches the I/O totals from %'15llu to %'15lu to match the counters' new uint64_t type.
libcfa/src/concurrency/stats.hfa
- Both the push and pop statistic blocks gain a volatile uint64_t local field counting attempts that used the preferred queues.
- The complete_q.completed_avg fields val, slow_cnt and fast_cnt change from unsigned long long int to volatile uint64_t.