Changeset c8a0210 for libcfa/src/concurrency

libcfa/src/concurrency/coroutine.cfa

-              r857a1c6
+              rc8a0210
 //-----------------------------------------------------------------------------
+FORALL_DATA_INSTANCE(CoroutineCancelled, (coroutine_t &), (coroutine_t))
+forall(T &)
+void mark_exception(CoroutineCancelled(T) *) {}
+EHM_VIRTUAL_TABLE(SomeCoroutineCancelled, std_coroutine_cancelled);
 forall(T &)
 …
         // TODO: Remove explicit vtable set once trac#186 is fixed.
         CoroutineCancelled(T) except;
         except.virtual_table = &get_exception_vtable(&except);
+        SomeCoroutineCancelled except;
+        except.virtual_table = &std_coroutine_cancelled;
         except.the_coroutine = &cor;
         except.the_exception = except;
+        throwResume except;
+        // Why does this need a cast?
+        throwResume (SomeCoroutineCancelled &)except;
         except->virtual_table->free( except );

libcfa/src/concurrency/coroutine.hfa

-              r857a1c6
+              rc8a0210
 //-----------------------------------------------------------------------------
 // Exception thrown from resume when a coroutine stack is cancelled.
+FORALL_DATA_EXCEPTION(CoroutineCancelled, (coroutine_t &), (coroutine_t)) (
+EHM_EXCEPTION(SomeCoroutineCancelled)(
+        void * the_coroutine;
+        exception_t * the_exception;
+);
+EHM_EXTERN_VTABLE(SomeCoroutineCancelled, std_coroutine_cancelled);
+EHM_FORALL_EXCEPTION(CoroutineCancelled, (coroutine_t &), (coroutine_t)) (
         coroutine_t * the_coroutine;
         exception_t * the_exception;
 …
 // Anything that implements this trait can be resumed.
 // Anything that is resumed is a coroutine.
 trait is_coroutine(T & | IS_RESUMPTION_EXCEPTION(CoroutineCancelled, (T))) {
+trait is_coroutine(T & | IS_RESUMPTION_EXCEPTION(SomeCoroutineCancelled)) {
         void main(T & this);
         $coroutine * get_coroutine(T & this);

libcfa/src/concurrency/invoke.h

r857a1c6	rc8a0210
148	148	struct $thread * prev;
149	149	volatile unsigned long long ts;
150		~~int preferred;~~
151	150	};
152	151

libcfa/src/concurrency/io/call.cfa.in

-              r857a1c6
+              rc8a0210
                 sqe->opcode = IORING_OP_{op};
                 sqe->user_data = (__u64)(uintptr_t)&future;
+                sqe->user_data = (uintptr_t)&future;
                 sqe->flags = sflags;
                 sqe->ioprio = 0;
 …
                 asm volatile("": : :"memory");
                 verify( sqe->user_data == (__u64)(uintptr_t)&future );
+                verify( sqe->user_data == (uintptr_t)&future );
                 cfa_io_submit( ctx, &idx, 1, 0 != (submit_flags & CFA_IO_LAZY) );
         #endif
 …
                 'fd'  : 'fd',
                 'off' : 'offset',
                 'addr': '(__u64)iov',
+                'addr': '(uintptr_t)iov',
                 'len' : 'iovcnt',
         }, define = 'CFA_HAVE_PREADV2'),
 …
                 'fd'  : 'fd',
                 'off' : 'offset',
                 'addr': '(__u64)iov',
+                'addr': '(uintptr_t)iov',
                 'len' : 'iovcnt'
         }, define = 'CFA_HAVE_PWRITEV2'),
 …
                 'addr': 'fd',
                 'len': 'op',
                 'off': '(__u64)event'
+                'off': '(uintptr_t)event'
         }),
         # CFA_HAVE_IORING_OP_SYNC_FILE_RANGE
 …
         Call('SENDMSG', 'ssize_t sendmsg(int sockfd, const struct msghdr *msg, int flags)', {
                 'fd': 'sockfd',
                 'addr': '(__u64)(struct msghdr *)msg',
+                'addr': '(uintptr_t)(struct msghdr *)msg',
                 'len': '1',
                 'msg_flags': 'flags'
 …
         Call('RECVMSG', 'ssize_t recvmsg(int sockfd, struct msghdr *msg, int flags)', {
                 'fd': 'sockfd',
                 'addr': '(__u64)(struct msghdr *)msg',
+                'addr': '(uintptr_t)(struct msghdr *)msg',
                 'len': '1',
                 'msg_flags': 'flags'
 …
         Call('SEND', 'ssize_t send(int sockfd, const void *buf, size_t len, int flags)', {
                 'fd': 'sockfd',
                 'addr': '(__u64)buf',
+                'addr': '(uintptr_t)buf',
                 'len': 'len',
                 'msg_flags': 'flags'
 …
         Call('RECV', 'ssize_t recv(int sockfd, void *buf, size_t len, int flags)', {
                 'fd': 'sockfd',
                 'addr': '(__u64)buf',
+                'addr': '(uintptr_t)buf',
                 'len': 'len',
                 'msg_flags': 'flags'
 …
         Call('ACCEPT', 'int accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags)', {
                 'fd': 'sockfd',
                 'addr': '(__u64)addr',
                 'addr2': '(__u64)addrlen',
+                'addr': '(uintptr_t)addr',
+                'addr2': '(uintptr_t)addrlen',
                 'accept_flags': 'flags'
         }),
 …
         Call('CONNECT', 'int connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen)', {
                 'fd': 'sockfd',
                 'addr': '(__u64)addr',
+                'addr': '(uintptr_t)addr',
                 'off': 'addrlen'
         }),
 …
         Call('FALLOCATE', 'int fallocate(int fd, int mode, off_t offset, off_t len)', {
                 'fd': 'fd',
                 'addr': '(__u64)len',
+                'addr': '(uintptr_t)len',
                 'len': 'mode',
                 'off': 'offset'
 …
         # CFA_HAVE_IORING_OP_MADVISE
         Call('MADVISE', 'int madvise(void *addr, size_t length, int advice)', {
                 'addr': '(__u64)addr',
+                'addr': '(uintptr_t)addr',
                 'len': 'length',
                 'fadvise_advice': 'advice'
 …
         Call('OPENAT', 'int openat(int dirfd, const char *pathname, int flags, mode_t mode)', {
                 'fd': 'dirfd',
                 'addr': '(__u64)pathname',
+                'addr': '(uintptr_t)pathname',
                 'len': 'mode',
                 'open_flags': 'flags;'
 …
                 'addr': 'pathname',
                 'len': 'sizeof(*how)',
                 'off': '(__u64)how',
+                'off': '(uintptr_t)how',
         }, define = 'CFA_HAVE_OPENAT2'),
         # CFA_HAVE_IORING_OP_CLOSE
 …
         Call('STATX', 'int statx(int dirfd, const char *pathname, int flags, unsigned int mask, struct statx *statxbuf)', {
                 'fd': 'dirfd',
                 'off': '(__u64)statxbuf',
+                'off': '(uintptr_t)statxbuf',
                 'addr': 'pathname',
                 'len': 'mask',
 …
         Call('READ', 'ssize_t read(int fd, void * buf, size_t count)', {
                 'fd': 'fd',
                 'addr': '(__u64)buf',
+                'addr': '(uintptr_t)buf',
                 'len': 'count'
         }),
 …
         Call('WRITE', 'ssize_t write(int fd, void * buf, size_t count)', {
                 'fd': 'fd',
                 'addr': '(__u64)buf',
+                'addr': '(uintptr_t)buf',
                 'len': 'count'
         }),

libcfa/src/concurrency/kernel.cfa

-              r857a1c6
+              rc8a0210
 static void __wake_one(cluster * cltr);
 static void push  (__cluster_idles & idles, processor & proc);
 static void remove(__cluster_idles & idles, processor & proc);
 static [unsigned idle, unsigned total, * processor] query( & __cluster_idles idles );
+static void mark_idle (__cluster_proc_list & idles, processor & proc);
+static void mark_awake(__cluster_proc_list & idles, processor & proc);
+static [unsigned idle, unsigned total, * processor] query_idles( & __cluster_proc_list idles );
 extern void __cfa_io_start( processor * );
 …
                                 // Push self to idle stack
                                 push(this->cltr->idles, * this);
+                                mark_idle(this->cltr->procs, * this);
                                 // Confirm the ready-queue is empty
 …
                                 if( readyThread ) {
                                         // A thread was found, cancel the halt
                                         remove(this->cltr->idles, * this);
+                                        mark_awake(this->cltr->procs, * this);
                                         #if !defined(__CFA_NO_STATISTICS__)
 …
                                 // We were woken up, remove self from idle
                                 remove(this->cltr->idles, * this);
+                                mark_awake(this->cltr->procs, * this);
                                 // DON'T just proceed, start looking again
 …
                                 #if !defined(__CFA_NO_STATISTICS__)
                                         __tls_stats()->ready.threads.threads++;
+                                        __push_stat( __tls_stats(), __tls_stats()->ready.threads.threads, false, "Processor", this );
                                 #endif
                                 // This is case 2, the racy case, someone tried to run this thread before it finished blocking
 …
         #if !defined(__CFA_NO_STATISTICS__)
                 __tls_stats()->ready.threads.threads--;
+                __push_stat( __tls_stats(), __tls_stats()->ready.threads.threads, false, "Processor", this );
         #endif
 …
                 if( kernelTLS().this_stats ) {
                         __tls_stats()->ready.threads.threads++;
+                        __push_stat( __tls_stats(), __tls_stats()->ready.threads.threads, false, "Processor", kernelTLS().this_processor );
+                }
                 else {
                         __atomic_fetch_add(&cl->stats->ready.threads.threads, 1, __ATOMIC_RELAXED);
+                        __push_stat( cl->stats, cl->stats->ready.threads.threads, true, "Cluster", cl );
+                }
         #endif
 …
         ready_schedule_lock();
                 $thread * thrd = pop( this );
+                $thread * thrd = pop_fast( this );
         ready_schedule_unlock();
 …
         unsigned idle;
         unsigned total;
         [idle, total, p] = query(this->idles);
+        [idle, total, p] = query_idles(this->procs);
         // If no one is sleeping, we are done
 …
+}
 // Record that `proc` has gone idle.
 // Under the list lock: bump the idle counter and put `proc` at the front of the idle list.
 // The revised (+) version calls remove(proc) first -- presumably to unlink it from the
 // actives list it was on; confirm against the dlist API.
 // Must be called with preemption disabled (checked on entry and exit).
 static void push  (__cluster_idles & this, processor & proc) {
+static void mark_idle(__cluster_proc_list & this, processor & proc) {
         /* paranoid */ verify( ! __preemption_enabled() );
         lock( this );
                 this.idle++;
                 // There can never be more idle processors than processors
                 /* paranoid */ verify( this.idle <= this.total );
                 insert_first(this.list, proc);
+                remove(proc);
+                insert_first(this.idles, proc);
         unlock( this );
         /* paranoid */ verify( ! __preemption_enabled() );
+}
 // Record that `proc` is no longer idle.
 // Under the list lock: decrement the idle counter and unlink `proc` with remove();
 // the revised (+) version then appends it to the actives list.
 // Must be called with preemption disabled (checked on entry and exit).
 static void remove(__cluster_idles & this, processor & proc) {
+static void mark_awake(__cluster_proc_list & this, processor & proc) {
         /* paranoid */ verify( ! __preemption_enabled() );
         lock( this );
                 this.idle--;
                 // NOTE(review): if `idle` is an unsigned field this check can never fail -- confirm the field type
                 /* paranoid */ verify( this.idle >= 0 );
                 remove(proc);
+                insert_last(this.actives, proc);
         unlock( this );
         /* paranoid */ verify( ! __preemption_enabled() );
+}
+static [unsigned idle, unsigned total, * processor] query( & __cluster_idles this ) {
+static [unsigned idle, unsigned total, * processor] query_idles( & __cluster_proc_list this ) {
+        /* paranoid */ verify( ! __preemption_enabled() );
+        /* paranoid */ verify( ready_schedule_islocked() );
         for() {
                 uint64_t l = __atomic_load_n(&this.lock, __ATOMIC_SEQ_CST);
 …
                 unsigned idle    = this.idle;
                 unsigned total   = this.total;
                 processor * proc = &this.list`first;
+                processor * proc = &this.idles`first;
                 // Compiler fence is unnecessary, but gcc-8 and older incorrectly reorder code without it
                 asm volatile("": : :"memory");
 …
                 return [idle, total, proc];
+        }
+        /* paranoid */ verify( ready_schedule_islocked() );
+        /* paranoid */ verify( ! __preemption_enabled() );
+}

libcfa/src/concurrency/kernel.hfa

-              r857a1c6
+              rc8a0210
         struct cluster * cltr;
+        // Id within the cluster
+        unsigned cltr_id;
+        // Ready Queue state per processor
+        struct {
+                unsigned short its;
+                unsigned short itr;
+                unsigned id;
+                unsigned target;
+                unsigned long long int cutoff;
+        } rdq;
         // Set to true to notify the processor should terminate
 …
 // Cluster Tools
 // Intrusives lanes which are used by the relaxed ready queue
+// Intrusives lanes which are used by the ready queue
 struct __attribute__((aligned(128))) __intrusive_lane_t;
 void  ?{}(__intrusive_lane_t & this);
 void ^?{}(__intrusive_lane_t & this);
+// Counter used to track whether or not the lanes are all empty
+struct __attribute__((aligned(128))) __snzi_node_t;
+struct __snzi_t {
+        unsigned mask;
+        int root;
+        __snzi_node_t * nodes;
+};
+void  ?{}( __snzi_t & this, unsigned depth );
+void ^?{}( __snzi_t & this );
+// Aligned timestamps which are used by the relaxed ready queue
+struct __attribute__((aligned(128))) __timestamp_t;
+void  ?{}(__timestamp_t & this);
+void ^?{}(__timestamp_t & this);
 //TODO adjust cache size to ARCHITECTURE
 // Structure holding the relaxed ready queue
 struct __ready_queue_t {
-        // Data tracking how many/which lanes are used
-        // Aligned to 128 for cache locality
-        __snzi_t snzi;
         // Data tracking the actual lanes
         // On a separate cacheline from the used struct since
 …
                 __intrusive_lane_t * volatile data;
+                // Array of times
+                __timestamp_t * volatile tscs;
                 // Number of lanes (empty or not)
                 volatile size_t count;
 …
 // Idle Sleep
 struct __cluster_idles {
+struct __cluster_proc_list {
         // Spin lock protecting the queue
         volatile uint64_t lock;
 …
         // List of idle processors
+        dlist(processor, processor) list;
+        dlist(processor, processor) idles;
+        // List of active processors
+        dlist(processor, processor) actives;
 };
 …
         // List of idle processors
         __cluster_idles idles;
+        __cluster_proc_list procs;
         // List of threads

libcfa/src/concurrency/kernel/startup.cfa

-              r857a1c6
+              rc8a0210
                         __print_stats( st, mainProcessor->print_stats, "Processor ", mainProcessor->name, (void*)mainProcessor );
+                }
+                #if defined(CFA_STATS_ARRAY)
+                        __flush_stat( st, "Processor", mainProcessor );
+                #endif
         #endif
 …
                         __print_stats( &local_stats, proc->print_stats, "Processor ", proc->name, (void*)proc );
+                }
+                #if defined(CFA_STATS_ARRAY)
+                        __flush_stat( &local_stats, "Processor", proc );
+                #endif
         #endif
 …
         this.name = name;
         this.cltr = &_cltr;
+        this.rdq.its = 0;
+        this.rdq.itr = 0;
+        this.rdq.id  = -1u;
+        this.rdq.target = -1u;
+        this.rdq.cutoff = -1ull;
         do_terminate = false;
         preemption_alarm = 0p;
 …
         #endif
+        lock( this.cltr->idles );
+                int target = this.cltr->idles.total += 1u;
+        unlock( this.cltr->idles );
+        id = doregister((__processor_id_t*)&this);
+        // Register and Lock the RWlock so no-one pushes/pops while we are changing the queue
+        uint_fast32_t last_size = ready_mutate_register((__processor_id_t*)&this);
+                this.cltr->procs.total += 1u;
+                insert_last(this.cltr->procs.actives, this);
+                // Adjust the ready queue size
+                ready_queue_grow( cltr );
+        // Unlock the RWlock
+        ready_mutate_unlock( last_size );
+        __cfadbg_print_safe(runtime_core, "Kernel : core %p created\n", &this);
+}
+// Not a ctor, it just preps the destruction but should not destroy members
+static void deinit(processor & this) {
         // Lock the RWlock so no-one pushes/pops while we are changing the queue
         uint_fast32_t last_size = ready_mutate_lock();
+                this.cltr->procs.total -= 1u;
+                remove(this);
                 // Adjust the ready queue size
+                this.cltr_id = ready_queue_grow( cltr, target );
+        // Unlock the RWlock
+        ready_mutate_unlock( last_size );
+        __cfadbg_print_safe(runtime_core, "Kernel : core %p created\n", &this);
+}
+// Not a ctor, it just preps the destruction but should not destroy members
+static void deinit(processor & this) {
+        lock( this.cltr->idles );
+                int target = this.cltr->idles.total -= 1u;
+        unlock( this.cltr->idles );
+        // Lock the RWlock so no-one pushes/pops while we are changing the queue
+        uint_fast32_t last_size = ready_mutate_lock();
+                // Adjust the ready queue size
+                ready_queue_shrink( this.cltr, target );
+        // Unlock the RWlock
+        ready_mutate_unlock( last_size );
+        // Finally we don't need the read_lock any more
+        unregister((__processor_id_t*)&this);
+                ready_queue_shrink( this.cltr );
+        // Unlock the RWlock and unregister: we don't need the read_lock any more
+        ready_mutate_unregister((__processor_id_t*)&this, last_size );
         close(this.idle);
 …
 //-----------------------------------------------------------------------------
 // Cluster
 static void ?{}(__cluster_idles & this) {
+static void ?{}(__cluster_proc_list & this) {
         this.lock  = 0;
         this.idle  = 0;
         this.total = 0;
-        (this.list){};
+}
 …
                 // Adjust the ready queue size
                 ready_queue_grow( &this, 0 );
+                ready_queue_grow( &this );
         // Unlock the RWlock
 …
                 // Adjust the ready queue size
                 ready_queue_shrink( &this, 0 );
+                ready_queue_shrink( &this );
         // Unlock the RWlock
 …
                         __print_stats( this.stats, this.print_stats, "Cluster", this.name, (void*)&this );
+                }
+                #if defined(CFA_STATS_ARRAY)
+                        __flush_stat( this.stats, "Cluster", &this );
+                #endif
                 free( this.stats );
         #endif

libcfa/src/concurrency/kernel_private.hfa

-              r857a1c6
+              rc8a0210
 // Cluster lock API
 //=======================================================================
-// Cells use by the reader writer lock
-// while not generic it only relies on a opaque pointer
-struct __attribute__((aligned(128))) __scheduler_lock_id_t {
-        // Spin lock used as the underlying lock
-        volatile bool lock;
-        // Handle pointing to the proc owning this cell
-        // Used for allocating cells and debugging
-        __processor_id_t * volatile handle;
-        #ifdef __CFA_WITH_VERIFY__
-                // Debug, check if this is owned for reading
-                bool owned;
-        #endif
-};
-static_assert( sizeof(struct __scheduler_lock_id_t) <= __alignof(struct __scheduler_lock_id_t));
 // Lock-Free registering/unregistering of threads
 // Register a processor to a given cluster and get its unique id in return
 unsigned doregister( struct __processor_id_t * proc );
+void register_proc_id( struct __processor_id_t * );
 // Unregister a processor from a given cluster using its id, getting back the original pointer
+void     unregister( struct __processor_id_t * proc );
+//-----------------------------------------------------------------------
+// Cluster idle lock/unlock
+// Acquire the spin lock embedded in a __cluster_idles.
+// The lock word is a counter: an even value means unlocked, an odd value means held.
+// Loops until it succeeds in swapping an even value for the next odd one.
+static inline void lock(__cluster_idles & this) {
+        for() {
+                // Snapshot the counter; a CAS is only attempted on an even (unlocked) snapshot
+                uint64_t l = this.lock;
+                if(
+                        (0 == (l % 2))
+                        && __atomic_compare_exchange_n(&this.lock, &l, l + 1, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
+                ) return;
+                // Lock was held or the CAS lost a race: call Pause() before retrying
+                Pause();
+        }
+}
+// Release the lock taken by lock(__cluster_idles &):
+// increments the counter from odd (held) to the next even value (unlocked).
+static inline void unlock(__cluster_idles & this) {
+        // The counter must be odd here, i.e. the lock must currently be held
+        /* paranoid */ verify( 1 == (this.lock % 2) );
+        __atomic_fetch_add( &this.lock, 1, __ATOMIC_SEQ_CST );
+}
+void unregister_proc_id( struct __processor_id_t * proc );
 //=======================================================================
 …
         __atomic_store_n(ll, (bool)false, __ATOMIC_RELEASE);
+}
+// Cells used by the reader-writer lock.
+// While not generic, it only relies on an opaque pointer.
+struct __attribute__((aligned(128))) __scheduler_lock_id_t {
+        // Spin lock used as the underlying lock
+        volatile bool lock;
+        // Handle pointing to the proc owning this cell
+        // Used for allocating cells and debugging
+        __processor_id_t * volatile handle;
+        #ifdef __CFA_WITH_VERIFY__
+                // Debug, check if this is owned for reading
+                bool owned;
+        #endif
+};
+// A cell must not be larger than its own alignment (128 bytes)
+static_assert( sizeof(struct __scheduler_lock_id_t) <= __alignof(struct __scheduler_lock_id_t));
 //-----------------------------------------------------------------------
 …
 void ready_mutate_unlock( uint_fast32_t /* value returned by lock */ );
+//-----------------------------------------------------------------------
+// Lock-Free registering/unregistering of threads
+// Register a processor id via register_proc_id(), then acquire the ready-queue
+// mutate lock.
+// Returns the value produced by ready_mutate_lock(); the caller must hand it back to
+// ready_mutate_unlock() or ready_mutate_unregister().
+static inline uint_fast32_t ready_mutate_register( struct __processor_id_t * proc ) {
+        register_proc_id( proc );
+        return ready_mutate_lock();
+}
+// Release the ready-queue mutate lock, then unregister the processor id via
+// unregister_proc_id().
+// Assumes the lock is held; `last_s` is the value returned when it was acquired.
+static inline void ready_mutate_unregister( struct __processor_id_t * proc, uint_fast32_t last_s ) {
+        ready_mutate_unlock( last_s );
+        unregister_proc_id( proc );
+}
+//-----------------------------------------------------------------------
+// Cluster idle lock/unlock
+// Acquire the lock protecting a cluster's processor lists.
+// Takes the global ready-schedule lock first, then the list's own counting lock
+// (even counter = unlocked, odd counter = held). Both are released by unlock().
+// Must be called with preemption disabled.
+static inline void lock(__cluster_proc_list & this) {
+        /* paranoid */ verify( ! __preemption_enabled() );
+        // Start by locking the global RWlock so that we know no-one is
+        // adding/removing processors while we mess with the idle lock
+        ready_schedule_lock();
+        // Simple counting lock, acquired by incrementing the counter
+        // to an odd number
+        for() {
+                uint64_t l = this.lock;
+                // Leave the loop (rather than the function) on success so that the
+                // post-condition check below is actually executed
+                if(
+                        (0 == (l % 2))
+                        && __atomic_compare_exchange_n(&this.lock, &l, l + 1, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
+                ) break;
+                Pause();
+        }
+        /* paranoid */ verify( ! __preemption_enabled() );
+}
+// Release the lock taken by lock(__cluster_proc_list &): first the list's counting
+// lock, then the global ready-schedule lock.
+// Must be called with preemption disabled and with the counting lock held.
+static inline void unlock(__cluster_proc_list & this) {
+        /* paranoid */ verify( ! __preemption_enabled() );
+        /* paranoid */ verify( 1 == (this.lock % 2) );
+        // Simple counting lock, released by incrementing to an even number
+        __atomic_fetch_add( &this.lock, 1, __ATOMIC_SEQ_CST );
+        // Release the global lock, which we acquired when locking
+        ready_schedule_unlock();
+        /* paranoid */ verify( ! __preemption_enabled() );
+}
 //=======================================================================
 // Ready-Queue API
 //-----------------------------------------------------------------------
-// pop thread from the ready queue of a cluster
-// returns 0p if empty
-__attribute__((hot)) bool query(struct cluster * cltr);
-//-----------------------------------------------------------------------
 // push thread onto a ready queue for a cluster
 // returns true if the list was previously empty, false otherwise
 __attribute__((hot)) bool push(struct cluster * cltr, struct $thread * thrd);
+__attribute__((hot)) void push(struct cluster * cltr, struct $thread * thrd);
 //-----------------------------------------------------------------------
 …
 // returns 0p if empty
 // May return 0p spuriously
 __attribute__((hot)) struct $thread * pop(struct cluster * cltr);
+__attribute__((hot)) struct $thread * pop_fast(struct cluster * cltr);
 //-----------------------------------------------------------------------
 …
 //-----------------------------------------------------------------------
-// remove thread from the ready queue of a cluster
-// returns bool if it wasn't found
-bool remove_head(struct cluster * cltr, struct $thread * thrd);
-//-----------------------------------------------------------------------
 // Increase the width of the ready queue (number of lanes) by 4
 unsigned ready_queue_grow  (struct cluster * cltr, int target);
+void ready_queue_grow  (struct cluster * cltr);
 //-----------------------------------------------------------------------
 // Decrease the width of the ready queue (number of lanes) by 4
 void ready_queue_shrink(struct cluster * cltr, int target);
+void ready_queue_shrink(struct cluster * cltr);

libcfa/src/concurrency/preemption.cfa

-              r857a1c6
+              rc8a0210
 static void * alarm_loop( __attribute__((unused)) void * args ) {
         __processor_id_t id;
         id.id = doregister(&id);
+        register_proc_id(&id);
         __cfaabi_tls.this_proc_id = &id;
 …
 EXIT:
         __cfaabi_dbg_print_safe( "Kernel : Preemption thread stopping\n" );
         unregister(&id);
+        // Shutdown path: give back the id claimed by register_proc_id() when this
+        // thread started, mirroring the unregister() call this line replaces.
+        unregister_proc_id(&id);
         return 0p;

libcfa/src/concurrency/ready_queue.cfa

-              r857a1c6
+              rc8a0210
 // #define __CFA_DEBUG_PRINT_READY_QUEUE__
-// #define USE_SNZI
 // #define USE_MPSC
+#define USE_RELAXED_FIFO
+// #define USE_WORK_STEALING
 #include "bits/defs.hfa"
 …
 #include <unistd.h>
-#include "snzi.hfa"
 #include "ready_subqueue.hfa"
 …
 #endif
+#define BIAS 4
+#if   defined(USE_RELAXED_FIFO)
+        #define BIAS 4
+        #define READYQ_SHARD_FACTOR 4
+        #define SEQUENTIAL_SHARD 1
+#elif defined(USE_WORK_STEALING)
+        #define READYQ_SHARD_FACTOR 2
+        #define SEQUENTIAL_SHARD 2
+#else
+        #error no scheduling strategy selected
+#endif
+static inline [unsigned, bool] idx_from_r(unsigned r, unsigned preferred);
+static inline struct $thread * try_pop(struct cluster * cltr, unsigned w);
+static inline struct $thread * try_pop(struct cluster * cltr, unsigned i, unsigned j);
+static inline struct $thread * search(struct cluster * cltr);
 // returns the maximum number of processors the RWLock support
 …
 //=======================================================================
 // Lock-Free registering/unregistering of threads
 unsigned doregister( struct __processor_id_t * proc ) with(*__scheduler_lock) {
+void register_proc_id( struct __processor_id_t * proc ) with(*__scheduler_lock) {
         __cfadbg_print_safe(ready_queue, "Kernel : Registering proc %p for RW-Lock\n", proc);
 …
                         /*paranoid*/ verify(0 == (__alignof__(data[i]) % cache_line_size));
                         /*paranoid*/ verify((((uintptr_t)&data[i]) % cache_line_size) == 0);
                         return i;
+                        proc->id = i;
+                }
+        }
 …
         /*paranoid*/ verify(__alignof__(data[n]) == (2 * cache_line_size));
         /*paranoid*/ verify((((uintptr_t)&data[n]) % cache_line_size) == 0);
         return n;
+}
 void unregister( struct __processor_id_t * proc ) with(*__scheduler_lock) {
+        proc->id = n;
+}
+void unregister_proc_id( struct __processor_id_t * proc ) with(*__scheduler_lock) {
         unsigned id = proc->id;
         /*paranoid*/ verify(id < ready);
 …
 //=======================================================================
 // Cforall Reqdy Queue used for scheduling
+// Cforall Ready Queue used for scheduling
 //=======================================================================
 // Constructor: the ready queue starts with no lanes
 // (null lane and timestamp arrays, zero count).
 void ?{}(__ready_queue_t & this) with (this) {
         lanes.data  = 0p;
+        lanes.tscs  = 0p;
         lanes.count = 0;
+}
 void ^?{}(__ready_queue_t & this) with (this) {
+        verify( 1 == lanes.count );
+        #ifdef USE_SNZI
+                verify( !query( snzi ) );
+        #endif
+        verify( SEQUENTIAL_SHARD == lanes.count );
         free(lanes.data);
+        free(lanes.tscs);
+}
 //-----------------------------------------------------------------------
+// Query the ready queue of a cluster.
+// When USE_SNZI is defined, forwards to query() on the cluster's ready-queue snzi;
+// otherwise always returns true.
+__attribute__((hot)) bool query(struct cluster * cltr) {
+        #ifdef USE_SNZI
+                return query(cltr->ready_queue.snzi);
+        #endif
+        return true;
+}
+static inline [unsigned, bool] idx_from_r(unsigned r, unsigned preferred) {
+        unsigned i;
+        bool local;
+        #if defined(BIAS)
+#if defined(USE_RELAXED_FIFO)
+        //-----------------------------------------------------------------------
+        // get index from random number with or without bias towards queues
+        static inline [unsigned, bool] idx_from_r(unsigned r, unsigned preferred) {
+                unsigned i;
+                bool local;
                 unsigned rlow  = r % BIAS;
                 unsigned rhigh = r / BIAS;
 …
                         // (BIAS - 1) out of BIAS chances
                         // Use preferred queues
                         i = preferred + (rhigh % 4);
+                        i = preferred + (rhigh % READYQ_SHARD_FACTOR);
                         local = true;
+                }
 …
                         local = false;
+                }
+        #else
+                i = r;
+                local = false;
+        #endif
+        return [i, local];
+}
+//-----------------------------------------------------------------------
+__attribute__((hot)) bool push(struct cluster * cltr, struct $thread * thrd) with (cltr->ready_queue) {
+        __cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr);
+        const bool external = (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr);
+        // write timestamp
+        thrd->link.ts = rdtscl();
+        bool first = false;
+        __attribute__((unused)) bool local;
+        __attribute__((unused)) int preferred;
+        #if defined(BIAS)
+                preferred =
+                        //*
+                        external ? -1 : kernelTLS().this_processor->cltr_id;
+                        /*/
+                        thrd->link.preferred * 4;
+                        //*/
+        #endif
+        // Try to pick a lane and lock it
+        unsigned i;
+        do {
+                // Pick the index of a lane
+                // unsigned r = __tls_rand();
+                unsigned r = __tls_rand_fwd();
+                [i, local] = idx_from_r(r, preferred);
+                i %= __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
+                return [i, local];
+        }
+        __attribute__((hot)) void push(struct cluster * cltr, struct $thread * thrd) with (cltr->ready_queue) {
+                __cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr);
+                const bool external = (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr);
+                /* paranoid */ verify(external || kernelTLS().this_processor->rdq.id < lanes.count );
+                // write timestamp
+                thrd->link.ts = rdtscl();
+                bool local;
+                int preferred = external ? -1 : kernelTLS().this_processor->rdq.id;
+                // Try to pick a lane and lock it
+                unsigned i;
+                do {
+                        // Pick the index of a lane
+                        unsigned r = __tls_rand_fwd();
+                        [i, local] = idx_from_r(r, preferred);
+                        i %= __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
+                        #if !defined(__CFA_NO_STATISTICS__)
+                                if(external) {
+                                        if(local) __atomic_fetch_add(&cltr->stats->ready.pick.ext.local, 1, __ATOMIC_RELAXED);
+                                        __atomic_fetch_add(&cltr->stats->ready.pick.ext.attempt, 1, __ATOMIC_RELAXED);
+                                }
+                                else {
+                                        if(local) __tls_stats()->ready.pick.push.local++;
+                                        __tls_stats()->ready.pick.push.attempt++;
+                                }
+                        #endif
+                #if defined(USE_MPSC)
+                        // mpsc always succeeds
+                } while( false );
+                #else
+                        // If we can't lock it retry
+                } while( !__atomic_try_acquire( &lanes.data[i].lock ) );
+                #endif
+                // Actually push it
+                push(lanes.data[i], thrd);
+                #if !defined(USE_MPSC)
+                        // Unlock and return
+                        __atomic_unlock( &lanes.data[i].lock );
+                #endif
+                // Mark the current index in the tls rng instance as having an item
+                __tls_rand_advance_bck();
+                __cfadbg_print_safe(ready_queue, "Kernel : Pushed %p on cluster %p (idx: %u, mask %llu, first %d)\n", thrd, cltr, i, used.mask[0], lane_first);
+                // Update statistics
                 #if !defined(__CFA_NO_STATISTICS__)
                         if(external) {
                                 if(local) __atomic_fetch_add(&cltr->stats->ready.pick.ext.local, 1, __ATOMIC_RELAXED);
                                 __atomic_fetch_add(&cltr->stats->ready.pick.ext.attempt, 1, __ATOMIC_RELAXED);
+                                if(local) __atomic_fetch_add(&cltr->stats->ready.pick.ext.lsuccess, 1, __ATOMIC_RELAXED);
+                                __atomic_fetch_add(&cltr->stats->ready.pick.ext.success, 1, __ATOMIC_RELAXED);
+                        }
                         else {
                                 if(local) __tls_stats()->ready.pick.push.local++;
                                 __tls_stats()->ready.pick.push.attempt++;
+                                if(local) __tls_stats()->ready.pick.push.lsuccess++;
+                                __tls_stats()->ready.pick.push.success++;
+                        }
                 #endif
+        #if defined(USE_MPSC)
+                // mpsc always succeeds
+        } while( false );
+        #else
+                // If we can't lock it retry
+        } while( !__atomic_try_acquire( &lanes.data[i].lock ) );
+        #endif
+        // Actually push it
+        #ifdef USE_SNZI
+                bool lane_first =
+        #endif
+        push(lanes.data[i], thrd);
+        #ifdef USE_SNZI
+                // If this lane used to be empty we need to do more
+                if(lane_first) {
+                        // Check if the entire queue used to be empty
+                        first = !query(snzi);
+                        // Update the snzi
+                        arrive( snzi, i );
+                }
+        #endif
+        #if !defined(USE_MPSC)
+                // Unlock and return
+                __atomic_unlock( &lanes.data[i].lock );
+        #endif
+        // Mark the current index in the tls rng instance as having an item
+        __tls_rand_advance_bck();
+        __cfadbg_print_safe(ready_queue, "Kernel : Pushed %p on cluster %p (idx: %u, mask %llu, first %d)\n", thrd, cltr, i, used.mask[0], lane_first);
+        }
+        // Pop from the ready queue from a given cluster
         // Fast path: makes at most 25 attempts. Each attempt draws two values from the
         // thread-local backward generator, maps each to a (lane index, local flag) pair
         // with idx_from_r (passing this processor's rdq.id as the preferred value) and
         // asks try_pop to pop from one of the two lanes.
         // Returns the popped thread, or 0p if all 25 attempts failed.
+        __attribute__((hot)) $thread * pop_fast(struct cluster * cltr) with (cltr->ready_queue) {
+                /* paranoid */ verify( lanes.count > 0 );
+                /* paranoid */ verify( kernelTLS().this_processor );
+                /* paranoid */ verify( kernelTLS().this_processor->rdq.id < lanes.count );
                 // Read the lane count once; every index below is reduced modulo this snapshot
+                unsigned count = __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
+                int preferred = kernelTLS().this_processor->rdq.id;
+                // Make a bounded number of attempts at finding a lane that isn't empty and popping from it
+                for(25) {
+                        // Pick two lists at random
+                        unsigned ri = __tls_rand_bck();
+                        unsigned rj = __tls_rand_bck();
+                        unsigned i, j;
+                        __attribute__((unused)) bool locali, localj;
+                        [i, locali] = idx_from_r(ri, preferred);
+                        [j, localj] = idx_from_r(rj, preferred);
+                        #if !defined(__CFA_NO_STATISTICS__)
                                 // Count the attempts where both picks were flagged local
+                                if(locali && localj) {
+                                        __tls_stats()->ready.pick.pop.local++;
+                                }
+                        #endif
+                        i %= count;
+                        j %= count;
+                        // try popping from the 2 picked lists
+                        struct $thread * thrd = try_pop(cltr, i, j);
+                        if(thrd) {
+                                #if !defined(__CFA_NO_STATISTICS__)
                                         // Count successful pops where at least one pick was flagged local
+                                        if( locali || localj ) __tls_stats()->ready.pick.pop.lsuccess++;
+                                #endif
+                                return thrd;
+                        }
+                }
+                // Every attempt failed (lanes looked empty or were locked), return 0p
+                return 0p;
+        }
         // Slow-path pop: simply delegates to search(cltr) and returns its result.
+        __attribute__((hot)) struct $thread * pop_slow(struct cluster * cltr) {
+                return search(cltr);
+        }
+#endif
+#if defined(USE_WORK_STEALING)
         // Push thrd onto one lane of cltr's ready queue (work-stealing variant).
         // The push is "external" when the caller has no current processor or its
         // processor belongs to a different cluster. External pushes pick a random
         // lane; local pushes rotate over the READYQ_SHARD_FACTOR lanes starting at
         // the processor's rdq.id.
+        __attribute__((hot)) void push(struct cluster * cltr, struct $thread * thrd) with (cltr->ready_queue) {
+                __cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr);
+                const bool external = (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr);
+                /* paranoid */ verify(external || kernelTLS().this_processor->rdq.id < lanes.count );
+                // write timestamp
+                thrd->link.ts = rdtscl();
+                // Try to pick a lane and lock it
+                unsigned i;
+                do {
+                        if(unlikely(external)) {
                                 // External push: any lane of the cluster
+                                i = __tls_rand() % lanes.count;
+                        }
+                        else {
                                 // Local push: rdq.its is a per-processor counter, advanced on every
                                 // pick, used to cycle through this processor's lanes
+                                processor * proc = kernelTLS().this_processor;
+                                unsigned r = proc->rdq.its++;
+                                i =  proc->rdq.id + (r % READYQ_SHARD_FACTOR);
+                        }
                 // The loop condition depends on the build: with USE_MPSC the body runs once and
                 // no lane lock is taken; otherwise a new lane is picked until its lock is acquired
+                #if defined(USE_MPSC)
+                        // mpsc always succeeds
+                } while( false );
+                #else
+                        // If we can't lock it retry
+                } while( !__atomic_try_acquire( &lanes.data[i].lock ) );
+                #endif
+                // Actually push it
+                push(lanes.data[i], thrd);
+                #if !defined(USE_MPSC)
+                        // Unlock and return
+                        __atomic_unlock( &lanes.data[i].lock );
+                #endif
                 // NOTE(review): `used.mask[0]` and `lane_first` are not declared anywhere in this
                 // function; this only compiles if the debug macro discards its arguments or the
                 // names resolve elsewhere - TODO confirm, or trim the message to thrd/cltr/i
+                __cfadbg_print_safe(ready_queue, "Kernel : Pushed %p on cluster %p (idx: %u, mask %llu, first %d)\n", thrd, cltr, i, used.mask[0], lane_first);
+        }
+        // Pop from the ready queue from a given cluster (work-stealing fast path).
+        // The processor alternates between two states, keyed on rdq.target:
+        //  - target == -1u : choose a random lane as the next steal target and record
+        //    as cutoff the older timestamp of two of this processor's own lanes;
+        //  - otherwise     : if the target lane's published timestamp is older than the
+        //    cutoff, try to pop from it, then clear the target.
+        // In both cases, if nothing was returned, the processor's own
+        // READYQ_SHARD_FACTOR lanes are tried in turn.
+        // Returns the popped thread, or 0p if nothing could be popped.
+        __attribute__((hot)) $thread * pop_fast(struct cluster * cltr) with (cltr->ready_queue) {
+                /* paranoid */ verify( lanes.count > 0 );
+                /* paranoid */ verify( kernelTLS().this_processor );
+                /* paranoid */ verify( kernelTLS().this_processor->rdq.id < lanes.count );
+                processor * proc = kernelTLS().this_processor;
+                if(proc->rdq.target == -1u) {
+                        proc->rdq.target = __tls_rand() % lanes.count;
+                        // Look at two consecutive positions of the local rotation
+                        unsigned it1  = proc->rdq.itr;
+                        unsigned it2  = proc->rdq.itr + 1;
+                        unsigned idx1 = proc->rdq.id + (it1 % READYQ_SHARD_FACTOR);
+                        // Fix: this index was derived from it1, which made idx2 == idx1,
+                        // so both timestamps came from the same lane and min() was a no-op
+                        unsigned idx2 = proc->rdq.id + (it2 % READYQ_SHARD_FACTOR);
+                        unsigned long long tsc1 = ts(lanes.data[idx1]);
+                        unsigned long long tsc2 = ts(lanes.data[idx2]);
+                        proc->rdq.cutoff = min(tsc1, tsc2);
+                }
+                else if(lanes.tscs[proc->rdq.target].tv < proc->rdq.cutoff) {
+                        // The target lane holds older work than our own lanes did: try to steal from it
+                        $thread * t = try_pop(cltr, proc->rdq.target);
+                        // The target is consumed whether or not the steal succeeded
+                        proc->rdq.target = -1u;
+                        if(t) return t;
+                }
+                // Try each of this processor's own lanes, walking the rotation backwards
+                for(READYQ_SHARD_FACTOR) {
+                        unsigned i = proc->rdq.id + (--proc->rdq.itr % READYQ_SHARD_FACTOR);
+                        if($thread * t = try_pop(cltr, i)) return t;
+                }
+                return 0p;
+        }
+        // Slow-path pop (work stealing): probe up to 25 randomly chosen lanes of the
+        // cluster, returning the first thread popped; if none yields a thread, return
+        // the result of search(cltr).
+        __attribute__((hot)) struct $thread * pop_slow(struct cluster * cltr) with (cltr->ready_queue) {
+                for(25) {
+                        unsigned lane_idx = __tls_rand() % lanes.count;
+                        if($thread * found = try_pop(cltr, lane_idx)) return found;
+                }
+                // Random probing found nothing
+                return search(cltr);
+        }
+#endif
+//=======================================================================
+// Various Ready Queue utilities
+//=======================================================================
+// these functions work the same or almost the same
+// whether they are using work-stealing or relaxed fifo scheduling
+//-----------------------------------------------------------------------
+// try to pop from a lane given by index w
+static inline struct $thread * try_pop(struct cluster * cltr, unsigned w) with (cltr->ready_queue) {
+        // Get relevant elements locally
+        __intrusive_lane_t & lane = lanes.data[w];
+        // If list looks empty retry
+        if( is_empty(lane) ) return 0p;
+        // If we can't get the lock retry
+        if( !__atomic_try_acquire(&lane.lock) ) return 0p;
+        // If list is empty, unlock and retry
+        if( is_empty(lane) ) {
+                __atomic_unlock(&lane.lock);
+                return 0p;
+        }
+        // Actually pop the list
+        struct $thread * thrd;
+        thrd = pop(lane);
+        /* paranoid */ verify(thrd);
+        /* paranoid */ verify(lane.lock);
+        // Unlock and return
+        __atomic_unlock(&lane.lock);
         // Update statistics
         #if !defined(__CFA_NO_STATISTICS__)
+                if(external) {
+                        if(local) __atomic_fetch_add(&cltr->stats->ready.pick.ext.lsuccess, 1, __ATOMIC_RELAXED);
+                        __atomic_fetch_add(&cltr->stats->ready.pick.ext.success, 1, __ATOMIC_RELAXED);
+                }
+                else {
+                        if(local) __tls_stats()->ready.pick.push.lsuccess++;
+                        __tls_stats()->ready.pick.push.success++;
+                }
+                __tls_stats()->ready.pick.pop.success++;
         #endif
+        // return whether or not the list was empty before this push
+        return first;
+}
+static struct $thread * try_pop(struct cluster * cltr, unsigned i, unsigned j);
+static struct $thread * try_pop(struct cluster * cltr, unsigned i);
+// Pop from the ready queue from a given cluster
+__attribute__((hot)) $thread * pop(struct cluster * cltr) with (cltr->ready_queue) {
+        /* paranoid */ verify( lanes.count > 0 );
+        unsigned count = __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
+        int preferred;
+        #if defined(BIAS)
+                // Don't bother trying locally too much
+                preferred = kernelTLS().this_processor->cltr_id;
+        #if defined(USE_WORK_STEALING)
+                lanes.tscs[w].tv = thrd->link.ts;
         #endif
+        // As long as the list is not empty, try finding a lane that isn't empty and pop from it
+        #ifdef USE_SNZI
+                while( query(snzi) ) {
+        #else
+                for(25) {
+        #endif
+                // Pick two lists at random
+                // unsigned ri = __tls_rand();
+                // unsigned rj = __tls_rand();
+                unsigned ri = __tls_rand_bck();
+                unsigned rj = __tls_rand_bck();
+                unsigned i, j;
+                __attribute__((unused)) bool locali, localj;
+                [i, locali] = idx_from_r(ri, preferred);
+                [j, localj] = idx_from_r(rj, preferred);
+                #if !defined(__CFA_NO_STATISTICS__)
+                        if(locali && localj) {
+                                __tls_stats()->ready.pick.pop.local++;
+                        }
+                #endif
+                i %= count;
+                j %= count;
+                // try popping from the 2 picked lists
+                struct $thread * thrd = try_pop(cltr, i, j);
+                if(thrd) {
+                        #if defined(BIAS) && !defined(__CFA_NO_STATISTICS__)
+                                if( locali || localj ) __tls_stats()->ready.pick.pop.lsuccess++;
+                        #endif
+                        return thrd;
+                }
+        }
+        // All lanes where empty return 0p
+        return 0p;
+}
+__attribute__((hot)) struct $thread * pop_slow(struct cluster * cltr) with (cltr->ready_queue) {
+        // return the popped thread
+        return thrd;
+}
+//-----------------------------------------------------------------------
+// try to pop from any lane, making sure you don't miss any thread pushed
+// before the start of the function
+static inline struct $thread * search(struct cluster * cltr) with (cltr->ready_queue) {
         /* paranoid */ verify( lanes.count > 0 );
         unsigned count = __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
 …
+}
 //-----------------------------------------------------------------------
+// Given 2 lane indexes, choose one lane and try to pop from it.
+// If lane j looks empty, lane i is used; otherwise the lane whose ts() is
+// smaller is used (j on a tie). Counts one pop attempt in the statistics.
+// Returns whatever the single-lane try_pop returns for the chosen lane.
+static inline struct $thread * try_pop(struct cluster * cltr, unsigned i, unsigned j) with (cltr->ready_queue) {
+        #if !defined(__CFA_NO_STATISTICS__)
+                __tls_stats()->ready.pick.pop.attempt++;
+        #endif
+        // Unlikely case: nothing to compare against, fall back on lane i
+        if( __builtin_expect(is_empty(lanes.data[j]), false) ) return try_pop(cltr, i);
+        // Both timestamps are read only once lane j is known to be non-empty
+        const bool i_is_older = ts(lanes.data[i]) < ts(lanes.data[j]);
+        return try_pop(cltr, i_is_older ? i : j);
+}
 // Try to pop one thread from lane w of the cluster's ready queue.
 // Returns 0p without blocking if the lane looks empty or its lock cannot be
 // acquired; otherwise pops under the lane lock and returns the thread.
+static inline struct $thread * try_pop(struct cluster * cltr, unsigned w) with (cltr->ready_queue) {
+        // Get relevant elements locally
+        __intrusive_lane_t & lane = lanes.data[w];
+        // If list looks empty, give up (the caller decides whether to retry)
+        if( is_empty(lane) ) return 0p;
+        // If we can't get the lock, give up rather than wait
+        if( !__atomic_try_acquire(&lane.lock) ) return 0p;
+        // Re-check under the lock: if list is empty, unlock and give up
+        if( is_empty(lane) ) {
+                __atomic_unlock(&lane.lock);
+                return 0p;
+        }
+        // Actually pop the list
+        struct $thread * thrd;
+        thrd = pop(lane);
+        /* paranoid */ verify(thrd);
+        /* paranoid */ verify(lane.lock);
+        #ifdef USE_SNZI
                 // NOTE(review): `emptied` is not declared in this block; as shown this only
                 // compiles when USE_SNZI is undefined - TODO confirm against the full file
+                // If this was the last element in the lane
+                if(emptied) {
+                        depart( snzi, w );
+                }
+        #endif
+        // Unlock and return
+        __atomic_unlock(&lane.lock);
+        // Update statistics
+        #if !defined(__CFA_NO_STATISTICS__)
+                __tls_stats()->ready.pick.pop.success++;
+        #endif
+        // Update the thread bias (records lane index / 4 on the thread)
+        thrd->link.preferred = w / 4;
+        // return the popped thread
+        return thrd;
+}
+//-----------------------------------------------------------------------
 // Remove thrd from the cluster's ready queue, but only if it is currently the
 // first thread of one of the lanes. Lanes are visited in index order, each
 // under its lock. Returns true if thrd was popped, false if no lane had it at
 // its head.
+bool remove_head(struct cluster * cltr, struct $thread * thrd) with (cltr->ready_queue) {
+        for(i; lanes.count) {
+                __intrusive_lane_t & lane = lanes.data[i];
+                bool removed = false;
                 // Blocking acquire, unlike try_pop which gives up on contention
+                __atomic_acquire(&lane.lock);
                         // thrd is first in this lane when it directly follows the lane's head node
+                        if(head(lane)->link.next == thrd) {
+                                $thread * pthrd;
+                                pthrd = pop(lane);
+                                /* paranoid */ verify( pthrd == thrd );
+                                removed = true;
+                                #ifdef USE_SNZI
                                         // NOTE(review): `emptied` is not declared in this block; as shown this
                                         // only compiles when USE_SNZI is undefined - TODO confirm
+                                        if(emptied) {
+                                                depart( snzi, i );
+                                        }
+                                #endif
+                        }
+                __atomic_unlock(&lane.lock);
                 // Report success only after the lane lock has been released
+                if( removed ) return true;
+        }
+        return false;
+}
+//-----------------------------------------------------------------------
+// Check that all the intrusive queues in the data structure are still consistent
 static void check( __ready_queue_t & q ) with (q) {
         #if defined(__CFA_WITH_VERIFY__) && !defined(USE_MPSC)
 …
+}
+//-----------------------------------------------------------------------
+// Given 2 lane indexes, choose one lane and try to pop from it.
+// If lane j looks empty, lane i is used; otherwise the lane whose ts() is
+// smaller is used (j on a tie). Counts one pop attempt in the statistics.
+// Returns whatever the single-lane try_pop returns for the chosen lane.
+static inline struct $thread * try_pop(struct cluster * cltr, unsigned i, unsigned j) with (cltr->ready_queue) {
+        #if !defined(__CFA_NO_STATISTICS__)
+                __tls_stats()->ready.pick.pop.attempt++;
+        #endif
+        // Unlikely case: nothing to compare against, fall back on lane i
+        if( __builtin_expect(is_empty(lanes.data[j]), false) ) return try_pop(cltr, i);
+        // Both timestamps are read only once lane j is known to be non-empty
+        const bool i_is_older = ts(lanes.data[i]) < ts(lanes.data[j]);
+        return try_pop(cltr, i_is_older ? i : j);
+}
 // Call this function if the intrusive list was moved using memcpy
 // fixes the list so that the pointers back to anchors aren't left dangling
 …
+}
+// Give each of the first `count` processors of `list` its ready-queue id.
+// Ids are handed out starting at `value`, READYQ_SHARD_FACTOR apart; `value` is
+// updated in place so a following call continues where this one stopped.
+// Every visited processor also has its steal target reset to -1u.
+static void assign_list(unsigned & value, dlist(processor, processor) & list, unsigned count) {
+        unsigned idx = 0;
+        for(processor * proc = &list`first; idx < count; proc = &(*proc)`next) {
+                /* paranoid */ verifyf( proc, "Unexpected null iterator, at index %u of %u\n", idx, count);
+                proc->rdq.id = value;
+                proc->rdq.target = -1u;
+                value += READYQ_SHARD_FACTOR;
+                idx += 1;
+        }
+}
+// Renumber the ready-queue ids of all processors of the cluster, starting at 0:
+// processors on the actives list first, then those on the idles list.
+static void reassign_cltr_id(struct cluster * cltr) {
+        // Running id, advanced by assign_list as ids are handed out
+        unsigned next_id = 0;
+        // Active processors: total minus idle
+        assign_list(next_id, cltr->procs.actives, cltr->procs.total - cltr->procs.idle);
+        // Idle processors continue the numbering
+        assign_list(next_id, cltr->procs.idles, cltr->procs.idle);
+}
 // Resize the per-lane timestamp array to the current lane count and refill it
 // from each lane's ts(). Does nothing unless USE_WORK_STEALING is defined.
+static void fix_times( struct cluster * cltr ) with( cltr->ready_queue ) {
+        #if defined(USE_WORK_STEALING)
                 // Reallocate the existing array so it has lanes.count entries
+                lanes.tscs = alloc(lanes.count, lanes.tscs`realloc);
                 // Every entry is overwritten, so the previous contents do not matter
+                for(i; lanes.count) {
+                        lanes.tscs[i].tv = ts(lanes.data[i]);
+                }
+        #endif
+}
 // Grow the ready queue
+unsigned ready_queue_grow(struct cluster * cltr, int target) {
+        unsigned preferred;
+void ready_queue_grow(struct cluster * cltr) {
         size_t ncount;
+        int target = cltr->procs.total;
         /* paranoid */ verify( ready_mutate_islocked() );
 …
         // grow the ready queue
         with( cltr->ready_queue ) {
-                #ifdef USE_SNZI
-                        ^(snzi){};
-                #endif
                 // Find new count
                 // Make sure we always have atleast 1 list
                 if(target >= 2) {
+                        ncount = target * 4;
+                        preferred = ncount - 4;
+                        ncount = target * READYQ_SHARD_FACTOR;
                 } else {
+                        ncount = 1;
+                        preferred = 0;
+                        ncount = SEQUENTIAL_SHARD;
+                }
 …
                 // Update original
                 lanes.count = ncount;
+                #ifdef USE_SNZI
+                        // Re-create the snzi
+                        snzi{ log2( lanes.count / 8 ) };
+                        for( idx; (size_t)lanes.count ) {
+                                if( !is_empty(lanes.data[idx]) ) {
+                                        arrive(snzi, idx);
+                                }
+                        }
+                #endif
+        }
+        }
+        fix_times(cltr);
+        reassign_cltr_id(cltr);
         // Make sure that everything is consistent
 …
         /* paranoid */ verify( ready_mutate_islocked() );
-        return preferred;
+}
 // Shrink the ready queue
 void ready_queue_shrink(struct cluster * cltr, int target) {
+void ready_queue_shrink(struct cluster * cltr) {
         /* paranoid */ verify( ready_mutate_islocked() );
         __cfadbg_print_safe(ready_queue, "Kernel : Shrinking ready queue\n");
 …
         /* paranoid */ check( cltr->ready_queue );
+        int target = cltr->procs.total;
         with( cltr->ready_queue ) {
-                #ifdef USE_SNZI
-                        ^(snzi){};
-                #endif
                 // Remember old count
                 size_t ocount = lanes.count;
 …
                 // Find new count
                 // Make sure we always have atleast 1 list
                 lanes.count = target >= 2 ? target * 4: 1;
+                lanes.count = target >= 2 ? target * READYQ_SHARD_FACTOR: SEQUENTIAL_SHARD;
                 /* paranoid */ verify( ocount >= lanes.count );
                 /* paranoid */ verify( lanes.count == target * 4 || target < 2 );
+                /* paranoid */ verify( lanes.count == target * READYQ_SHARD_FACTOR || target < 2 );
                 // for printing count the number of displaced threads
 …
                         fix(lanes.data[idx]);
+                }
+                #ifdef USE_SNZI
+                        // Re-create the snzi
+                        snzi{ log2( lanes.count / 8 ) };
+                        for( idx; (size_t)lanes.count ) {
+                                if( !is_empty(lanes.data[idx]) ) {
+                                        arrive(snzi, idx);
+                                }
+                        }
+                #endif
+        }
+        }
+        fix_times(cltr);
+        reassign_cltr_id(cltr);
         // Make sure that everything is consistent

libcfa/src/concurrency/ready_subqueue.hfa

-              r857a1c6
+              rc8a0210
         #endif
+}
+// Aligned timestamps which are used by the relaxed ready queue
 // Each timestamp is aligned to 128 bytes - presumably so that neighbouring
 // entries never share a cache line; TODO confirm the intent.
+struct __attribute__((aligned(128))) __timestamp_t {
         // The timestamp value; volatile, so every access is a real load or store
+        volatile unsigned long long tv;
+};
 // Constructor: a new timestamp starts at 0
+void  ?{}(__timestamp_t & this) { this.tv = 0; }
 // Destructor: nothing to release
+void ^?{}(__timestamp_t & this) {}

libcfa/src/concurrency/stats.cfa

-              r857a1c6
+              rc8a0210
 #include <inttypes.h>
 #include "bits/debug.hfa"
+#include "bits/locks.hfa"
 #include "stats.hfa"
 …
                         stats->io.calls.errors.busy = 0;
                         stats->io.poller.sleeps     = 0;
+                #endif
+                #if defined(CFA_STATS_ARRAY)
+                        stats->array.values = alloc(CFA_STATS_ARRAY);
+                        stats->array.cnt = 0;
                 #endif
+        }
 …
                 #endif
+        }
+        #if defined(CFA_STATS_ARRAY)
+                extern "C" {
+                        #include <stdio.h>
+                        #include <errno.h>
+                        #include <sys/stat.h>
+                        #include <fcntl.h>
+                }
+                // Append every buffered sample of `this` to the file
+                // .cfadata/<name><handle>.data, one "<ts>, <value>" line per sample,
+                // then reset the buffer count to 0.
+                // The .cfadata directory is created if needed. Aborts if the directory
+                // or the file cannot be created, or if a sample cannot be written.
+                void __flush_stat( struct __stats_t * this, const char * name, void * handle) {
+                        int ret = mkdir(".cfadata", 0755);
+                        if(ret < 0 && errno != EEXIST) abort("Failed to create directory .cfadata: %d\n", errno);
+                        char filename[100];
+                        snprintf(filename, sizeof(filename), ".cfadata/%s%p.data", name, handle);
+                        int fd = open(filename, O_WRONLY | O_APPEND | O_CREAT, 0644);
+                        if(fd < 0) abort("Failed to create file %s: %d\n", filename, errno);
+                        for(i; this->array.cnt) {
+                                char line[100];
+                                // Cast so each argument has exactly the type its conversion specifier expects
+                                int len = snprintf(line, sizeof(line), "%llu, %lld\n",
+                                        (unsigned long long)this->array.values[i].ts,
+                                        (long long)this->array.values[i].value);
+                                // Never hand write() a length that snprintf did not actually produce
+                                if(len < 0 || (size_t)len >= sizeof(line)) abort("Failed to format sample for file %s\n", filename);
+                                // write() may be interrupted or write only part of the line: loop until it is all out
+                                const char * cur = line;
+                                size_t left = (size_t)len;
+                                while(left > 0) {
+                                        ssize_t done = write(fd, cur, left);
+                                        if(done < 0) {
+                                                if(errno == EINTR) continue;
+                                                abort("Failed to write to file %s: %d\n", filename, errno);
+                                        }
+                                        cur  += done;
+                                        left -= (size_t)done;
+                                }
+                        }
+                        this->array.cnt = 0;
+                        close(fd);
+                }
+                static __spinlock_t stats_lock;
+                // Record one (timestamp, value) sample in the stats array of `this`.
+                // If the array already holds CFA_STATS_ARRAY samples it is first flushed
+                // to disk with __flush_stat( this, name, handle ).
+                // When `external` is true the whole operation runs under stats_lock;
+                // otherwise no lock is taken (presumably the caller is then the only
+                // writer of `this` - TODO confirm against the callers).
+                void __push_stat( struct __stats_t * this, int64_t value, bool external, const char * name, void * handle ) {
+                        if(external) lock(stats_lock __cfaabi_dbg_ctx2);
+                        if( this->array.cnt >= CFA_STATS_ARRAY ) __flush_stat( this, name, handle );
+                        size_t idx = this->array.cnt;
+                        this->array.cnt++;
+                        // Fill the slot before releasing the lock: once cnt covers this slot,
+                        // another external pusher may flush it or, after a flush, reuse it
+                        this->array.values[idx].ts = rdtscl();
+                        this->array.values[idx].value = value;
+                        if(external) unlock(stats_lock);
+                }
+        #endif
 #endif

libcfa/src/concurrency/stats.hfa

-              r857a1c6
+              rc8a0210
 #pragma once
+// #define CFA_STATS_ARRAY 10000
 #include <stdint.h>
 …
         #endif
         // One sample of the optional stats array (only built when CFA_STATS_ARRAY is defined)
+        #if defined(CFA_STATS_ARRAY)
+                struct __stats_elem_t {
                         // Time of the sample; filled from rdtscl() by __push_stat
+                        long long int ts;
                         // The sampled value passed to __push_stat
+                        int64_t value;
+                };
+        #endif
         struct __attribute__((aligned(128))) __stats_t {
                 __stats_readQ_t ready;
 …
                         __stats_io_t    io;
                 #endif
+                #if defined(CFA_STATS_ARRAY)
+                        struct {
+                                __stats_elem_t * values;
+                                volatile size_t cnt;
+                        } array;
+                #endif
         };
 …
         void __tally_stats( struct __stats_t *, struct __stats_t * );
         void __print_stats( struct __stats_t *, int, const char *, const char *, void * );
         // Sample-array interface. With CFA_STATS_ARRAY defined these are real
         // functions (declared here); otherwise they are empty inline stubs, so call
         // sites do not need their own #if.
+        #if defined(CFA_STATS_ARRAY)
                 // Record one sample; `external` selects whether the shared stats lock is taken
+                void __push_stat ( struct __stats_t *, int64_t value, bool external, const char * name, void * handle);
                 // Write all buffered samples to disk and empty the buffer
+                void __flush_stat( struct __stats_t *, const char *, void * );
+        #else
+                static inline void __push_stat ( struct __stats_t *, int64_t, bool, const char *, void * ) {}
+                static inline void __flush_stat( struct __stats_t *, const char *, void * ) {}
+        #endif
 #endif

libcfa/src/concurrency/thread.cfa

-              r857a1c6
+              rc8a0210
         link.next = 0p;
         link.prev = 0p;
-        link.preferred = -1;
         #if defined( __CFA_WITH_VERIFY__ )
                 canary = 0x0D15EA5E0D15EA5Ep;
 …
+}
+FORALL_DATA_INSTANCE(ThreadCancelled, (thread_t &), (thread_t))
+EHM_VIRTUAL_TABLE(SomeThreadCancelled, std_thread_cancelled);
 forall(T &)
 …
 forall(T &)
 const char * msg(ThreadCancelled(T) *) {
         return "ThreadCancelled";
+        return "ThreadCancelled(...)";
+}
 forall(T &)
 static void default_thread_cancel_handler(ThreadCancelled(T) & ) {
+        // Improve this error message, can I do formatting?
         abort( "Unhandled thread cancellation.\n" );
+}
+forall(T & | is_thread(T) | IS_EXCEPTION(ThreadCancelled, (T)))
+static void default_thread_cancel_handler(SomeThreadCancelled & ) {
+        // Improve this error message, can I do formatting?
+        abort( "Unhandled thread cancellation.\n" );
+}
+forall(T & | is_thread(T) | IS_EXCEPTION(SomeThreadCancelled))
 void ?{}( thread_dtor_guard_t & this,
                 T & thrd, void(*cancelHandler)(ThreadCancelled(T) &)) {
         $monitor * m = get_monitor(thrd);
+                T & thrd, void(*cancelHandler)(SomeThreadCancelled &)) {
+        $monitor * m = get_monitor(thrd);
         $thread * desc = get_thread(thrd);
         // Setup the monitor guard
         void (*dtor)(T& mutex this) = ^?{};
         bool join = cancelHandler != (void(*)(ThreadCancelled(T)&))0;
+        bool join = cancelHandler != (void(*)(SomeThreadCancelled&))0;
         (this.mg){&m, (void(*)())dtor, join};
 …
+        }
         desc->state = Cancelled;
         void(*defaultResumptionHandler)(ThreadCancelled(T) &) =
+        void(*defaultResumptionHandler)(SomeThreadCancelled &) =
                 join ? cancelHandler : default_thread_cancel_handler;
-        ThreadCancelled(T) except;
         // TODO: Remove explitate vtable set once trac#186 is fixed.
+        except.virtual_table = &get_exception_vtable(&except);
+        SomeThreadCancelled except;
+        except.virtual_table = &std_thread_cancelled;
         except.the_thread = &thrd;
         except.the_exception = __cfaehm_cancellation_exception( cancellation );
+        throwResume except;
+        // Why is this cast required?
+        throwResume (SomeThreadCancelled &)except;
         except.the_exception->virtual_table->free( except.the_exception );
 …
 //-----------------------------------------------------------------------------
 forall(T & | is_thread(T) | IS_RESUMPTION_EXCEPTION(ThreadCancelled, (T)))
+forall(T & | is_thread(T) | IS_RESUMPTION_EXCEPTION(SomeThreadCancelled))
 T & join( T & this ) {
         thread_dtor_guard_t guard = { this, defaultResumptionHandler };

libcfa/src/concurrency/thread.hfa

-              r857a1c6
+              rc8a0210
 };
+FORALL_DATA_EXCEPTION(ThreadCancelled, (thread_t &), (thread_t)) (
+EHM_EXCEPTION(SomeThreadCancelled) (
+        void * the_thread;
+        exception_t * the_exception;
+);
+EHM_EXTERN_VTABLE(SomeThreadCancelled, std_thread_cancelled);
+EHM_FORALL_EXCEPTION(ThreadCancelled, (thread_t &), (thread_t)) (
         thread_t * the_thread;
         exception_t * the_exception;
 …
 };
 forall( T & | is_thread(T) | IS_EXCEPTION(ThreadCancelled, (T)) )
 void ?{}( thread_dtor_guard_t & this, T & thrd, void(*)(ThreadCancelled(T) &) );
+forall( T & | is_thread(T) | IS_EXCEPTION(SomeThreadCancelled) )
+void ?{}( thread_dtor_guard_t & this, T & thrd, void(*)(SomeThreadCancelled &) );
 void ^?{}( thread_dtor_guard_t & this );
 …
 //----------
 // join
 forall( T & | is_thread(T) | IS_RESUMPTION_EXCEPTION(ThreadCancelled, (T)) )
+forall( T & | is_thread(T) | IS_RESUMPTION_EXCEPTION(SomeThreadCancelled) )
 T & join( T & this );

Context Navigation

Legend:

libcfa/src/concurrency/coroutine.cfa

libcfa/src/concurrency/coroutine.hfa

libcfa/src/concurrency/invoke.h

libcfa/src/concurrency/io/call.cfa.in

libcfa/src/concurrency/kernel.cfa

libcfa/src/concurrency/kernel.hfa

libcfa/src/concurrency/kernel/startup.cfa

libcfa/src/concurrency/kernel_private.hfa

libcfa/src/concurrency/preemption.cfa

libcfa/src/concurrency/ready_queue.cfa

libcfa/src/concurrency/ready_subqueue.hfa

libcfa/src/concurrency/stats.cfa

libcfa/src/concurrency/stats.hfa

libcfa/src/concurrency/thread.cfa

libcfa/src/concurrency/thread.hfa

Download in other formats: