Changes in / [2cbfe92:b7d6a36]


Files: 1 added, 15 edited

Legend:

      unmodified
    - removed
    + added
  • libcfa/src/Makefile.am

    r2cbfe92 → rb7d6a36

      thread_headers_nosrc = concurrency/invoke.h
      thread_headers = concurrency/coroutine.hfa concurrency/thread.hfa concurrency/kernel.hfa concurrency/monitor.hfa concurrency/mutex.hfa
    - thread_libsrc = concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa concurrency/invoke.c concurrency/preemption.cfa ${thread_headers:.hfa=.cfa}
    + thread_libsrc = concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa concurrency/invoke.c concurrency/preemption.cfa concurrency/ready_queue.cfa ${thread_headers:.hfa=.cfa}
      else
      headers =
  • libcfa/src/Makefile.in

    r2cbfe92 → rb7d6a36

              concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa \
              concurrency/invoke.c concurrency/preemption.cfa \
    -         concurrency/coroutine.cfa concurrency/thread.cfa \
    -         concurrency/kernel.cfa concurrency/monitor.cfa \
    -         concurrency/mutex.cfa
    +         concurrency/ready_queue.cfa concurrency/coroutine.cfa \
    +         concurrency/thread.cfa concurrency/kernel.cfa \
    +         concurrency/monitor.cfa concurrency/mutex.cfa
      @BUILDLIB_TRUE@am__objects_3 = concurrency/coroutine.lo \
      @BUILDLIB_TRUE@ concurrency/thread.lo concurrency/kernel.lo \
    …
      @BUILDLIB_TRUE@ concurrency/CtxSwitch-@ARCHITECTURE@.lo \
      @BUILDLIB_TRUE@ concurrency/alarm.lo concurrency/invoke.lo \
    - @BUILDLIB_TRUE@ concurrency/preemption.lo $(am__objects_3)
    + @BUILDLIB_TRUE@ concurrency/preemption.lo \
    + @BUILDLIB_TRUE@ concurrency/ready_queue.lo $(am__objects_3)
      am_libcfathread_la_OBJECTS = $(am__objects_4)
      libcfathread_la_OBJECTS = $(am_libcfathread_la_OBJECTS)
    …
      @BUILDLIB_FALSE@thread_headers =
      @BUILDLIB_TRUE@thread_headers = concurrency/coroutine.hfa concurrency/thread.hfa concurrency/kernel.hfa concurrency/monitor.hfa concurrency/mutex.hfa
    - @BUILDLIB_TRUE@thread_libsrc = concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa concurrency/invoke.c concurrency/preemption.cfa ${thread_headers:.hfa=.cfa}
    + @BUILDLIB_TRUE@thread_libsrc = concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa concurrency/invoke.c concurrency/preemption.cfa concurrency/ready_queue.cfa ${thread_headers:.hfa=.cfa}

      #----------------------------------------------------------------------------------------------------------------
    …
              concurrency/$(DEPDIR)/$(am__dirstamp)
      concurrency/preemption.lo: concurrency/$(am__dirstamp) \
    +         concurrency/$(DEPDIR)/$(am__dirstamp)
    + concurrency/ready_queue.lo: concurrency/$(am__dirstamp) \
              concurrency/$(DEPDIR)/$(am__dirstamp)
      concurrency/coroutine.lo: concurrency/$(am__dirstamp) \
  • libcfa/src/bits/debug.hfa

    r2cbfe92 → rb7d6a36

              #include <stdarg.h>
              #include <stdio.h>
    +         #include <unistd.h>

              extern void __cfaabi_bits_write( int fd, const char buffer[], int len );
    …
      #endif

    + // #define __CFA_DEBUG_PRINT__
    +
      #ifdef __CFA_DEBUG_PRINT__
              #define __cfaabi_dbg_write( buffer, len )         __cfaabi_bits_write( STDERR_FILENO, buffer, len )
              #define __cfaabi_dbg_acquire()                    __cfaabi_bits_acquire()
              #define __cfaabi_dbg_release()                    __cfaabi_bits_release()
    -         #define __cfaabi_dbg_print_safe(...)              __cfaabi_bits_print_safe   (__VA_ARGS__)
    -         #define __cfaabi_dbg_print_nolock(...)            __cfaabi_bits_print_nolock (__VA_ARGS__)
    -         #define __cfaabi_dbg_print_buffer(...)            __cfaabi_bits_print_buffer (__VA_ARGS__)
    -         #define __cfaabi_dbg_print_buffer_decl(...)       char __dbg_text[256]; int __dbg_len = snprintf( __dbg_text, 256, __VA_ARGS__ ); __cfaabi_bits_write( __dbg_text, __dbg_len );
    -         #define __cfaabi_dbg_print_buffer_local(...)      __dbg_len = snprintf( __dbg_text, 256, __VA_ARGS__ ); __cfaabi_dbg_write( __dbg_text, __dbg_len );
    +         #define __cfaabi_dbg_print_safe(...)              __cfaabi_bits_print_safe  ( STDERR_FILENO, __VA_ARGS__ )
    +         #define __cfaabi_dbg_print_nolock(...)            __cfaabi_bits_print_nolock( STDERR_FILENO, __VA_ARGS__ )
    +         #define __cfaabi_dbg_print_buffer(...)            __cfaabi_bits_print_buffer( STDERR_FILENO, __VA_ARGS__ )
    +         #define __cfaabi_dbg_print_buffer_decl(...)       char __dbg_text[256]; int __dbg_len = snprintf( __dbg_text, 256, __VA_ARGS__ ); __cfaabi_bits_write( STDERR_FILENO, __dbg_text, __dbg_len );
    +         #define __cfaabi_dbg_print_buffer_local(...)      __dbg_len = snprintf( __dbg_text, 256, __VA_ARGS__ ); __cfaabi_bits_write( STDERR_FILENO, __dbg_text, __dbg_len );
      #else
              #define __cfaabi_dbg_write(...)               ((void)0)
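
    One subtlety in the rewritten macros: __cfaabi_dbg_print_buffer_decl both declares and uses the __dbg_text/__dbg_len locals, so __cfaabi_dbg_print_buffer_local is only valid in a scope where a _decl expansion has already run. A minimal C sketch of that decl/local pattern, with illustrative dbg_print_* names rather than the library's:

        #include <stdio.h>
        #include <unistd.h>

        // Sketch of the decl/local macro pattern: the *_decl macro introduces
        // the buffer and length variables that every later *_local expansion
        // in the same scope reuses.
        #define dbg_print_decl(...)  char __dbg_text[256]; int __dbg_len = \
                snprintf( __dbg_text, 256, __VA_ARGS__ ); \
                write( STDERR_FILENO, __dbg_text, __dbg_len );
        #define dbg_print_local(...) __dbg_len = \
                snprintf( __dbg_text, 256, __VA_ARGS__ ); \
                write( STDERR_FILENO, __dbg_text, __dbg_len );

        int main(void) {
                dbg_print_decl ( "step %d\n", 1 );  // declares __dbg_text / __dbg_len
                dbg_print_local( "step %d\n", 2 );  // reuses them; invalid without a prior _decl
                return 0;
        }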
  • libcfa/src/bits/defs.hfa

    r2cbfe92 → rb7d6a36

          return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
      }
    +
    + // #define __CFA_NO_BIT_TEST_AND_SET__
    +
    + #if defined( __i386 )
    + static inline bool __atomic_bts(volatile unsigned long int * target, unsigned long int bit ) {
    +         #if defined(__CFA_NO_BIT_TEST_AND_SET__)
    +         unsigned long int mask = 1ul << bit;
    +         unsigned long int ret = __atomic_fetch_or(target, mask, (int)__ATOMIC_RELAXED);
    +         return (ret & mask) != 0;
    +     #else
    +         int result = 0;
    +         asm volatile(
    +             "LOCK btsl %[bit], %[target]\n\t"
    +             : "=@ccc" (result)
    +             : [target] "m" (*target), [bit] "r" (bit)
    +         );
    +         return result != 0;
    +     #endif
    + }
    +
    + static inline bool __atomic_btr(volatile unsigned long int * target, unsigned long int bit ) {
    +         #if defined(__CFA_NO_BIT_TEST_AND_SET__)
    +         unsigned long int mask = 1ul << bit;
    +         unsigned long int ret = __atomic_fetch_and(target, ~mask, (int)__ATOMIC_RELAXED);
    +         return (ret & mask) != 0;
    +         #else
    +         int result = 0;
    +         asm volatile(
    +             "LOCK btrl %[bit], %[target]\n\t"
    +             :"=@ccc" (result)
    +             : [target] "m" (*target), [bit] "r" (bit)
    +         );
    +         return result != 0;
    +     #endif
    + }
    + #elif defined( __x86_64 )
    + static inline bool __atomic_bts(volatile unsigned long long int * target, unsigned long long int bit ) {
    +         #if defined(__CFA_NO_BIT_TEST_AND_SET__)
    +         unsigned long long int mask = 1ul << bit;
    +         unsigned long long int ret = __atomic_fetch_or(target, mask, (int)__ATOMIC_RELAXED);
    +         return (ret & mask) != 0;
    +     #else
    +         int result = 0;
    +         asm volatile(
    +             "LOCK btsq %[bit], %[target]\n\t"
    +             : "=@ccc" (result)
    +             : [target] "m" (*target), [bit] "r" (bit)
    +         );
    +         return result != 0;
    +     #endif
    + }
    +
    + static inline bool __atomic_btr(volatile unsigned long long int * target, unsigned long long int bit ) {
    +         #if defined(__CFA_NO_BIT_TEST_AND_SET__)
    +         unsigned long long int mask = 1ul << bit;
    +         unsigned long long int ret = __atomic_fetch_and(target, ~mask, (int)__ATOMIC_RELAXED);
    +         return (ret & mask) != 0;
    +         #else
    +         int result = 0;
    +         asm volatile(
    +             "LOCK btrq %[bit], %[target]\n\t"
    +             :"=@ccc" (result)
    +             : [target] "m" (*target), [bit] "r" (bit)
    +         );
    +         return result != 0;
    +     #endif
    + }
    + #elif defined( __ARM_ARCH )
    +     #error __atomic_bts and __atomic_btr not implemented for arm
    + #else
    +         #error uknown hardware architecture
    + #endif
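
    Both branches of each function implement the same contract: atomically set (bts) or clear (btr) the requested bit and report whether it was previously set. A small self-contained check of the portable fallback logic, assuming GCC/Clang's __atomic builtins (standalone names, not the library's):

        #include <assert.h>
        #include <stdbool.h>

        // Portable equivalents of the fallback branches above: atomically
        // set/clear one bit and return its previous value.
        static bool bts(volatile unsigned long long * target, unsigned bit) {
                unsigned long long mask = 1ull << bit;
                unsigned long long old = __atomic_fetch_or(target, mask, __ATOMIC_RELAXED);
                return (old & mask) != 0;
        }

        static bool btr(volatile unsigned long long * target, unsigned bit) {
                unsigned long long mask = 1ull << bit;
                unsigned long long old = __atomic_fetch_and(target, ~mask, __ATOMIC_RELAXED);
                return (old & mask) != 0;
        }

        int main(void) {
                volatile unsigned long long word = 0;
                assert( !bts(&word, 3) );  // bit 3 was clear, is now set
                assert(  bts(&word, 3) );  // setting again reports it was set
                assert(  btr(&word, 3) );  // clearing reports it was set
                assert( !btr(&word, 3) );  // already clear
                assert( word == 0 );
                return 0;
        }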
  • libcfa/src/concurrency/invoke.h

    r2cbfe92 → rb7d6a36

              };

    +         // Link lists fields
    +         // instrusive link field for threads
    +         struct __thread_desc_link {
    +                 struct thread_desc * next;
    +                 struct thread_desc * prev;
    +                 unsigned long long ts;
    +         };
    +
              struct thread_desc {
                      // Core threading fields
    …
                      // Link lists fields
                      // instrusive link field for threads
    -                 struct thread_desc * next;
    +                 struct __thread_desc_link link;

                      struct {
    …
              extern "Cforall" {
                      static inline thread_desc *& get_next( thread_desc & this ) {
    -                         return this.next;
    +                         return this.link.next;
                      }

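
    Factoring the links into __thread_desc_link lets a queue's sentinel nodes embed exactly the same link layout as a real thread_desc, so head and tail are never null and a push never branches on emptiness. A hedged C sketch of that sentinel technique, with simplified stand-in types rather than the library's:

        #include <assert.h>
        #include <stddef.h>

        // Nodes and sentinels share one link struct, mirroring how the lanes'
        // before/after sentinels reuse __thread_desc_link.
        struct link { struct node * next, * prev; };
        struct node { struct link link; int value; };

        struct lane { struct node before, after; };  // only the sentinels' links are used

        static void lane_init(struct lane * l) {
                l->before.link.next = &l->after;
                l->after.link.prev  = &l->before;
        }

        // push never tests for emptiness: the sentinels are always there
        static void push_tail(struct lane * l, struct node * n) {
                struct node * prev = l->after.link.prev;
                prev->link.next    = n;
                n->link.prev       = prev;
                n->link.next       = &l->after;
                l->after.link.prev = n;
        }

        static struct node * pop_head(struct lane * l) {
                struct node * n = l->before.link.next;
                if (n == &l->after) return NULL;         // list is empty
                l->before.link.next     = n->link.next;
                n->link.next->link.prev = &l->before;
                return n;
        }

        int main(void) {
                struct lane l; lane_init(&l);
                struct node a = { .value = 1 }, b = { .value = 2 };
                push_tail(&l, &a); push_tail(&l, &b);
                assert( pop_head(&l)->value == 1 );
                assert( pop_head(&l)->value == 2 );
                assert( pop_head(&l) == NULL );
                return 0;
        }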
  • libcfa/src/concurrency/kernel.cfa

    r2cbfe92 → rb7d6a36

              self_mon.recursion = 1;
              self_mon_p = &self_mon;
    -         next = 0p;
    +         link.next = 0p;
    +         link.prev = 0p;

              node.next = 0p;
    …
              this.name = name;
              this.cltr = &cltr;
    +         id = -1u;
              terminated{ 0 };
              do_terminate = false;
    …
              this.preemption_rate = preemption_rate;
              ready_queue{};
    -         ready_queue_lock{};
    -
    -         procs{ __get };
    +         ready_lock{};
    +
              idles{ __get };
              threads{ __get };
    …
              __cfaabi_dbg_print_safe("Kernel : core %p starting\n", this);

    -         doregister(this->cltr, this);
    +         // register the processor unless it's the main thread which is handled in the boot sequence
    +         if(this != mainProcessor) {
    +                 this->id = doregister(this->cltr, this);
    +                 ready_queue_grow( this->cltr );
    +         }
    +

              {
    …
              }

    -         unregister(this->cltr, this);
    -
              V( this->terminated );

    +
    +         // unregister the processor unless it's the main thread which is handled in the boot sequence
    +         if(this != mainProcessor) {
    +                 ready_queue_shrink( this->cltr );
    +                 unregister(this->cltr, this);
    +         }
    +
              __cfaabi_dbg_print_safe("Kernel : core %p terminated\n", this);
    +
    +         stats_tls_tally(this->cltr);
      }

    …
              verify( ! kernelTLS.preemption_state.enabled );

    -         verifyf( thrd->next == 0p, "Expected null got %p", thrd->next );
    +         verifyf( thrd->link.next == 0p, "Expected null got %p", thrd->link.next );
    +
    +
    +         ready_schedule_lock(thrd->curr_cluster, kernelTLS.this_processor);
    +                 bool was_empty = push( thrd->curr_cluster, thrd );
    +         ready_schedule_unlock(thrd->curr_cluster, kernelTLS.this_processor);

              with( *thrd->curr_cluster ) {
    -                 lock  ( ready_queue_lock __cfaabi_dbg_ctx2 );
    -                 bool was_empty = !(ready_queue != 0);
    -                 append( ready_queue, thrd );
    -                 unlock( ready_queue_lock );
    -
    -                 if(was_empty) {
    -                         lock      (proc_list_lock __cfaabi_dbg_ctx2);
    -                         if(idles) {
    -                                 wake_fast(idles.head);
    -                         }
    -                         unlock    (proc_list_lock);
    -                 }
    -                 else if( struct processor * idle = idles.head ) {
    -                         wake_fast(idle);
    -                 }
    -
    +                 // if(was_empty) {
    +                 //      lock      (proc_list_lock __cfaabi_dbg_ctx2);
    +                 //      if(idles) {
    +                 //              wake_fast(idles.head);
    +                 //      }
    +                 //      unlock    (proc_list_lock);
    +                 // }
    +                 // else if( struct processor * idle = idles.head ) {
    +                 //      wake_fast(idle);
    +                 // }
              }

    …
      thread_desc * nextThread(cluster * this) with( *this ) {
              verify( ! kernelTLS.preemption_state.enabled );
    -         lock( ready_queue_lock __cfaabi_dbg_ctx2 );
    -         thread_desc * head = pop_head( ready_queue );
    -         unlock( ready_queue_lock );
    +
    +         ready_schedule_lock(this, kernelTLS.this_processor);
    +                 thread_desc * head = pop( this );
    +         ready_schedule_unlock(this, kernelTLS.this_processor);
    +
              verify( ! kernelTLS.preemption_state.enabled );
              return head;
    …
                      pending_preemption = false;
                      kernel_thread = pthread_self();
    +                 id = -1u;

                      runner{ &this };
    …
              mainProcessor = (processor *)&storage_mainProcessor;
              (*mainProcessor){};
    +
    +         mainProcessor->id = doregister(mainCluster, mainProcessor);

              //initialize the global state variables
    …
              kernel_stop_preemption();

    +         unregister(mainCluster, mainProcessor);
    +
              // Destroy the main processor and its context in reverse order of construction
              // These were manually constructed so we need manually destroy them
    -         ^(mainProcessor->runner){};
    -         ^(mainProcessor){};
    +         void ^?{}(processor & this) with( this ) {
    +                 //don't join the main thread here, that wouldn't make any sense
    +                 __cfaabi_dbg_print_safe("Kernel : destroyed main processor context %p\n", &runner);
    +         }
    +
    +         ^(*mainProcessor){};

              // Final step, destroy the main thread since it is no longer needed
    -         // Since we provided a stack to this taxk it will not destroy anything
    -         ^(mainThread){};
    +         // Since we provided a stack to this task it will not destroy anything
    +         ^(*mainThread){};
    +
    +         ^(*mainCluster){};

              ^(__cfa_dbg_global_clusters.list){};
    …
      //=============================================================================================
      static void halt(processor * this) with( *this ) {
    -         // verify( ! __atomic_load_n(&do_terminate, __ATOMIC_SEQ_CST) );
    -
    -         with( *cltr ) {
    -                 lock      (proc_list_lock __cfaabi_dbg_ctx2);
    -                 remove    (procs, *this);
    -                 push_front(idles, *this);
    -                 unlock    (proc_list_lock);
    -         }
    -
    -         __cfaabi_dbg_print_safe("Kernel : Processor %p ready to sleep\n", this);
    -
    -         wait( idleLock );
    -
    -         __cfaabi_dbg_print_safe("Kernel : Processor %p woke up and ready to run\n", this);
    -
    -         with( *cltr ) {
    -                 lock      (proc_list_lock __cfaabi_dbg_ctx2);
    -                 remove    (idles, *this);
    -                 push_front(procs, *this);
    -                 unlock    (proc_list_lock);
    -         }
    +         // // verify( ! __atomic_load_n(&do_terminate, __ATOMIC_SEQ_CST) );
    +
    +         // with( *cltr ) {
    +         //      lock      (proc_list_lock __cfaabi_dbg_ctx2);
    +         //      push_front(idles, *this);
    +         //      unlock    (proc_list_lock);
    +         // }
    +
    +         // __cfaabi_dbg_print_safe("Kernel : Processor %p ready to sleep\n", this);
    +
    +         // wait( idleLock );
    +
    +         // __cfaabi_dbg_print_safe("Kernel : Processor %p woke up and ready to run\n", this);
    +
    +         // with( *cltr ) {
    +         //      lock      (proc_list_lock __cfaabi_dbg_ctx2);
    +         //      remove    (idles, *this);
    +         //      unlock    (proc_list_lock);
    +         // }
      }

    …
      }

    - void doregister( cluster * cltr, processor * proc ) {
    -         lock      (cltr->proc_list_lock __cfaabi_dbg_ctx2);
    -         cltr->nprocessors += 1;
    -         push_front(cltr->procs, *proc);
    -         unlock    (cltr->proc_list_lock);
    - }
    -
    - void unregister( cluster * cltr, processor * proc ) {
    -         lock  (cltr->proc_list_lock __cfaabi_dbg_ctx2);
    -         remove(cltr->procs, *proc );
    -         cltr->nprocessors -= 1;
    -         unlock(cltr->proc_list_lock);
    - }
    -
      //-----------------------------------------------------------------------------
      // Debug
  • libcfa/src/concurrency/kernel.hfa

    r2cbfe92 → rb7d6a36

              // Cluster from which to get threads
              struct cluster * cltr;
    +         unsigned int id;

              // Name of the processor
    …
      }

    +
    + //-----------------------------------------------------------------------------
    + // Cluster Tools
    +
    + // Cells use by the reader writer lock
    + // while not generic it only relies on a opaque pointer
    + struct __processor_id;
    +
    + // Reader-Writer lock protecting the ready-queue
    + // while this lock is mostly generic some aspects
    + // have been hard-coded to for the ready-queue for
    + // simplicity and performance
    + struct __clusterRWLock_t {
    +         // total cachelines allocated
    +         unsigned int max;
    +
    +         // cachelines currently in use
    +         volatile unsigned int alloc;
    +
    +         // cachelines ready to itereate over
    +         // (!= to alloc when thread is in second half of doregister)
    +         volatile unsigned int ready;
    +
    +         // writer lock
    +         volatile bool lock;
    +
    +         // data pointer
    +         __processor_id * data;
    + };
    +
    + void  ?{}(__clusterRWLock_t & this);
    + void ^?{}(__clusterRWLock_t & this);
    +
    + // Intrusives lanes which are used by the relaxed ready queue
    + struct __attribute__((aligned(128))) __intrusive_lane_t {
    +         // spin lock protecting the queue
    +         volatile bool lock;
    +
    +         // anchor for the head and the tail of the queue
    +         struct __sentinel_t {
    +                 // Link lists fields
    +                 // instrusive link field for threads
    +                 // must be exactly as in thread_desc
    +                 __thread_desc_link link;
    +         } before, after;
    +
    + #if defined(__CFA_WITH_VERIFY__)
    +         // id of last processor to acquire the lock
    +         // needed only to check for mutual exclusion violations
    +         unsigned int last_id;
    +
    +         // number of items on this list
    +         // needed only to check for deadlocks
    +         unsigned int count;
    + #endif
    +
    +         // Optional statistic counters
    +         #if !defined(__CFA_NO_SCHED_STATS__)
    +                 struct __attribute__((aligned(64))) {
    +                         // difference between number of push and pops
    +                         ssize_t diff;
    +
    +                         // total number of pushes and pops
    +                         size_t  push;
    +                         size_t  pop ;
    +                 } stat;
    +         #endif
    + };
    +
    + void  ?{}(__intrusive_lane_t & this);
    + void ^?{}(__intrusive_lane_t & this);
    +
    + typedef unsigned long long __cfa_readyQ_mask_t;
    +
    + // enum {
    + //      __cfa_ready_queue_mask_size = (64 - sizeof(size_t)) / sizeof(size_t),
    + //      __cfa_max_ready_queues = __cfa_ready_queue_mask_size * 8 * sizeof(size_t)
    + // };
    +
    + #define __cfa_lane_mask_size ((64 - sizeof(size_t)) / sizeof(__cfa_readyQ_mask_t))
    + #define __cfa_max_lanes (__cfa_lane_mask_size * 8 * sizeof(__cfa_readyQ_mask_t))
    +
    + //TODO adjust cache size to ARCHITECTURE
    + // Structure holding the relaxed ready queue
    + struct __attribute__((aligned(128))) __ready_queue_t {
    +         // Data tracking how many/which lanes are used
    +         // Aligned to 128 for cache locality
    +         struct {
    +                 // number of non-empty lanes
    +                 volatile size_t count;
    +
    +                 // bit mask, set bits indentify which lanes are non-empty
    +                 volatile __cfa_readyQ_mask_t mask[ __cfa_lane_mask_size ];
    +         } used;
    +
    +         // Data tracking the actual lanes
    +         // On a seperate cacheline from the used struct since
    +         // used can change on each push/pop but this data
    +         // only changes on shrink/grow
    +         struct __attribute__((aligned(64))) {
    +                 // Arary of lanes
    +                 __intrusive_lane_t * volatile data;
    +
    +                 // Number of lanes (empty or not)
    +                 volatile size_t count;
    +         } lanes;
    +
    +         // Statistics
    +         #if !defined(__CFA_NO_STATISTICS__)
    +                 __attribute__((aligned(64))) struct {
    +                         struct {
    +                                 // Push statistic
    +                                 struct {
    +                                         // number of attemps at pushing something
    +                                         volatile size_t attempt;
    +
    +                                         // number of successes at pushing
    +                                         volatile size_t success;
    +                                 } push;
    +
    +                                 // Pop statistic
    +                                 struct {
    +                                         // number of reads of the mask
    +                                         // picking an empty __cfa_readyQ_mask_t counts here
    +                                         // but not as an attempt
    +                                         volatile size_t maskrds;
    +
    +                                         // number of attemps at poping something
    +                                         volatile size_t attempt;
    +
    +                                         // number of successes at poping
    +                                         volatile size_t success;
    +                                 } pop;
    +                         } pick;
    +
    +                         // stats on the "used" struct of the queue
    +                         // tracks average number of queues that are not empty
    +                         // when pushing / poping
    +                         struct {
    +                                 volatile size_t value;
    +                                 volatile size_t count;
    +                         } used;
    +                 } global_stats;
    +
    +         #endif
    + };
    +
    + void  ?{}(__ready_queue_t & this);
    + void ^?{}(__ready_queue_t & this);
    +
      //-----------------------------------------------------------------------------
      // Cluster
      struct cluster {
              // Ready queue locks
    -         __spinlock_t ready_queue_lock;
    +         __clusterRWLock_t ready_lock;

              // Ready queue for threads
    -         __queue_t(thread_desc) ready_queue;
    +         __ready_queue_t ready_queue;

              // Name of the cluster
    …
              // List of processors
              __spinlock_t proc_list_lock;
    -         __dllist_t(struct processor) procs;
              __dllist_t(struct processor) idles;
    -         unsigned int nprocessors;

              // List of threads
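
    With one bit per lane in used.mask, a pop does not have to scan every lane: it can pick a random set bit, then lock that lane and re-check it, since the mask can go stale between the read and the lock. A rough standalone C sketch of that selection step (the helper and constants here are illustrative, not the library's):

        #include <stdio.h>
        #include <stdlib.h>

        typedef unsigned long long mask_t;
        #define MASK_WORDS 4  // illustrative; the header derives the real size from the cache line

        // Pick a random set bit across the mask words; returns -1 if all are zero.
        // A real pop would then lock that lane and re-check non-emptiness,
        // because the snapshot of the mask may already be stale.
        static int pick_nonempty_lane(const mask_t mask[MASK_WORDS], unsigned rnd) {
                for (unsigned i = 0; i < MASK_WORDS; i++) {
                        unsigned w = (rnd + i) % MASK_WORDS;      // random starting word
                        mask_t word = mask[w];
                        if (word == 0) continue;                  // empty word, try the next
                        // rotate so the bit picked inside the word is also randomized
                        unsigned s = rnd % 64;
                        mask_t rot = (word >> s) | (word << ((64 - s) % 64));
                        unsigned bit = (__builtin_ctzll(rot) + s) % 64;
                        return (int)(w * 64 + bit);
                }
                return -1;
        }

        int main(void) {
                mask_t mask[MASK_WORDS] = { 0 };
                mask[1] |= 1ull << 7;  // pretend lane 71 is non-empty
                printf("picked lane %d\n", pick_nonempty_lane(mask, (unsigned)rand()));
                return 0;
        }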
  • libcfa/src/concurrency/kernel_private.hfa

    r2cbfe92 → rb7d6a36

      //-----------------------------------------------------------------------------
      // Utils
    - #define KERNEL_STORAGE(T,X) static char storage_##X[sizeof(T)]
    + #define KERNEL_STORAGE(T,X) __attribute((aligned(__alignof__(T)))) static char storage_##X[sizeof(T)]

      static inline uint32_t tls_rand() {
    …
      void unregister( struct cluster * cltr, struct thread_desc & thrd );

    - void doregister( struct cluster * cltr, struct processor * proc );
    - void unregister( struct cluster * cltr, struct processor * proc );
    + //=======================================================================
    + // Cluster lock API
    + //=======================================================================
    + struct __attribute__((aligned(64))) __processor_id {
    +         processor * volatile handle;
    +         volatile bool lock;
    + };
    +
    + // Lock-Free registering/unregistering of threads
    + // Register a processor to a given cluster and get its unique id in return
    + unsigned doregister( struct cluster * cltr, struct processor * proc );
    +
    + // Unregister a processor from a given cluster using its id, getting back the original pointer
    + void     unregister( struct cluster * cltr, struct processor * proc );
    +
    + //=======================================================================
    + // Reader-writer lock implementation
    + // Concurrent with doregister/unregister,
    + //    i.e., threads can be added at any point during or between the entry/exit
    +
    + //-----------------------------------------------------------------------
    + // simple spinlock underlying the RWLock
    + // Blocking acquire
    + static inline void __atomic_acquire(volatile bool * ll) {
    +         while( __builtin_expect(__atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST), false) ) {
    +                 while(__atomic_load_n(ll, (int)__ATOMIC_RELAXED))
    +                         asm volatile("pause");
    +         }
    +         /* paranoid */ verify(*ll);
    + }
    +
    + // Non-Blocking acquire
    + static inline bool __atomic_try_acquire(volatile bool * ll) {
    +         return !__atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST);
    + }
    +
    + // Release
    + static inline void __atomic_unlock(volatile bool * ll) {
    +         /* paranoid */ verify(*ll);
    +         __atomic_store_n(ll, (bool)false, __ATOMIC_RELEASE);
    + }
    +
    + //-----------------------------------------------------------------------
    + // Reader side : acquire when using the ready queue to schedule but not
    + //  creating/destroying queues
    + static inline void ready_schedule_lock( struct cluster * cltr, struct processor * proc) with(cltr->ready_lock) {
    +         unsigned iproc = proc->id;
    +         /*paranoid*/ verify(data[iproc].handle == proc);
    +         /*paranoid*/ verify(iproc < ready);
    +
    +         // Step 1 : make sure no writer are in the middle of the critical section
    +         while(__atomic_load_n(&lock, (int)__ATOMIC_RELAXED))
    +                 asm volatile("pause");
    +
    +         // Fence needed because we don't want to start trying to acquire the lock
    +         // before we read a false.
    +         // Not needed on x86
    +         // std::atomic_thread_fence(std::memory_order_seq_cst);
    +
    +         // Step 2 : acquire our local lock
    +         __atomic_acquire( &data[iproc].lock );
    +         /*paranoid*/ verify(data[iproc].lock);
    + }
    +
    + static inline void ready_schedule_unlock( struct cluster * cltr, struct processor * proc) with(cltr->ready_lock) {
    +         unsigned iproc = proc->id;
    +         /*paranoid*/ verify(data[iproc].handle == proc);
    +         /*paranoid*/ verify(iproc < ready);
    +         /*paranoid*/ verify(data[iproc].lock);
    +         __atomic_unlock(&data[iproc].lock);
    + }
    +
    + //-----------------------------------------------------------------------
    + // Writer side : acquire when changing the ready queue, e.g. adding more
    + //  queues or removing them.
    + uint_fast32_t ready_mutate_lock( struct cluster & cltr );
    +
    + void ready_mutate_unlock( struct cluster & cltr, uint_fast32_t /* value returned by lock */ );
    +
    + //=======================================================================
    + // Ready-Queue API
    + //-----------------------------------------------------------------------
    + // push thread onto a ready queue for a cluster
    + // returns true if the list was previously empty, false otherwise
    + __attribute__((hot)) bool push(struct cluster * cltr, struct thread_desc * thrd);
    +
    + //-----------------------------------------------------------------------
    + // pop thread from the ready queue of a cluster
    + // returns 0p if empty
    + __attribute__((hot)) thread_desc * pop(struct cluster * cltr);
    +
    + //-----------------------------------------------------------------------
    + // Increase the width of the ready queue (number of lanes) by 4
    + void ready_queue_grow  (struct cluster * cltr);
    +
    + //-----------------------------------------------------------------------
    + // Decrease the width of the ready queue (number of lanes) by 4
    + void ready_queue_shrink(struct cluster * cltr);
    +
    + //-----------------------------------------------------------------------
    + // Statics call at the end of each thread to register statistics
    + #if !defined(__CFA_NO_STATISTICS__)
    + void stats_tls_tally(struct cluster * cltr);
    + #else
    + static inline void stats_tls_tally(struct cluster * cltr) {}
    + #endif

      // Local Variables: //
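
    Concretely, the discipline these declarations describe is: each registered processor owns one lock slot, a reader waits out the writer flag and then takes only its own slot, and the writer (grow/shrink) takes the global flag and then drains every slot. A hedged C sketch of that scheme, using a fixed-size array in place of the allocator-backed data array (names are illustrative):

        #include <stdbool.h>
        #include <stdio.h>

        #define NREADERS 4

        static volatile bool reader_lock[NREADERS];  // one slot per processor id
        static volatile bool writer_lock;

        static void spin_acquire(volatile bool * l) {
                while (__atomic_exchange_n(l, true, __ATOMIC_SEQ_CST))
                        while (__atomic_load_n(l, __ATOMIC_RELAXED)) ;  // spin on reads
        }
        static void spin_release(volatile bool * l) {
                __atomic_store_n(l, false, __ATOMIC_RELEASE);
        }

        // Reader side: wait out any writer, then take only your own slot.
        // If a writer sneaks in after step 1, it still blocks on this slot,
        // so mutual exclusion holds either way.
        static void read_lock(unsigned id) {
                while (__atomic_load_n(&writer_lock, __ATOMIC_RELAXED)) ;  // step 1
                spin_acquire(&reader_lock[id]);                            // step 2
        }
        static void read_unlock(unsigned id) { spin_release(&reader_lock[id]); }

        // Writer side: block new readers, then drain each existing one.
        static void write_lock(void) {
                spin_acquire(&writer_lock);
                for (unsigned i = 0; i < NREADERS; i++) spin_acquire(&reader_lock[i]);
        }
        static void write_unlock(void) {
                for (unsigned i = 0; i < NREADERS; i++) spin_release(&reader_lock[i]);
                spin_release(&writer_lock);
        }

        int main(void) {
                read_lock(2);   // e.g. processor 2 scheduling a thread
                read_unlock(2);
                write_lock();   // e.g. ready_queue_grow
                write_unlock();
                puts("ok");
                return 0;
        }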
  • libcfa/src/concurrency/monitor.cfa

    r2cbfe92 → rb7d6a36

              }

    -         __cfaabi_dbg_print_safe( "Kernel :  Runing %i (%p)\n", ready2run, ready2run ? node->waiting_thread : 0p );
    +         __cfaabi_dbg_print_safe( "Kernel :  Runing %i (%p)\n", ready2run, ready2run ? (thread_desc*)node->waiting_thread : (thread_desc*)0p );
              return ready2run ? node->waiting_thread : 0p;
      }
    …
              for(    thread_desc ** thrd_it = &entry_queue.head;
                      *thrd_it;
    -                 thrd_it = &(*thrd_it)->next
    +                 thrd_it = &(*thrd_it)->link.next
              ) {
                      // For each acceptable check if it matches
  • libcfa/src/concurrency/preemption.cfa

    r2cbfe92 → rb7d6a36

              // If there are still alarms pending, reset the timer
              if( alarms->head ) {
    -                 __cfaabi_dbg_print_buffer_decl( " KERNEL: @%ju(%ju) resetting alarm to %ju.\n", currtime.tv, __kernel_get_time().tv, (alarms->head->alarm - currtime).tv);
    +                 // __cfaabi_dbg_print_buffer_decl( " KERNEL: @%ju(%ju) resetting alarm to %ju.\n", currtime.tv, __kernel_get_time().tv, (alarms->head->alarm - currtime).tv);
                      Duration delta = alarms->head->alarm - currtime;
                      Duration caped = max(delta, 50`us);
  • libcfa/src/concurrency/thread.cfa

    r2cbfe92 → rb7d6a36

              self_mon_p = &self_mon;
              curr_cluster = &cl;
    -         next = 0p;
    +         link.next = 0p;
    +         link.prev = 0p;

              node.next = 0p;
  • libcfa/src/stdhdr/assert.h

    r2cbfe92 → rb7d6a36

              #define verify(x) assert(x)
              #define verifyf(x, ...) assertf(x, __VA_ARGS__)
    +         #define verifyfail(...)
              #define __CFA_WITH_VERIFY__
      #else
              #define verify(x)
              #define verifyf(x, ...)
    +         #define verifyfail(...)
      #endif

  • tests/concurrent/examples/.expect/datingService.txt

    r2cbfe92 → rb7d6a36

    - Girl:17 is dating Boy at 2 with ccode 17
    -  Boy:2 is dating Girl 17 with ccode 17
    -  Boy:14 is dating Girl 5 with ccode 5
    - Girl:5 is dating Boy at 14 with ccode 5
    -  Boy:9 is dating Girl 10 with ccode 10
    - Girl:10 is dating Boy at 9 with ccode 10
    -  Boy:1 is dating Girl 18 with ccode 18
    - Girl:18 is dating Boy at 1 with ccode 18
    -  Boy:16 is dating Girl 3 with ccode 3
    - Girl:3 is dating Boy at 16 with ccode 3
    -  Boy:5 is dating Girl 14 with ccode 14
    - Girl:14 is dating Boy at 5 with ccode 14
    -  Boy:15 is dating Girl 4 with ccode 4
    - Girl:4 is dating Boy at 15 with ccode 4
    - Girl:0 is dating Boy at 19 with ccode 0
    -  Boy:19 is dating Girl 0 with ccode 0
    - Girl:9 is dating Boy at 10 with ccode 9
    -  Boy:10 is dating Girl 9 with ccode 9
    - Girl:11 is dating Boy at 8 with ccode 11
    -  Boy:8 is dating Girl 11 with ccode 11
    -  Boy:12 is dating Girl 7 with ccode 7
    - Girl:7 is dating Boy at 12 with ccode 7
    -  Boy:11 is dating Girl 8 with ccode 8
    - Girl:8 is dating Boy at 11 with ccode 8
    - Girl:16 is dating Boy at 3 with ccode 16
    -  Boy:3 is dating Girl 16 with ccode 16
    - Girl:15 is dating Boy at 4 with ccode 15
    -  Boy:4 is dating Girl 15 with ccode 15
    - Girl:19 is dating Boy at 0 with ccode 19
    -  Boy:0 is dating Girl 19 with ccode 19
    - Girl:2 is dating Boy at 17 with ccode 2
    -  Boy:17 is dating Girl 2 with ccode 2
    -  Boy:13 is dating Girl 6 with ccode 6
    - Girl:6 is dating Boy at 13 with ccode 6
    -  Boy:7 is dating Girl 12 with ccode 12
    - Girl:12 is dating Boy at 7 with ccode 12
    - Girl:13 is dating Boy at 6 with ccode 13
    -  Boy:6 is dating Girl 13 with ccode 13
    - Girl:1 is dating Boy at 18 with ccode 1
    -  Boy:18 is dating Girl 1 with ccode 1
  • tests/concurrent/examples/datingService.cfa

    r2cbfe92 → rb7d6a36

      //
      // Cforall Version 1.0.0 Copyright (C) 2017 University of Waterloo
    - // 
    + //
      // The contents of this file are covered under the licence agreement in the
      // file "LICENCE" distributed with Cforall.
    …
                      signal_block( Boys[ccode] );                                    // restart boy to set phone number
              } // if
    -         sout | "Girl:" | PhoneNo | "is dating Boy at" | BoyPhoneNo | "with ccode" | ccode;
    +         // sout | "Girl:" | PhoneNo | "is dating Boy at" | BoyPhoneNo | "with ccode" | ccode;
              return BoyPhoneNo;
      } // DatingService girl
    …
                      signal_block( Girls[ccode] );                                   // restart girl to set phone number
              } // if
    -         sout | " Boy:" | PhoneNo | "is dating Girl" | GirlPhoneNo | "with ccode" | ccode;
    +         // sout | " Boy:" | PhoneNo | "is dating Girl" | GirlPhoneNo | "with ccode" | ccode;
              return GirlPhoneNo;
      } // DatingService boy
  • tests/concurrent/waitfor/when.cfa

    r2cbfe92 → rb7d6a36


      void arbiter( global_t & mutex this ) {
    +         // There is a race at start where callers can get in before the arbiter.
    +         // It doesn't really matter here so just restart the loop correctly and move on
    +         this.last_call = 6;
    +
              for( int i = 0; i < N; i++ ) {
                         when( this.last_call == 6 ) waitfor( call1 : this ) { if( this.last_call != 1) { serr | "Expected last_call to be 1 got" | this.last_call; } }