Changeset [737c98a:2a3d446]

Location: libcfa/src
Files: 1 added, 9 edited
Legend:

  ' ' unmodified
  '+' added
  '-' removed
  • libcfa/src/Makefile.am

    r737c98a r2a3d446  
     thread_headers_nosrc = concurrency/invoke.h
     thread_headers = concurrency/coroutine.hfa concurrency/thread.hfa concurrency/kernel.hfa concurrency/monitor.hfa concurrency/mutex.hfa
    -thread_libsrc = concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa concurrency/invoke.c concurrency/preemption.cfa ${thread_headers:.hfa=.cfa}
    +thread_libsrc = concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa concurrency/invoke.c concurrency/preemption.cfa concurrency/ready_queue.cfa ${thread_headers:.hfa=.cfa}
     else
     headers =
  • libcfa/src/Makefile.in

    r737c98a r2a3d446  
             concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa \
             concurrency/invoke.c concurrency/preemption.cfa \
    -        concurrency/coroutine.cfa concurrency/thread.cfa \
    -        concurrency/kernel.cfa concurrency/monitor.cfa \
    -        concurrency/mutex.cfa
    +        concurrency/ready_queue.cfa concurrency/coroutine.cfa \
    +        concurrency/thread.cfa concurrency/kernel.cfa \
    +        concurrency/monitor.cfa concurrency/mutex.cfa
     @BUILDLIB_TRUE@am__objects_3 = concurrency/coroutine.lo \
     @BUILDLIB_TRUE@ concurrency/thread.lo concurrency/kernel.lo \
    …
     @BUILDLIB_TRUE@ concurrency/CtxSwitch-@ARCHITECTURE@.lo \
     @BUILDLIB_TRUE@ concurrency/alarm.lo concurrency/invoke.lo \
    -@BUILDLIB_TRUE@ concurrency/preemption.lo $(am__objects_3)
    +@BUILDLIB_TRUE@ concurrency/preemption.lo \
    +@BUILDLIB_TRUE@ concurrency/ready_queue.lo $(am__objects_3)
     am_libcfathread_la_OBJECTS = $(am__objects_4)
     libcfathread_la_OBJECTS = $(am_libcfathread_la_OBJECTS)
    …
     @BUILDLIB_FALSE@thread_headers =
     @BUILDLIB_TRUE@thread_headers = concurrency/coroutine.hfa concurrency/thread.hfa concurrency/kernel.hfa concurrency/monitor.hfa concurrency/mutex.hfa
    -@BUILDLIB_TRUE@thread_libsrc = concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa concurrency/invoke.c concurrency/preemption.cfa ${thread_headers:.hfa=.cfa}
    +@BUILDLIB_TRUE@thread_libsrc = concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa concurrency/invoke.c concurrency/preemption.cfa concurrency/ready_queue.cfa ${thread_headers:.hfa=.cfa}
     
     #----------------------------------------------------------------------------------------------------------------
    …
             concurrency/$(DEPDIR)/$(am__dirstamp)
     concurrency/preemption.lo: concurrency/$(am__dirstamp) \
    +        concurrency/$(DEPDIR)/$(am__dirstamp)
    +concurrency/ready_queue.lo: concurrency/$(am__dirstamp) \
             concurrency/$(DEPDIR)/$(am__dirstamp)
     concurrency/coroutine.lo: concurrency/$(am__dirstamp) \
  • libcfa/src/bits/defs.hfa

    r737c98a r2a3d446  
         return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
     }
    +
    +// #define __CFA_NO_BIT_TEST_AND_SET__
    +
    +static inline bool bts(volatile unsigned long long int * target, unsigned long long int bit ) {
    +        #if defined(__CFA_NO_BIT_TEST_AND_SET__)
    +        unsigned long long int mask = 1ul << bit;
    +        unsigned long long int ret = __atomic_fetch_or(target, mask, (int)__ATOMIC_RELAXED);
    +        return (ret & mask) != 0;
    +        #else
    +        int result = 0;
    +        asm volatile(
    +            "LOCK btsq %[bit], %[target]\n\t"
    +            : "=@ccc" (result)
    +            : [target] "m" (*target), [bit] "r" (bit)
    +        );
    +        return result != 0;
    +        #endif
    +}
    +
    +static inline bool btr(volatile unsigned long long int * target, unsigned long long int bit ) {
    +        #if defined(__CFA_NO_BIT_TEST_AND_SET__)
    +        unsigned long long int mask = 1ul << bit;
    +        unsigned long long int ret = __atomic_fetch_and(target, ~mask, (int)__ATOMIC_RELAXED);
    +        return (ret & mask) != 0;
    +        #else
    +        int result = 0;
    +        asm volatile(
    +            "LOCK btrq %[bit], %[target]\n\t"
    +            : "=@ccc" (result)
    +            : [target] "m" (*target), [bit] "r" (bit)
    +        );
    +        return result != 0;
    +        #endif
    +}
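
    Reviewer note: a minimal sketch of how these new helpers behave, assuming bts/btr from bits/defs.hfa are in scope; the variable `word` and the asserts are illustrative only. Both functions return the *previous* value of the targeted bit, whichever of the __atomic fallback or the x86-64 LOCK bts/btr path is compiled:

        #include <assert.h>
        // assumes bits/defs.hfa (bts/btr) is included

        int main() {
                volatile unsigned long long int word = 0;
                assert( !bts( &word, 3 ) );   // bit 3 was clear, is now set
                assert(  bts( &word, 3 ) );   // setting again reports it was already set
                assert(  btr( &word, 3 ) );   // clearing reports it was set
                assert( !btr( &word, 3 ) );   // bit already clear
                return 0;
        }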
  • libcfa/src/concurrency/invoke.h

    r737c98a r2a3d446  
         };
     
    +        // Link lists fields
    +        // instrusive link field for threads
    +        struct __thread_desc_link {
    +                struct thread_desc * next;
    +                struct thread_desc * prev;
    +                unsigned long long ts;
    +        };
    +
         struct thread_desc {
                 // Core threading fields
    …
                 // Link lists fields
                 // instrusive link field for threads
    -                struct thread_desc * next;
    +                struct __thread_desc_link link;
     
                 struct {
    …
         extern "Cforall" {
                 static inline thread_desc *& get_next( thread_desc & this ) {
    -                        return this.next;
    +                        return this.link.next;
                 }
     
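
    Reviewer note: replacing the bare next pointer with an embedded __thread_desc_link is what lets a thread sit in a doubly-linked ready queue with no separate list-node allocation. A self-contained C mock of the same layout (the names task, link_t, and payload are hypothetical, not from this changeset):

        #include <stddef.h>

        // mock of the new layout: the links are embedded in the element itself
        struct task;
        struct link_t { struct task * next; struct task * prev; unsigned long long ts; };
        struct task   { int payload; struct link_t link; };

        // walk the list through the embedded field; it->link.prev supports
        // reverse traversal, and ts rides along in the same struct
        static void visit_all( struct task * head ) {
                for( struct task * it = head; it != NULL; it = it->link.next ) {
                        // use it->payload here
                }
        }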
  • libcfa/src/concurrency/kernel.cfa

    r737c98a r2a3d446  
         self_mon.recursion = 1;
         self_mon_p = &self_mon;
    -        next = 0p;
    +        link.next = 0p;
    +        link.prev = 0p;
     
         node.next = 0p;
    …
         this.name = name;
         this.cltr = &cltr;
    +        id = -1u;
         terminated{ 0 };
         do_terminate = false;
    …
         this.preemption_rate = preemption_rate;
         ready_queue{};
    -        ready_queue_lock{};
    -
    -        procs{ __get };
    +        ready_lock{};
    +
         idles{ __get };
         threads{ __get };
    …
         __cfaabi_dbg_print_safe("Kernel : core %p starting\n", this);
     
    -        doregister(this->cltr, this);
    +        // register the processor unless it's the main thread which is handled in the boot sequence
    +        if(this != mainProcessor) {
    +                this->id = doregister(this->cltr, this);
    +                ready_queue_grow( this->cltr );
    +        }
    +
     
         {
    …
         }
     
    -        unregister(this->cltr, this);
    -
         V( this->terminated );
     
    +
    +        // unregister the processor unless it's the main thread which is handled in the boot sequence
    +        if(this != mainProcessor) {
    +                ready_queue_shrink( this->cltr );
    +                unregister(this->cltr, this);
    +        }
    +
         __cfaabi_dbg_print_safe("Kernel : core %p terminated\n", this);
    +
    +        stats_tls_tally(this->cltr);
     }
     
    …
         );
     
    -        Abort( pthread_attr_setstack( &attr, stack, stacksize ), "pthread_attr_setstack" ); 
    +        Abort( pthread_attr_setstack( &attr, stack, stacksize ), "pthread_attr_setstack" );
     
         Abort( pthread_create( pthread, &attr, start, arg ), "pthread_create" );
    …
         verify( ! kernelTLS.preemption_state.enabled );
     
    -        verifyf( thrd->next == 0p, "Expected null got %p", thrd->next );
    +        verifyf( thrd->link.next == 0p, "Expected null got %p", thrd->link.next );
    +
    +
    +        ready_schedule_lock(thrd->curr_cluster, kernelTLS.this_processor);
    +                bool was_empty = push( thrd->curr_cluster, thrd );
    +        ready_schedule_unlock(thrd->curr_cluster, kernelTLS.this_processor);
     
         with( *thrd->curr_cluster ) {
    -                lock  ( ready_queue_lock __cfaabi_dbg_ctx2 );
    -                bool was_empty = !(ready_queue != 0);
    -                append( ready_queue, thrd );
    -                unlock( ready_queue_lock );
    -
                 if(was_empty) {
                         lock      (proc_list_lock __cfaabi_dbg_ctx2);
    …
                         wake_fast(idle);
                 }
    -
         }
     
    …
     thread_desc * nextThread(cluster * this) with( *this ) {
         verify( ! kernelTLS.preemption_state.enabled );
    -        lock( ready_queue_lock __cfaabi_dbg_ctx2 );
    -        thread_desc * head = pop_head( ready_queue );
    -        unlock( ready_queue_lock );
    +
    +        ready_schedule_lock(this, kernelTLS.this_processor);
    +                thread_desc * head = pop( this );
    +        ready_schedule_unlock(this, kernelTLS.this_processor);
    +
         verify( ! kernelTLS.preemption_state.enabled );
         return head;
    …
                 pending_preemption = false;
                 kernel_thread = pthread_self();
    +                id = -1u;
     
                 runner{ &this };
    …
         mainProcessor = (processor *)&storage_mainProcessor;
         (*mainProcessor){};
    +
    +        mainProcessor->id = doregister(mainCluster, mainProcessor);
     
         //initialize the global state variables
    …
         kernel_stop_preemption();
     
    +        unregister(mainCluster, mainProcessor);
    +
         // Destroy the main processor and its context in reverse order of construction
         // These were manually constructed so we need manually destroy them
    -        ^(mainProcessor->runner){};
    -        ^(mainProcessor){};
    +        void ^?{}(processor & this) with( this ) {
    +                //don't join the main thread here, that wouldn't make any sense
    +                __cfaabi_dbg_print_safe("Kernel : destroyed main processor context %p\n", &runner);
    +        }
    +
    +        ^(*mainProcessor){};
     
         // Final step, destroy the main thread since it is no longer needed
    -        // Since we provided a stack to this taxk it will not destroy anything
    -        ^(mainThread){};
    +        // Since we provided a stack to this task it will not destroy anything
    +        ^(*mainThread){};
    +
    +        ^(*mainCluster){};
     
         ^(__cfa_dbg_global_clusters.list){};
    …
         with( *cltr ) {
                 lock      (proc_list_lock __cfaabi_dbg_ctx2);
    -                remove    (procs, *this);
                 push_front(idles, *this);
                 unlock    (proc_list_lock);
    …
                 lock      (proc_list_lock __cfaabi_dbg_ctx2);
                 remove    (idles, *this);
    -                push_front(procs, *this);
                 unlock    (proc_list_lock);
         }
    …
     }
     
    -void doregister( cluster * cltr, processor * proc ) {
    -        lock      (cltr->proc_list_lock __cfaabi_dbg_ctx2);
    -        cltr->nprocessors += 1;
    -        push_front(cltr->procs, *proc);
    -        unlock    (cltr->proc_list_lock);
    -}
    -
    -void unregister( cluster * cltr, processor * proc ) {
    -        lock  (cltr->proc_list_lock __cfaabi_dbg_ctx2);
    -        remove(cltr->procs, *proc );
    -        cltr->nprocessors -= 1;
    -        unlock(cltr->proc_list_lock);
    -}
    -
     //-----------------------------------------------------------------------------
     // Debug
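
    Reviewer note: the net effect on the scheduling hot path is that the single cluster-wide spinlock is replaced by the reader side of the new ready-queue lock, so processors no longer serialize on every push and pop. Condensed from the hunks above (not a standalone program):

        // before: every enqueue serialized on one cluster-wide spinlock
        lock  ( ready_queue_lock __cfaabi_dbg_ctx2 );
        append( ready_queue, thrd );
        unlock( ready_queue_lock );

        // after: per-processor read lock; only queue grow/shrink takes the write side
        ready_schedule_lock  ( thrd->curr_cluster, kernelTLS.this_processor );
                bool was_empty = push( thrd->curr_cluster, thrd );
        ready_schedule_unlock( thrd->curr_cluster, kernelTLS.this_processor );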
  • libcfa/src/concurrency/kernel.hfa

    r737c98a r2a3d446  
         // Cluster from which to get threads
         struct cluster * cltr;
    +        unsigned int id;
     
         // Name of the processor
    …
     }
     
    +
    +//-----------------------------------------------------------------------------
    +// Cluster Tools
    +struct __processor_id;
    +
    +// Reader-Writer lock protecting the ready-queue
    +struct __clusterRWLock_t {
    +        // total cachelines allocated
    +        unsigned int max;
    +
    +        // cachelines currently in use
    +        volatile unsigned int alloc;
    +
    +        // cachelines ready to itereate over
    +        // (!= to alloc when thread is in second half of doregister)
    +        volatile unsigned int ready;
    +
    +        // writer lock
    +        volatile bool lock;
    +
    +        // data pointer
    +        __processor_id * data;
    +};
    +
    +void  ?{}(__clusterRWLock_t & this);
    +void ^?{}(__clusterRWLock_t & this);
    +
    +// Underlying sub quues of the ready queue
    +struct __attribute__((aligned(128))) __intrusive_ready_queue_t {
    +        // spin lock protecting the queue
    +        volatile bool lock;
    +        unsigned int last_id;
    +
    +        // anchor for the head and the tail of the queue
    +        struct __sentinel_t {
    +                // Link lists fields
    +                // instrusive link field for threads
    +                // must be exactly as in thread_desc
    +                __thread_desc_link link;
    +        } before, after;
    +
    +        // Optional statistic counters
    +        #if !defined(__CFA_NO_SCHED_STATS__)
    +                struct __attribute__((aligned(64))) {
    +                        // difference between number of push and pops
    +                        ssize_t diff;
    +
    +                        // total number of pushes and pops
    +                        size_t  push;
    +                        size_t  pop ;
    +                } stat;
    +        #endif
    +};
    +
    +void  ?{}(__intrusive_ready_queue_t & this);
    +void ^?{}(__intrusive_ready_queue_t & this);
    +
    +typedef unsigned long long __cfa_readyQ_mask_t;
    +
    +// enum {
    +//      __cfa_ready_queue_mask_size = (64 - sizeof(size_t)) / sizeof(size_t),
    +//      __cfa_max_ready_queues = __cfa_ready_queue_mask_size * 8 * sizeof(size_t)
    +// };
    +
    +#define __cfa_readyQ_mask_size ((64 - sizeof(size_t)) / sizeof(__cfa_readyQ_mask_t))
    +#define __cfa_max_readyQs (__cfa_readyQ_mask_size * 8 * sizeof(__cfa_readyQ_mask_t))
    +
    +//TODO adjust cache size to ARCHITECTURE
    +struct __attribute__((aligned(128))) __ready_queue_t {
    +        struct {
    +                volatile size_t count;
    +                volatile __cfa_readyQ_mask_t mask[ __cfa_readyQ_mask_size ];
    +        } empty;
    +
    +        struct __attribute__((aligned(64))) {
    +                __intrusive_ready_queue_t * volatile data;
    +                volatile size_t count;
    +        } list;
    +
    +        #if !defined(__CFA_NO_STATISTICS__)
    +                __attribute__((aligned(64))) struct {
    +                        struct {
    +                                struct {
    +                                        volatile size_t attempt;
    +                                        volatile size_t success;
    +                                } push;
    +                                struct {
    +                                        volatile size_t maskrds;
    +                                        volatile size_t attempt;
    +                                        volatile size_t success;
    +                                } pop;
    +                        } pick;
    +                        struct {
    +                                volatile size_t value;
    +                                volatile size_t count;
    +                        } full;
    +                } global_stats;
    +
    +        #endif
    +};
    +
    +void  ?{}(__ready_queue_t & this);
    +void ^?{}(__ready_queue_t & this);
    +
     //-----------------------------------------------------------------------------
     // Cluster
     struct cluster {
         // Ready queue locks
    -        __spinlock_t ready_queue_lock;
    +        __clusterRWLock_t ready_lock;
     
         // Ready queue for threads
    -        __queue_t(thread_desc) ready_queue;
    +        __ready_queue_t ready_queue;
     
         // Name of the cluster
    …
         // List of processors
         __spinlock_t proc_list_lock;
    -        __dllist_t(struct processor) procs;
         __dllist_t(struct processor) idles;
    -        unsigned int nprocessors;
     
         // List of threads
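
    Reviewer note: a small standalone C sketch of the mask arithmetic above, assuming an LP64 target (sizeof(size_t) == 8). The sizes work out so the `empty` bitmask plus its count fill exactly one 64-byte region:

        #include <stdio.h>
        #include <stddef.h>

        typedef unsigned long long __cfa_readyQ_mask_t;
        #define __cfa_readyQ_mask_size ((64 - sizeof(size_t)) / sizeof(__cfa_readyQ_mask_t))
        #define __cfa_max_readyQs (__cfa_readyQ_mask_size * 8 * sizeof(__cfa_readyQ_mask_t))

        int main() {
                // 8-byte count + 7 * 8-byte mask words = exactly 64 bytes
                printf( "mask words : %zu\n", __cfa_readyQ_mask_size );  // (64 - 8) / 8 = 7
                printf( "max queues : %zu\n", __cfa_max_readyQs );       // 7 * 8 * 8 = 448
                return 0;
        }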
  • libcfa/src/concurrency/kernel_private.hfa

    r737c98a r2a3d446  
     //-----------------------------------------------------------------------------
     // Utils
    -#define KERNEL_STORAGE(T,X) static char storage_##X[sizeof(T)]
    +#define KERNEL_STORAGE(T,X) __attribute((aligned(__alignof__(T)))) static char storage_##X[sizeof(T)]
     
     static inline uint32_t tls_rand() {
    …
     void unregister( struct cluster * cltr, struct thread_desc & thrd );
     
    -void doregister( struct cluster * cltr, struct processor * proc );
    -void unregister( struct cluster * cltr, struct processor * proc );
    +//=======================================================================
    +// Cluster lock API
    +//=======================================================================
    +struct __attribute__((aligned(64))) __processor_id {
    +        processor * volatile handle;
    +        volatile bool lock;
    +};
    +
    +// Lock-Free registering/unregistering of threads
    +// Register a processor to a given cluster and get its unique id in return
    +unsigned doregister( struct cluster * cltr, struct processor * proc );
    +
    +// Unregister a processor from a given cluster using its id, getting back the original pointer
    +void     unregister( struct cluster * cltr, struct processor * proc );
    +
    +//=======================================================================
    +// Reader-writer lock implementation
    +// Concurrent with doregister/unregister,
    +//    i.e., threads can be added at any point during or between the entry/exit
    +static inline void __atomic_acquire(volatile bool * ll) {
    +        while( __builtin_expect(__atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST), false) ) {
    +                while(__atomic_load_n(ll, (int)__ATOMIC_RELAXED))
    +                        asm volatile("pause");
    +        }
    +        /* paranoid */ verify(*ll);
    +}
    +
    +static inline bool __atomic_try_acquire(volatile bool * ll) {
    +        return !__atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST);
    +}
    +
    +static inline void __atomic_unlock(volatile bool * ll) {
    +        /* paranoid */ verify(*ll);
    +        __atomic_store_n(ll, (bool)false, __ATOMIC_RELEASE);
    +}
    +
    +//-----------------------------------------------------------------------
    +// Reader side : acquire when using the ready queue to schedule but not
    +//  creating/destroying queues
    +static inline void ready_schedule_lock( struct cluster * cltr, struct processor * proc) with(cltr->ready_lock) {
    +        unsigned iproc = proc->id;
    +        /*paranoid*/ verify(data[iproc].handle == proc);
    +        /*paranoid*/ verify(iproc < ready);
    +
    +        // Step 1 : make sure no writer are in the middle of the critical section
    +        while(__atomic_load_n(&lock, (int)__ATOMIC_RELAXED))
    +                asm volatile("pause");
    +
    +        // Fence needed because we don't want to start trying to acquire the lock
    +        // before we read a false.
    +        // Not needed on x86
    +        // std::atomic_thread_fence(std::memory_order_seq_cst);
    +
    +        // Step 2 : acquire our local lock
    +        __atomic_acquire( &data[iproc].lock );
    +        /*paranoid*/ verify(data[iproc].lock);
    +}
    +
    +static inline void ready_schedule_unlock( struct cluster * cltr, struct processor * proc) with(cltr->ready_lock) {
    +        unsigned iproc = proc->id;
    +        /*paranoid*/ verify(data[iproc].handle == proc);
    +        /*paranoid*/ verify(iproc < ready);
    +        /*paranoid*/ verify(data[iproc].lock);
    +        __atomic_store_n(&data[iproc].lock, false, __ATOMIC_RELEASE);
    +}
    +
    +//-----------------------------------------------------------------------
    +// Writer side : acquire when changing the ready queue, e.g. adding more
    +//  queues or removing them.
    +uint_fast32_t ready_mutate_lock( struct cluster & cltr );
    +
    +void ready_mutate_unlock( struct cluster & cltr, uint_fast32_t );
    +
    +//=======================================================================
    +// Ready-Queue API
    +
    +__attribute__((hot)) bool push(struct cluster * cltr, struct thread_desc * thrd);
    +__attribute__((hot)) thread_desc * pop(struct cluster * cltr);
    +void ready_queue_grow  (struct cluster * cltr);
    +void ready_queue_shrink(struct cluster * cltr);
    +
    +#if !defined(__CFA_NO_STATISTICS__)
    +void stats_tls_tally(struct cluster * cltr);
    +#else
    +static inline void stats_tls_tally(struct cluster * cltr) {}
    +#endif
     
     // Local Variables: //
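
    Reviewer note: a hedged usage sketch of the locking discipline this header introduces. The reader side is exactly what kernel.cfa now does around push/pop; whether grow/shrink internally take the write lock is not shown in this changeset, so the writer side below (and the name `token`) is an assumption:

        // reader side: taken for every schedule/unschedule; scales with processors
        ready_schedule_lock  ( cltr, proc );
                // ... push to / pop from the cluster's ready queue ...
        ready_schedule_unlock( cltr, proc );

        // writer side (assumed usage): taken only when the set of sub-queues changes
        uint_fast32_t token = ready_mutate_lock( *cltr );    // opaque value returned to the caller
                // ... add or remove sub-queues ...
        ready_mutate_unlock( *cltr, token );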
  • libcfa/src/concurrency/monitor.cfa

    r737c98a r2a3d446  
         for(    thread_desc ** thrd_it = &entry_queue.head;
                 *thrd_it;
    -                thrd_it = &(*thrd_it)->link.next
    +                thrd_it = &(*thrd_it)->link.next
         ) {
                 // For each acceptable check if it matches
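
    Reviewer note: the loop above iterates with a thread_desc ** rather than a thread_desc * so the matching entry can later be unlinked through *thrd_it without re-walking the queue. A self-contained C sketch of the same pointer-to-pointer idiom (the struct node and key test are illustrative):

        struct node { int key; struct node * next; };

        // remove the first node matching `key` without tracking a separate prev pointer
        void remove_first( struct node ** head, int key ) {
                for( struct node ** it = head; *it; it = &(*it)->next ) {
                        if( (*it)->key == key ) {
                                *it = (*it)->next;  // unlink through the pointer-to-pointer
                                return;             // caller still owns the unlinked node
                        }
                }
        }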
  • libcfa/src/concurrency/thread.cfa

    r737c98a r2a3d446  
         self_mon_p = &self_mon;
         curr_cluster = &cl;
    -        next = 0p;
    +        link.next = 0p;
    +        link.prev = 0p;
     
         node.next = 0p;