Changes in libcfa/src [e307e12:f80f840]


Files: 1 added, 9 edited

Legend:

  ' '  Unmodified
  '+'  Added
  '-'  Removed
  • libcfa/src/Makefile.am

    e307e12 → f80f840
    @@ -48,5 +48,5 @@
     thread_headers_nosrc = concurrency/invoke.h
     thread_headers = concurrency/coroutine.hfa concurrency/thread.hfa concurrency/kernel.hfa concurrency/monitor.hfa concurrency/mutex.hfa
    -thread_libsrc = concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa concurrency/invoke.c concurrency/preemption.cfa ${thread_headers:.hfa=.cfa}
    +thread_libsrc = concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa concurrency/invoke.c concurrency/preemption.cfa concurrency/ready_queue.cfa ${thread_headers:.hfa=.cfa}
     else
     headers =
  • libcfa/src/Makefile.in

    e307e12 → f80f840
    @@ -165,7 +165,7 @@
             concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa \
             concurrency/invoke.c concurrency/preemption.cfa \
    -        concurrency/coroutine.cfa concurrency/thread.cfa \
    -        concurrency/kernel.cfa concurrency/monitor.cfa \
    -        concurrency/mutex.cfa
    +        concurrency/ready_queue.cfa concurrency/coroutine.cfa \
    +        concurrency/thread.cfa concurrency/kernel.cfa \
    +        concurrency/monitor.cfa concurrency/mutex.cfa
     @BUILDLIB_TRUE@am__objects_3 = concurrency/coroutine.lo \
     @BUILDLIB_TRUE@ concurrency/thread.lo concurrency/kernel.lo \
    @@ -174,5 +174,6 @@
     @BUILDLIB_TRUE@ concurrency/CtxSwitch-@ARCHITECTURE@.lo \
     @BUILDLIB_TRUE@ concurrency/alarm.lo concurrency/invoke.lo \
    -@BUILDLIB_TRUE@ concurrency/preemption.lo $(am__objects_3)
    +@BUILDLIB_TRUE@ concurrency/preemption.lo \
    +@BUILDLIB_TRUE@ concurrency/ready_queue.lo $(am__objects_3)
     am_libcfathread_la_OBJECTS = $(am__objects_4)
     libcfathread_la_OBJECTS = $(am_libcfathread_la_OBJECTS)
    @@ -462,5 +463,5 @@
     @BUILDLIB_FALSE@thread_headers =
     @BUILDLIB_TRUE@thread_headers = concurrency/coroutine.hfa concurrency/thread.hfa concurrency/kernel.hfa concurrency/monitor.hfa concurrency/mutex.hfa
    -@BUILDLIB_TRUE@thread_libsrc = concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa concurrency/invoke.c concurrency/preemption.cfa ${thread_headers:.hfa=.cfa}
    +@BUILDLIB_TRUE@thread_libsrc = concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa concurrency/invoke.c concurrency/preemption.cfa concurrency/ready_queue.cfa ${thread_headers:.hfa=.cfa}
     
     #----------------------------------------------------------------------------------------------------------------
    @@ -598,4 +599,6 @@
             concurrency/$(DEPDIR)/$(am__dirstamp)
     concurrency/preemption.lo: concurrency/$(am__dirstamp) \
    +        concurrency/$(DEPDIR)/$(am__dirstamp)
    +concurrency/ready_queue.lo: concurrency/$(am__dirstamp) \
             concurrency/$(DEPDIR)/$(am__dirstamp)
     concurrency/coroutine.lo: concurrency/$(am__dirstamp) \
  • libcfa/src/bits/defs.hfa

    e307e12 → f80f840
    @@ -53,2 +53,36 @@
     	return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
     }
    +
    +// #define __CFA_NO_BIT_TEST_AND_SET__
    +
    +static inline bool bts(volatile unsigned long long int * target, unsigned long long int bit ) {
    +	#if defined(__CFA_NO_BIT_TEST_AND_SET__)
    +	unsigned long long int mask = 1ul << bit;
    +	unsigned long long int ret = __atomic_fetch_or(target, mask, (int)__ATOMIC_RELAXED);
    +	return (ret & mask) != 0;
    +	#else
    +	int result = 0;
    +	asm volatile(
    +		"LOCK btsq %[bit], %[target]\n\t"
    +		: "=@ccc" (result)
    +		: [target] "m" (*target), [bit] "r" (bit)
    +	);
    +	return result != 0;
    +	#endif
    +}
    +
    +static inline bool btr(volatile unsigned long long int * target, unsigned long long int bit ) {
    +	#if defined(__CFA_NO_BIT_TEST_AND_SET__)
    +	unsigned long long int mask = 1ul << bit;
    +	unsigned long long int ret = __atomic_fetch_and(target, ~mask, (int)__ATOMIC_RELAXED);
    +	return (ret & mask) != 0;
    +	#else
    +	int result = 0;
    +	asm volatile(
    +		"LOCK btrq %[bit], %[target]\n\t"
    +		: "=@ccc" (result)
    +		: [target] "m" (*target), [bit] "r" (bit)
    +	);
    +	return result != 0;
    +	#endif
    +}
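
    The new bts/btr helpers are atomic bit test-and-set and test-and-clear: both return the bit's previous value, using x86-64 LOCK btsq/btrq by default and an __atomic_fetch_or/_and fallback when __CFA_NO_BIT_TEST_AND_SET__ is defined. Note the fallback computes the mask with 1ul << bit, which assumes a 64-bit unsigned long (true on LP64 targets). A minimal sketch of the intended usage pattern, keeping a mask of non-empty sub-queues; the queue_mask and qid names are illustrative, not from this changeset, and the bts/btr definitions above are assumed included:

        #include <stdbool.h>

        // one 64-bit word tracking which of 64 hypothetical sub-queues hold work
        static volatile unsigned long long queue_mask = 0;

        void mark_nonempty( unsigned long long qid ) {
        	// previous value tells the caller whether this push made the queue non-empty
        	bool was_set = bts( &queue_mask, qid );
        	(void)was_set;
        }

        void mark_empty( unsigned long long qid ) {
        	// after popping the last element, clear the queue's bit
        	bool was_set = btr( &queue_mask, qid );
        	(void)was_set;
        }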
  • libcfa/src/concurrency/invoke.h

    e307e12 → f80f840
    @@ -158,4 +158,12 @@
     	};
     
    +	// Link lists fields
    +	// intrusive link field for threads
    +	struct __thread_desc_link {
    +		struct thread_desc * next;
    +		struct thread_desc * prev;
    +		unsigned long long ts;
    +	};
    +
     	struct thread_desc {
     		// Core threading fields
    @@ -188,5 +196,5 @@
     		// Link lists fields
     		// intrusive link field for threads
    -		struct thread_desc * next;
    +		struct __thread_desc_link link;
     
     		struct {
    @@ -199,5 +207,5 @@
     	extern "Cforall" {
     		static inline thread_desc *& get_next( thread_desc & this ) {
    -			return this.next;
    +			return this.link.next;
     		}
     
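
    The raw next pointer in thread_desc is replaced by an embedded __thread_desc_link carrying next, prev, and a timestamp ts, so a thread can sit on a doubly-linked intrusive queue with no auxiliary node allocations. A self-contained C sketch of the idea; the nth_successor helper is illustrative, not part of the changeset:

        struct thread_desc;

        // intrusive link: lives inside the thread itself, as added in invoke.h
        struct __thread_desc_link {
        	struct thread_desc * next;
        	struct thread_desc * prev;
        	unsigned long long ts;   // timestamp, e.g. for age-based queue picking (assumption)
        };

        struct thread_desc {
        	struct __thread_desc_link link;
        	/* ... other fields ... */
        };

        // traversal follows the embedded links; no separate list nodes exist
        static struct thread_desc * nth_successor( struct thread_desc * t, int n ) {
        	for ( ; t != 0 && n > 0; n-- ) t = t->link.next;
        	return t;
        }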
  • libcfa/src/concurrency/kernel.cfa

    e307e12 → f80f840
    @@ -187,5 +187,6 @@
     	self_mon.recursion = 1;
     	self_mon_p = &self_mon;
    -	next = 0p;
    +	link.next = 0p;
    +	link.prev = 0p;
     
     	node.next = 0p;
    @@ -212,4 +213,5 @@
     	this.name = name;
     	this.cltr = &cltr;
    +	id = -1u;
     	terminated{ 0 };
     	do_terminate = false;
    @@ -242,7 +244,6 @@
     	this.preemption_rate = preemption_rate;
     	ready_queue{};
    -	ready_queue_lock{};
    -
    -	procs{ __get };
    +	ready_lock{};
     
     	idles{ __get };
     	threads{ __get };
    @@ -273,5 +274,10 @@
     	__cfaabi_dbg_print_safe("Kernel : core %p starting\n", this);
     
    -	doregister(this->cltr, this);
    +	// register the processor unless it's the main thread which is handled in the boot sequence
    +	if(this != mainProcessor) {
    +		this->id = doregister(this->cltr, this);
    +		ready_queue_grow( this->cltr );
    +	}
    +
     
     	{
    @@ -305,9 +311,16 @@
     	}
     
    -	unregister(this->cltr, this);
    -
     	V( this->terminated );
     
    +
    +	// unregister the processor unless it's the main thread which is handled in the boot sequence
    +	if(this != mainProcessor) {
    +		ready_queue_shrink( this->cltr );
    +		unregister(this->cltr, this);
    +	}
    +
     	__cfaabi_dbg_print_safe("Kernel : core %p terminated\n", this);
    +
    +	stats_tls_tally(this->cltr);
     }
     
    @@ -469,5 +482,5 @@
     	);
     
    -	Abort( pthread_attr_setstack( &attr, stack, stacksize ), "pthread_attr_setstack" ); 
    +	Abort( pthread_attr_setstack( &attr, stack, stacksize ), "pthread_attr_setstack" );
     
     	Abort( pthread_create( pthread, &attr, start, arg ), "pthread_create" );
    @@ -535,12 +548,12 @@
     	verify( ! kernelTLS.preemption_state.enabled );
     
    -	verifyf( thrd->next == 0p, "Expected null got %p", thrd->next );
    +	verifyf( thrd->link.next == 0p, "Expected null got %p", thrd->link.next );
    +
    +
    +	ready_schedule_lock(thrd->curr_cluster, kernelTLS.this_processor);
    +		bool was_empty = push( thrd->curr_cluster, thrd );
    +	ready_schedule_unlock(thrd->curr_cluster, kernelTLS.this_processor);
     
     	with( *thrd->curr_cluster ) {
    -		lock  ( ready_queue_lock __cfaabi_dbg_ctx2 );
    -		bool was_empty = !(ready_queue != 0);
    -		append( ready_queue, thrd );
    -		unlock( ready_queue_lock );
    -
     		if(was_empty) {
     			lock      (proc_list_lock __cfaabi_dbg_ctx2);
    @@ -553,5 +566,4 @@
     			wake_fast(idle);
     		}
    -
     	}
     
    @@ -562,7 +574,9 @@
     thread_desc * nextThread(cluster * this) with( *this ) {
     	verify( ! kernelTLS.preemption_state.enabled );
    -	lock( ready_queue_lock __cfaabi_dbg_ctx2 );
    -	thread_desc * head = pop_head( ready_queue );
    -	unlock( ready_queue_lock );
    +
    +	ready_schedule_lock(this, kernelTLS.this_processor);
    +		thread_desc * head = pop( this );
    +	ready_schedule_unlock(this, kernelTLS.this_processor);
    +
     	verify( ! kernelTLS.preemption_state.enabled );
     	return head;
    @@ -726,4 +740,5 @@
     		pending_preemption = false;
     		kernel_thread = pthread_self();
    +		id = -1u;
     
     		runner{ &this };
    @@ -735,4 +750,6 @@
     	mainProcessor = (processor *)&storage_mainProcessor;
     	(*mainProcessor){};
    +
    +	mainProcessor->id = doregister(mainCluster, mainProcessor);
     
     	//initialize the global state variables
    @@ -781,12 +798,20 @@
     	kernel_stop_preemption();
     
    +	unregister(mainCluster, mainProcessor);
    +
     	// Destroy the main processor and its context in reverse order of construction
     	// These were manually constructed so we need manually destroy them
    -	^(mainProcessor->runner){};
    -	^(mainProcessor){};
    +	void ^?{}(processor & this) with( this ) {
    +		//don't join the main thread here, that wouldn't make any sense
    +		__cfaabi_dbg_print_safe("Kernel : destroyed main processor context %p\n", &runner);
    +	}
    +
    +	^(*mainProcessor){};
     
     	// Final step, destroy the main thread since it is no longer needed
    -	// Since we provided a stack to this taxk it will not destroy anything
    -	^(mainThread){};
    +	// Since we provided a stack to this task it will not destroy anything
    +	^(*mainThread){};
    +
    +	^(*mainCluster){};
     
     	^(__cfa_dbg_global_clusters.list){};
    @@ -804,5 +829,4 @@
     	with( *cltr ) {
     		lock      (proc_list_lock __cfaabi_dbg_ctx2);
    -		remove    (procs, *this);
     		push_front(idles, *this);
     		unlock    (proc_list_lock);
    @@ -818,5 +842,4 @@
     		lock      (proc_list_lock __cfaabi_dbg_ctx2);
     		remove    (idles, *this);
    -		push_front(procs, *this);
     		unlock    (proc_list_lock);
     	}
    @@ -959,18 +982,4 @@
     }
     
    -void doregister( cluster * cltr, processor * proc ) {
    -	lock      (cltr->proc_list_lock __cfaabi_dbg_ctx2);
    -	cltr->nprocessors += 1;
    -	push_front(cltr->procs, *proc);
    -	unlock    (cltr->proc_list_lock);
    -}
    -
    -void unregister( cluster * cltr, processor * proc ) {
    -	lock  (cltr->proc_list_lock __cfaabi_dbg_ctx2);
    -	remove(cltr->procs, *proc );
    -	cltr->nprocessors -= 1;
    -	unlock(cltr->proc_list_lock);
    -}
    -
     //-----------------------------------------------------------------------------
     // Debug
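
    The net effect on the scheduler's fast path: pushing and popping threads now happens under the read side of the cluster's ready-queue lock, and the old global ready_queue_lock spinlock is gone. A condensed C sketch paraphrasing the hunks above; ready_schedule_lock/unlock and push come from this changeset, while wake_one_idle is a hypothetical stand-in for the idle-processor wake-up done under proc_list_lock:

        #include <stdbool.h>

        // assumed declarations from this changeset (kernel_private.hfa)
        struct cluster; struct processor; struct thread_desc;
        extern void ready_schedule_lock( struct cluster *, struct processor * );
        extern void ready_schedule_unlock( struct cluster *, struct processor * );
        extern bool push( struct cluster *, struct thread_desc * );
        extern void wake_one_idle( struct cluster * );   // hypothetical stand-in

        void schedule_sketch( struct cluster * cltr, struct processor * proc, struct thread_desc * thrd ) {
        	ready_schedule_lock( cltr, proc );     // read side: concurrent with other schedulers
        	bool was_empty = push( cltr, thrd );   // enqueue onto one of the cluster's sub-queues
        	ready_schedule_unlock( cltr, proc );

        	if ( was_empty ) {
        		// the cluster had no ready work before this push: wake an idle processor
        		wake_one_idle( cltr );
        	}
        }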
  • libcfa/src/concurrency/kernel.hfa

    e307e12 → f80f840
    @@ -107,4 +107,5 @@
     	// Cluster from which to get threads
     	struct cluster * cltr;
    +	unsigned int id;
     
     	// Name of the processor
    @@ -161,12 +162,116 @@
     }
     
    +
    +//-----------------------------------------------------------------------------
    +// Cluster Tools
    +struct __processor_id;
    +
    +// Reader-Writer lock protecting the ready-queue
    +struct __clusterRWLock_t {
    +	// total cachelines allocated
    +	unsigned int max;
    +
    +	// cachelines currently in use
    +	volatile unsigned int alloc;
    +
    +	// cachelines ready to iterate over
    +	// (!= to alloc when thread is in second half of doregister)
    +	volatile unsigned int ready;
    +
    +	// writer lock
    +	volatile bool lock;
    +
    +	// data pointer
    +	__processor_id * data;
    +};
    +
    +void  ?{}(__clusterRWLock_t & this);
    +void ^?{}(__clusterRWLock_t & this);
    +
    +// Underlying sub-queues of the ready queue
    +struct __attribute__((aligned(128))) __intrusive_ready_queue_t {
    +	// spin lock protecting the queue
    +	volatile bool lock;
    +	unsigned int last_id;
    +
    +	// anchor for the head and the tail of the queue
    +	struct __sentinel_t {
    +		// Link lists fields
    +		// intrusive link field for threads
    +		// must be exactly as in thread_desc
    +		__thread_desc_link link;
    +	} before, after;
    +
    +	// Optional statistic counters
    +	#if !defined(__CFA_NO_SCHED_STATS__)
    +		struct __attribute__((aligned(64))) {
    +			// difference between number of push and pops
    +			ssize_t diff;
    +
    +			// total number of pushes and pops
    +			size_t  push;
    +			size_t  pop ;
    +		} stat;
    +	#endif
    +};
    +
    +void  ?{}(__intrusive_ready_queue_t & this);
    +void ^?{}(__intrusive_ready_queue_t & this);
    +
    +typedef unsigned long long __cfa_readyQ_mask_t;
    +
    +// enum {
    +//	__cfa_ready_queue_mask_size = (64 - sizeof(size_t)) / sizeof(size_t),
    +//	__cfa_max_ready_queues = __cfa_ready_queue_mask_size * 8 * sizeof(size_t)
    +// };
    +
    +#define __cfa_readyQ_mask_size ((64 - sizeof(size_t)) / sizeof(__cfa_readyQ_mask_t))
    +#define __cfa_max_readyQs (__cfa_readyQ_mask_size * 8 * sizeof(__cfa_readyQ_mask_t))
    +
    +//TODO adjust cache size to ARCHITECTURE
    +struct __attribute__((aligned(128))) __ready_queue_t {
    +	struct {
    +		volatile size_t count;
    +		volatile __cfa_readyQ_mask_t mask[ __cfa_readyQ_mask_size ];
    +	} empty;
    +
    +	struct __attribute__((aligned(64))) {
    +		__intrusive_ready_queue_t * volatile data;
    +		volatile size_t count;
    +	} list;
    +
    +	#if !defined(__CFA_NO_STATISTICS__)
    +		__attribute__((aligned(64))) struct {
    +			struct {
    +				struct {
    +					volatile size_t attempt;
    +					volatile size_t success;
    +				} push;
    +				struct {
    +					volatile size_t maskrds;
    +					volatile size_t attempt;
    +					volatile size_t success;
    +				} pop;
    +			} pick;
    +			struct {
    +				volatile size_t value;
    +				volatile size_t count;
    +			} full;
    +		} global_stats;
    +
    +	#endif
    +};
    +
    +void  ?{}(__ready_queue_t & this);
    +void ^?{}(__ready_queue_t & this);
    +
     //-----------------------------------------------------------------------------
     // Cluster
     struct cluster {
     	// Ready queue locks
    -	__spinlock_t ready_queue_lock;
    +	__clusterRWLock_t ready_lock;
     
     	// Ready queue for threads
    -	__queue_t(thread_desc) ready_queue;
    +	__ready_queue_t ready_queue;
     
     	// Name of the cluster
    @@ -178,7 +283,5 @@
     	// List of processors
     	__spinlock_t proc_list_lock;
    -	__dllist_t(struct processor) procs;
     	__dllist_t(struct processor) idles;
    -	unsigned int nprocessors;
     
     	// List of threads
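
    The two macros size the empty-mask so that the `empty` struct (its count plus the mask words) fills exactly one 64-byte cache line. A worked check of the arithmetic, assuming an LP64 target where size_t and __cfa_readyQ_mask_t are both 8 bytes:

        #include <assert.h>
        #include <stddef.h>

        typedef unsigned long long __cfa_readyQ_mask_t;

        #define __cfa_readyQ_mask_size ((64 - sizeof(size_t)) / sizeof(__cfa_readyQ_mask_t))
        #define __cfa_max_readyQs (__cfa_readyQ_mask_size * 8 * sizeof(__cfa_readyQ_mask_t))

        int main(void) {
        	assert( __cfa_readyQ_mask_size == 7 );   // (64 - 8) / 8 mask words
        	assert( __cfa_max_readyQs == 448 );      // 7 words * 64 bits each
        	// count + mask words exactly fill one 64-byte cache line
        	assert( sizeof(size_t) + __cfa_readyQ_mask_size * sizeof(__cfa_readyQ_mask_t) == 64 );
        	return 0;
        }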
  • libcfa/src/concurrency/kernel_private.hfa

    e307e12 → f80f840
    @@ -101,5 +101,5 @@
     //-----------------------------------------------------------------------------
     // Utils
    -#define KERNEL_STORAGE(T,X) static char storage_##X[sizeof(T)]
    +#define KERNEL_STORAGE(T,X) __attribute((aligned(__alignof__(T)))) static char storage_##X[sizeof(T)]
     
     static inline uint32_t tls_rand() {
    @@ -117,6 +117,90 @@
     void unregister( struct cluster * cltr, struct thread_desc & thrd );
     
    -void doregister( struct cluster * cltr, struct processor * proc );
    -void unregister( struct cluster * cltr, struct processor * proc );
    +//=======================================================================
    +// Cluster lock API
    +//=======================================================================
    +struct __attribute__((aligned(64))) __processor_id {
    +	processor * volatile handle;
    +	volatile bool lock;
    +};
    +
    +// Lock-Free registering/unregistering of threads
    +// Register a processor to a given cluster and get its unique id in return
    +unsigned doregister( struct cluster * cltr, struct processor * proc );
    +
    +// Unregister a processor from a given cluster using its id, getting back the original pointer
    +void     unregister( struct cluster * cltr, struct processor * proc );
    +
    +//=======================================================================
    +// Reader-writer lock implementation
    +// Concurrent with doregister/unregister,
    +//    i.e., threads can be added at any point during or between the entry/exit
    +static inline void __atomic_acquire(volatile bool * ll) {
    +	while( __builtin_expect(__atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST), false) ) {
    +		while(__atomic_load_n(ll, (int)__ATOMIC_RELAXED))
    +			asm volatile("pause");
    +	}
    +	/* paranoid */ verify(*ll);
    +}
    +
    +static inline bool __atomic_try_acquire(volatile bool * ll) {
    +	return !__atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST);
    +}
    +
    +static inline void __atomic_unlock(volatile bool * ll) {
    +	/* paranoid */ verify(*ll);
    +	__atomic_store_n(ll, (bool)false, __ATOMIC_RELEASE);
    +}
    +
    +//-----------------------------------------------------------------------
    +// Reader side : acquire when using the ready queue to schedule but not
    +//  creating/destroying queues
    +static inline void ready_schedule_lock( struct cluster * cltr, struct processor * proc) with(cltr->ready_lock) {
    +	unsigned iproc = proc->id;
    +	/*paranoid*/ verify(data[iproc].handle == proc);
    +	/*paranoid*/ verify(iproc < ready);
    +
    +	// Step 1 : make sure no writers are in the middle of the critical section
    +	while(__atomic_load_n(&lock, (int)__ATOMIC_RELAXED))
    +		asm volatile("pause");
    +
    +	// Fence needed because we don't want to start trying to acquire the lock
    +	// before we read a false.
    +	// Not needed on x86
    +	// std::atomic_thread_fence(std::memory_order_seq_cst);
    +
    +	// Step 2 : acquire our local lock
    +	__atomic_acquire( &data[iproc].lock );
    +	/*paranoid*/ verify(data[iproc].lock);
    +}
    +
    +static inline void ready_schedule_unlock( struct cluster * cltr, struct processor * proc) with(cltr->ready_lock) {
    +	unsigned iproc = proc->id;
    +	/*paranoid*/ verify(data[iproc].handle == proc);
    +	/*paranoid*/ verify(iproc < ready);
    +	/*paranoid*/ verify(data[iproc].lock);
    +	__atomic_store_n(&data[iproc].lock, false, __ATOMIC_RELEASE);
    +}
    +
    +//-----------------------------------------------------------------------
    +// Writer side : acquire when changing the ready queue, e.g. adding more
    +//  queues or removing them.
    +uint_fast32_t ready_mutate_lock( struct cluster & cltr );
    +
    +void ready_mutate_unlock( struct cluster & cltr, uint_fast32_t );
    +
    +//=======================================================================
    +// Ready-Queue API
    +
    +__attribute__((hot)) bool push(struct cluster * cltr, struct thread_desc * thrd);
    +__attribute__((hot)) thread_desc * pop(struct cluster * cltr);
    +void ready_queue_grow  (struct cluster * cltr);
    +void ready_queue_shrink(struct cluster * cltr);
    +
    +#if !defined(__CFA_NO_STATISTICS__)
    +void stats_tls_tally(struct cluster * cltr);
    +#else
    +static inline void stats_tls_tally(struct cluster * cltr) {}
    +#endif
     
     // Local Variables: //
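
    The read side is given inline above; the writer side, ready_mutate_lock/unlock, is only declared here (its body lives in the new ready_queue.cfa), so its usage can be sketched but not quoted. A hedged sketch of how a structural change is presumably bracketed; CFA passes the cluster by reference, a pointer stands in here, and the meaning of the returned token is left opaque:

        #include <stdint.h>

        struct cluster;
        extern uint_fast32_t ready_mutate_lock( struct cluster * );
        extern void ready_mutate_unlock( struct cluster *, uint_fast32_t );

        // grow_sketch is illustrative; ready_queue_grow in ready_queue.cfa is the real caller
        void grow_sketch( struct cluster * cltr ) {
        	// writer side: waits until every reader slot is released,
        	// so no processor is inside ready_schedule_lock/unlock
        	uint_fast32_t token = ready_mutate_lock( cltr );

        	// safe to add or remove sub-queues here

        	ready_mutate_unlock( cltr, token );
        }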
  • libcfa/src/concurrency/monitor.cfa

    e307e12 → f80f840
    @@ -841,5 +841,5 @@
     	for(    thread_desc ** thrd_it = &entry_queue.head;
     		*thrd_it;
    -		thrd_it = &(*thrd_it)->next
    +		thrd_it = &(*thrd_it)->link.next
     	) {
     		// For each acceptable check if it matches
  • libcfa/src/concurrency/thread.cfa

    e307e12 → f80f840
    @@ -41,5 +41,6 @@
     	self_mon_p = &self_mon;
     	curr_cluster = &cl;
    -	next = 0p;
    +	link.next = 0p;
    +	link.prev = 0p;
     
     	node.next = 0p;