Changeset b7b3e41 for libcfa/src/concurrency
- Timestamp:
- Jun 19, 2023, 1:57:11 PM (2 years ago)
- Branches:
- master
- Children:
- adc73a5
- Parents:
- fa5e1aa5 (diff), 33d4bc8 (diff)
Note: this is a merge changeset, the changes displayed below correspond to the merge itself.
Use the (diff) links above to see all the changes relative to each parent.
- Location:
- libcfa/src/concurrency
- Files:
-
- 6 edited
Legend:
- Unmodified
- Added
- Removed
-
libcfa/src/concurrency/actor.hfa
rfa5e1aa5 rb7b3e41 13 13 #endif // CFA_DEBUG 14 14 15 #define DEBUG_ABORT( cond, string ) CFA_DEBUG( if ( cond ) abort( string ) ) 16 15 17 // Define the default number of processors created in the executor. Must be greater than 0. 16 18 #define __DEFAULT_EXECUTOR_PROCESSORS__ 2 … … 28 30 #define __DEFAULT_EXECUTOR_BUFSIZE__ 10 29 31 30 #define __STEAL 0// workstealing toggle. Disjoint from toggles above32 #define __STEAL 1 // workstealing toggle. Disjoint from toggles above 31 33 32 34 // workstealing heuristic selection (only set one to be 1) … … 42 44 struct executor; 43 45 44 enum Allocation { Nodelete, Delete, Destroy, Finished }; // allocation status45 46 typedef Allocation (*__receive_fn)(actor &, message &);46 enum allocation { Nodelete, Delete, Destroy, Finished }; // allocation status 47 48 typedef allocation (*__receive_fn)(actor &, message &); 47 49 struct request { 50 actor * base_receiver; 48 51 actor * receiver; 52 message * base_msg; 49 53 message * msg; 50 54 __receive_fn fn; 51 bool stop;52 55 }; 53 56 54 static inline void ?{}( request & this ) { this.stop = true; } // default ctor makes a sentinel 55 static inline void ?{}( request & this, actor * receiver, message * msg, __receive_fn fn ) { 57 struct a_msg { 58 int m; 59 }; 60 static inline void ?{}( request & this ) {} 61 static inline void ?{}( request & this, actor * base_receiver, actor * receiver, message * base_msg, message * msg, __receive_fn fn ) { 62 this.base_receiver = base_receiver; 56 63 this.receiver = receiver; 64 this.base_msg = base_msg; 57 65 this.msg = msg; 58 66 this.fn = fn; 59 this.stop = false;60 67 } 61 68 static inline void ?{}( request & this, request & copy ) { … … 63 70 this.msg = copy.msg; 64 71 this.fn = copy.fn; 65 this.stop = copy.stop;66 72 } 67 73 … … 81 87 last_size = 0; 82 88 } 83 static inline void ^?{}( copy_queue & this ) with(this) { adelete(buffer); } 89 static inline void ^?{}( copy_queue & this ) with(this) { 90 DEBUG_ABORT( count != 0, "Actor system 
terminated with messages sent but not received\n" ); 91 adelete(buffer); 92 } 84 93 85 94 static inline void insert( copy_queue & this, request & elem ) with(this) { … … 115 124 } 116 125 117 static inline bool is Empty( copy_queue & this ) with(this) { return count == 0; }126 static inline bool is_empty( copy_queue & this ) with(this) { return count == 0; } 118 127 119 128 struct work_queue { … … 176 185 volatile unsigned long long stamp; 177 186 #ifdef ACTOR_STATS 178 size_t stolen_from, try_steal, stolen, failed_swaps, msgs_stolen;187 size_t stolen_from, try_steal, stolen, empty_stolen, failed_swaps, msgs_stolen; 179 188 unsigned long long processed; 180 189 size_t gulps; … … 189 198 this.gulps = 0; // number of gulps 190 199 this.failed_swaps = 0; // steal swap failures 200 this.empty_stolen = 0; // queues empty after steal 191 201 this.msgs_stolen = 0; // number of messages stolen 192 202 #endif … … 208 218 #ifdef ACTOR_STATS 209 219 // aggregate counters for statistics 210 size_t __total_tries = 0, __total_stolen = 0, __total_workers, __all_gulps = 0, 220 size_t __total_tries = 0, __total_stolen = 0, __total_workers, __all_gulps = 0, __total_empty_stolen = 0, 211 221 __total_failed_swaps = 0, __all_processed = 0, __num_actors_stats = 0, __all_msgs_stolen = 0; 212 222 #endif … … 233 243 unsigned int nprocessors, nworkers, nrqueues; // number of processors/threads/request queues 234 244 bool seperate_clus; // use same or separate cluster for executor 245 volatile bool is_shutdown; // flag to communicate shutdown to worker threads 235 246 }; // executor 236 247 … … 246 257 __atomic_add_fetch(&__total_stolen, executor_->w_infos[id].stolen, __ATOMIC_SEQ_CST); 247 258 __atomic_add_fetch(&__total_failed_swaps, executor_->w_infos[id].failed_swaps, __ATOMIC_SEQ_CST); 259 __atomic_add_fetch(&__total_empty_stolen, executor_->w_infos[id].empty_stolen, __ATOMIC_SEQ_CST); 248 260 249 261 // per worker steal stats (uncomment alongside the lock above this routine to print) … 
… 272 284 this.nrqueues = nrqueues; 273 285 this.seperate_clus = seperate_clus; 286 this.is_shutdown = false; 274 287 275 288 if ( nworkers == nrqueues ) … … 320 333 321 334 static inline void ^?{}( executor & this ) with(this) { 322 #ifdef __STEAL 323 request sentinels[nrqueues]; 324 for ( unsigned int i = 0; i < nrqueues; i++ ) { 325 insert( request_queues[i], sentinels[i] ); // force eventually termination 326 } // for 327 #else 328 request sentinels[nworkers]; 329 unsigned int reqPerWorker = nrqueues / nworkers, extras = nrqueues % nworkers; 330 for ( unsigned int i = 0, step = 0, range; i < nworkers; i += 1, step += range ) { 331 range = reqPerWorker + ( i < extras ? 1 : 0 ); 332 insert( request_queues[step], sentinels[i] ); // force eventually termination 333 } // for 334 #endif 335 is_shutdown = true; 335 336 336 337 for ( i; nworkers ) … … 363 364 size_t avg_gulps = __all_gulps == 0 ? 0 : __all_processed / __all_gulps; 364 365 printf("\tGulps:\t\t\t\t\t%lu\n\tAverage Gulp Size:\t\t\t%lu\n\tMissed gulps:\t\t\t\t%lu\n", __all_gulps, avg_gulps, misses); 365 printf("\tSteal attempts:\t\t\t\t%lu\n\tSteals:\t\t\t\t\t%lu\n\tSteal failures (no candidates):\t\t%lu\n\tSteal failures (failed swaps):\t\t%lu\ n",366 __total_tries, __total_stolen, __total_tries - __total_stolen - __total_failed_swaps, __total_failed_swaps );366 printf("\tSteal attempts:\t\t\t\t%lu\n\tSteals:\t\t\t\t\t%lu\n\tSteal failures (no candidates):\t\t%lu\n\tSteal failures (failed swaps):\t\t%lu\t Empty steals:\t\t%lu\n", 367 __total_tries, __total_stolen, __total_tries - __total_stolen - __total_failed_swaps, __total_failed_swaps, __total_empty_stolen); 367 368 size_t avg_steal = __total_stolen == 0 ? 
0 : __all_msgs_stolen / __total_stolen; 368 369 printf("\tMessages stolen:\t\t\t%lu\n\tAverage steal size:\t\t\t%lu\n", __all_msgs_stolen, avg_steal); … … 393 394 struct actor { 394 395 size_t ticket; // executor-queue handle 395 Allocation allocation_; // allocation action396 allocation allocation_; // allocation action 396 397 inline virtual_dtor; 397 398 }; … … 400 401 // Once an actor is allocated it must be sent a message or the actor system cannot stop. Hence, its receive 401 402 // member must be called to end it 402 verifyf( __actor_executor_, "Creating actor before calling start_actor_system() can cause undefined behaviour.\n" );403 DEBUG_ABORT( __actor_executor_ == 0p, "Creating actor before calling start_actor_system() can cause undefined behaviour.\n" ); 403 404 allocation_ = Nodelete; 404 405 ticket = __get_next_ticket( *__actor_executor_ ); … … 430 431 431 432 struct message { 432 Allocation allocation_; // allocation action433 allocation allocation_; // allocation action 433 434 inline virtual_dtor; 434 435 }; … … 437 438 this.allocation_ = Nodelete; 438 439 } 439 static inline void ?{}( message & this, Allocation allocation) {440 memcpy( &this.allocation_, &alloc ation, sizeof(allocation) ); // optimization to elide ctor441 verifyf( this.allocation_ != Finished, "The Finished Allocation status is not supported for message types.\n");440 static inline void ?{}( message & this, allocation alloc ) { 441 memcpy( &this.allocation_, &alloc, sizeof(allocation) ); // optimization to elide ctor 442 DEBUG_ABORT( this.allocation_ == Finished, "The Finished allocation status is not supported for message types.\n" ); 442 443 } 443 444 static inline void ^?{}( message & this ) with(this) { … … 447 448 static inline void check_message( message & this ) { 448 449 switch ( this.allocation_ ) { // analyze message status 449 case Nodelete: CFA_DEBUG( this.allocation_ = Finished); break;450 case Nodelete: CFA_DEBUG( this.allocation_ = Finished ); break; 450 451 case 
Delete: delete( &this ); break; 451 case Destroy: ^?{}( this); break;452 case Destroy: ^?{}( this ); break; 452 453 case Finished: break; 453 454 } // switch 454 455 } 455 static inline void set_allocation( message & this, Allocation state ) {456 static inline void set_allocation( message & this, allocation state ) { 456 457 this.allocation_ = state; 457 458 } 458 459 459 460 static inline void deliver_request( request & this ) { 460 this.receiver->allocation_ = this.fn( *this.receiver, *this.msg ); 461 check_message( *this.msg ); 462 check_actor( *this.receiver ); 461 DEBUG_ABORT( this.receiver->ticket == (unsigned long int)MAX, "Attempted to send message to deleted/dead actor\n" ); 462 this.base_receiver->allocation_ = this.fn( *this.receiver, *this.msg ); 463 check_message( *this.base_msg ); 464 check_actor( *this.base_receiver ); 463 465 } 464 466 … … 510 512 curr_steal_queue = request_queues[ i + vic_start ]; 511 513 // avoid empty queues and queues that are being operated on 512 if ( curr_steal_queue == 0p || curr_steal_queue->being_processed || is Empty( *curr_steal_queue->c_queue ) )514 if ( curr_steal_queue == 0p || curr_steal_queue->being_processed || is_empty( *curr_steal_queue->c_queue ) ) 513 515 continue; 514 516 … … 518 520 executor_->w_infos[id].msgs_stolen += curr_steal_queue->c_queue->count; 519 521 executor_->w_infos[id].stolen++; 522 if ( is_empty( *curr_steal_queue->c_queue ) ) executor_->w_infos[id].empty_stolen++; 520 523 // __atomic_add_fetch(&executor_->w_infos[victim_id].stolen_from, 1, __ATOMIC_RELAXED); 521 524 // replaced_queue[swap_idx]++; … … 557 560 } 558 561 562 #define CHECK_TERMINATION if ( unlikely( executor_->is_shutdown ) ) break Exit 559 563 void main( worker & this ) with(this) { 560 564 // #ifdef ACTOR_STATS … … 578 582 579 583 // check if queue is empty before trying to gulp it 580 if ( is Empty( *curr_work_queue->c_queue ) ) {584 if ( is_empty( *curr_work_queue->c_queue ) ) { 581 585 #ifdef __STEAL 582 586 empty_count++; … 
… 591 595 #endif // ACTOR_STATS 592 596 #ifdef __STEAL 593 if ( is Empty( *current_queue ) ) {594 if ( unlikely( no_steal ) ) continue;597 if ( is_empty( *current_queue ) ) { 598 if ( unlikely( no_steal ) ) { CHECK_TERMINATION; continue; } 595 599 empty_count++; 596 600 if ( empty_count < steal_threshold ) continue; 597 601 empty_count = 0; 602 603 CHECK_TERMINATION; // check for termination 598 604 599 605 __atomic_store_n( &executor_->w_infos[id].stamp, rdtscl(), __ATOMIC_RELAXED ); … … 607 613 } 608 614 #endif // __STEAL 609 while ( ! is Empty( *current_queue ) ) {615 while ( ! is_empty( *current_queue ) ) { 610 616 #ifdef ACTOR_STATS 611 617 executor_->w_infos[id].processed++; … … 613 619 &req = &remove( *current_queue ); 614 620 if ( !&req ) continue; 615 if ( req.stop ) break Exit;616 621 deliver_request( req ); 617 622 } … … 631 636 632 637 static inline void send( actor & this, request & req ) { 633 verifyf( this.ticket != (unsigned long int)MAX, "Attempted to send message to deleted/dead actor\n" );638 DEBUG_ABORT( this.ticket == (unsigned long int)MAX, "Attempted to send message to deleted/dead actor\n" ); 634 639 send( *__actor_executor_, req, this.ticket ); 635 640 } … … 641 646 __all_gulps = 0; 642 647 __total_failed_swaps = 0; 648 __total_empty_stolen = 0; 643 649 __all_processed = 0; 644 650 __num_actors_stats = 0; … … 654 660 } 655 661 656 // TODO: potentially revisit getting number of processors 657 // ( currently the value stored in active_cluster()->procs.total is often stale 658 // and doesn't reflect how many procs are allocated ) 659 // static inline void start_actor_system() { start_actor_system( active_cluster()->procs.total ); } 660 static inline void start_actor_system() { start_actor_system( 1 ); } 662 static inline void start_actor_system() { start_actor_system( get_proc_count( *active_cluster() ) ); } 661 663 662 664 static inline void start_actor_system( executor & this ) { … … 668 670 669 671 static inline void stop_actor_system() { 
670 park( ); // will receive signalwhen actor system is finished672 park( ); // will be unparked when actor system is finished 671 673 672 674 if ( !__actor_executor_passed ) delete( __actor_executor_ ); … … 680 682 // assigned at creation to __base_msg_finished to avoid unused message warning 681 683 message __base_msg_finished @= { .allocation_ : Finished }; 682 struct __ DeleteMsg { inline message; } DeleteMsg = __base_msg_finished;683 struct __ DestroyMsg { inline message; } DestroyMsg = __base_msg_finished;684 struct __ FinishedMsg { inline message; } FinishedMsg = __base_msg_finished;685 686 Allocation receive( actor & this, __DeleteMsg& msg ) { return Delete; }687 Allocation receive( actor & this, __DestroyMsg& msg ) { return Destroy; }688 Allocation receive( actor & this, __FinishedMsg& msg ) { return Finished; }689 684 struct __delete_msg_t { inline message; } delete_msg = __base_msg_finished; 685 struct __destroy_msg_t { inline message; } destroy_msg = __base_msg_finished; 686 struct __finished_msg_t { inline message; } finished_msg = __base_msg_finished; 687 688 allocation receive( actor & this, __delete_msg_t & msg ) { return Delete; } 689 allocation receive( actor & this, __destroy_msg_t & msg ) { return Destroy; } 690 allocation receive( actor & this, __finished_msg_t & msg ) { return Finished; } 691 -
libcfa/src/concurrency/atomic.hfa
rfa5e1aa5 rb7b3e41 10 10 // Created On : Thu May 25 15:22:46 2023 11 11 // Last Modified By : Peter A. Buhr 12 // Last Modified On : Thu May 25 15:24:45202313 // Update Count : 112 // Last Modified On : Wed Jun 14 07:48:57 2023 13 // Update Count : 52 14 14 // 15 15 16 #define LOAD( lock ) (__atomic_load_n( &(lock), __ATOMIC_SEQ_CST )) 17 #define LOADM( lock, memorder ) (__atomic_load_n( &(lock), memorder )) 18 #define STORE( lock, assn ) (__atomic_store_n( &(lock), assn, __ATOMIC_SEQ_CST )) 19 #define STOREM( lock, assn, memorder ) (__atomic_store_n( &(lock), assn, memorder )) 20 #define CLR( lock ) (__atomic_clear( &(lock), __ATOMIC_RELEASE )) 21 #define CLRM( lock, memorder ) (__atomic_clear( &(lock), memorder )) 22 #define TAS( lock ) (__atomic_test_and_set( &(lock), __ATOMIC_ACQUIRE )) 23 #define TASM( lock, memorder ) (__atomic_test_and_set( &(lock), memorder )) 24 #define CAS( change, comp, assn ) ({typeof(comp) __temp = (comp); __atomic_compare_exchange_n( &(change), &(__temp), (assn), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST ); }) 25 #define CASM( change, comp, assn, memorder... ) ({typeof(comp) * __temp = &(comp); __atomic_compare_exchange_n( &(change), &(__temp), (assn), false, memorder, memorder ); }) 26 #define CASV( change, comp, assn ) (__atomic_compare_exchange_n( &(change), &(comp), (assn), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST )) 27 #define CASVM( change, comp, assn, memorder... 
) (__atomic_compare_exchange_n( &(change), &(comp), (assn), false, memorder, memorder )) 28 #define FAS( change, assn ) (__atomic_exchange_n( &(change), (assn), __ATOMIC_SEQ_CST )) 29 #define FASM( change, assn, memorder ) (__atomic_exchange_n( &(change), (assn), memorder )) 30 #define FAI( change, Inc ) (__atomic_fetch_add( &(change), (Inc), __ATOMIC_SEQ_CST )) 31 #define FAIM( change, Inc, memorder ) (__atomic_fetch_add( &(change), (Inc), memorder )) 16 #define LOAD( val ) (LOADM( val, __ATOMIC_SEQ_CST)) 17 #define LOADM( val, memorder ) (__atomic_load_n( &(val), memorder)) 18 19 #define STORE( val, assn ) (STOREM( val, assn, __ATOMIC_SEQ_CST)) 20 #define STOREM( val, assn, memorder ) (__atomic_store_n( &(val), assn, memorder)) 21 22 #define TAS( lock ) (TASM( lock, __ATOMIC_ACQUIRE)) 23 #define TASM( lock, memorder ) (__atomic_test_and_set( &(lock), memorder)) 24 25 #define TASCLR( lock ) (TASCLRM( lock, __ATOMIC_RELEASE)) 26 #define TASCLRM( lock, memorder ) (__atomic_clear( &(lock), memorder)) 27 28 #define FAS( assn, replace ) (FASM(assn, replace, __ATOMIC_SEQ_CST)) 29 #define FASM( assn, replace, memorder ) (__atomic_exchange_n( &(assn), (replace), memorder)) 30 31 #define FAI( assn, Inc ) (__atomic_fetch_add( &(assn), (Inc), __ATOMIC_SEQ_CST)) 32 #define FAIM( assn, Inc, memorder ) (__atomic_fetch_add( &(assn), (Inc), memorder)) 33 34 // Use __sync because __atomic with 128-bit CAA can result in calls to pthread_mutex_lock. 35 36 // #define CAS( assn, comp, replace ) (CASM( assn, comp, replace, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) 37 // #define CASM( assn, comp, replace, memorder... ) ({ typeof(comp) __temp = (comp); __atomic_compare_exchange_n( &(assn), &(__temp), (replace), false, memorder ); }) 38 #define CAS( assn, comp, replace ) (__sync_bool_compare_and_swap( &assn, comp, replace)) 39 #define CASM( assn, comp, replace, memorder... 
) _Static_assert( false, "memory order unsupported for CAS macro" ); 40 41 // #define CASV( assn, comp, replace ) (__atomic_compare_exchange_n( &(assn), &(comp), (replace), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST )) 42 // #define CASVM( assn, comp, replace, memorder... ) (__atomic_compare_exchange_n( &(assn), &(comp), (replace), false, memorder, memorder )) 43 #define CASV( assn, comp, replace ) ({ \ 44 typeof(comp) temp = comp; \ 45 typeof(comp) old = __sync_val_compare_and_swap( &(assn), (comp), (replace) ); \ 46 old == temp ? true : (comp = old, false); \ 47 }) 48 #define CASVM( assn, comp, replace, memorder... ) _Static_assert( false, "memory order unsupported for CASV macro" ); -
libcfa/src/concurrency/kernel.hfa
rfa5e1aa5 rb7b3e41 195 195 // Total number of processors 196 196 unsigned total; 197 198 // Number of processors constructed 199 // incremented at construction time, total is incremented once the processor asyncronously registers 200 unsigned constructed; 197 201 198 202 // Total number of idle processors … … 297 301 static inline struct cluster * active_cluster () { return publicTLS_get( this_processor )->cltr; } 298 302 303 // gets the number of constructed processors on the cluster 304 static inline unsigned get_proc_count( cluster & this ) { return this.procs.constructed; } 305 299 306 // set the number of internal processors 300 307 // these processors are in addition to any explicitly declared processors -
libcfa/src/concurrency/kernel/cluster.hfa
rfa5e1aa5 rb7b3e41 40 40 41 41 // convert to log2 scale but using double 42 static inline __readyQ_avg_t __to_readyQ_avg(unsigned long long intsc) { if(unlikely(0 == intsc)) return 0.0; else return log2( intsc); }42 static inline __readyQ_avg_t __to_readyQ_avg(unsigned long long intsc) { if(unlikely(0 == intsc)) return 0.0; else return log2((__readyQ_avg_t)intsc); } 43 43 44 44 #define warn_large_before warnf( !strict || old_avg < 35.0, "Suspiciously large previous average: %'lf, %'" PRId64 "ms \n", old_avg, program()`ms ) -
libcfa/src/concurrency/kernel/startup.cfa
rfa5e1aa5 rb7b3e41 528 528 this.name = name; 529 529 this.cltr = &_cltr; 530 __atomic_add_fetch( &_cltr.procs.constructed, 1u, __ATOMIC_RELAXED ); 530 531 this.rdq.its = 0; 531 532 this.rdq.itr = 0; … … 595 596 __cfadbg_print_safe(runtime_core, "Kernel : core %p signaling termination\n", &this); 596 597 598 __atomic_sub_fetch( &this.cltr->procs.constructed, 1u, __ATOMIC_RELAXED ); 599 597 600 __atomic_store_n(&do_terminate, true, __ATOMIC_RELAXED); 598 601 __disable_interrupts_checked(); … … 615 618 this.fdw = 0p; 616 619 this.idle = 0; 620 this.constructed = 0; 617 621 this.total = 0; 618 622 } -
libcfa/src/concurrency/locks.hfa
rfa5e1aa5 rb7b3e41 32 32 #include "select.hfa" 33 33 34 #include <fstream.hfa>35 36 34 // futex headers 37 35 #include <linux/futex.h> /* Definition of FUTEX_* constants */
Note:
See TracChangeset
for help on using the changeset viewer.