Context Navigation

-                      r3d5fba21
+                      rbeabdf3
 \begin{cfa}
 struct derived_actor {
     inline actor;       // Plan-9 C inheritance
+        inline actor;      // Plan-9 C inheritance
 };
 void ?{}( derived_actor & this ) { // Default ctor
     ((actor &)this){};  // Call to actor ctor
+        ((actor &)this){};  // Call to actor ctor
+}
 struct derived_msg {
     inline message;     // Plan-9 C nominal inheritance
     char word[12];
+        inline message;  // Plan-9 C nominal inheritance
+        char word[12];
 };
 void ?{}( derived_msg & this, char * new_word ) { // Overloaded ctor
     ((message &) this){ Nodelete }; // Passing allocation to ctor
     strcpy(this.word, new_word);
+        ((message &) this){ Nodelete }; // Passing allocation to ctor
+        strcpy(this.word, new_word);
+}
 Allocation receive( derived_actor & receiver, derived_msg & msg ) {
     printf("The message contained the string: %s\n", msg.word);
     return Finished; // Return finished since actor is done
+        printf("The message contained the string: %s\n", msg.word);
+        return Finished; // Return finished since actor is done
+}
 int main() {
     start_actor_system(); // Sets up executor
     derived_actor my_actor;
     derived_msg my_msg{ "Hello World" }; // Constructor call
     my_actor << my_msg;   // Send message via left shift operator
     stop_actor_system(); // Waits until actors are finished
     return 0;
+        start_actor_system(); // Sets up executor
+        derived_actor my_actor;
+        derived_msg my_msg{ "Hello World" }; // Constructor call
+        my_actor << my_msg;   // Send message via left shift operator
+        stop_actor_system(); // Waits until actors are finished
+        return 0;
+}
 \end{cfa}
 …
 \section{Envelopes}\label{s:envelope}
 In actor systems messages are sent and received by actors.
 When a actor receives a message it  executes its behaviour that is associated with that message type.
+When an actor receives a message it executes the behaviour associated with that message type.
 However, the unit of work that stores the message, the receiving actor's address, and other pertinent information needs to persist between the send and the receive.
 Furthermore the unit of work needs to be able to be stored in some fashion, usually in a queue, until it is executed by an actor.
 …
 While other systems are concerned with stealing actors, the \CFA actor system steals queues.
 This is a result of \CFA's use of the inverted actor system.
  The goal of the \CFA actor work stealing mechanism is to have a zero-victim-cost stealing mechanism.
+The goal of the \CFA actor work stealing mechanism is to have a zero-victim-cost stealing mechanism.
 This does not mean that stealing has no cost.
 Rather, the goal is to ensure that stealing work does not impact the performance of victim workers.
 …
 \begin{cfa}
 void swap( uint victim_idx, uint my_idx  ) {
     // Step 0:
     work_queue * my_queue = request_queues[my_idx];
     work_queue * vic_queue = request_queues[victim_idx];
     // Step 2:
     request_queues[my_idx] = 0p;
     // Step 3:
     request_queues[victim_idx] = my_queue;
     // Step 4:
     request_queues[my_idx] = vic_queue;
+void swap( uint victim_idx, uint my_idx ) {
+        // Step 0:
+        work_queue * my_queue = request_queues[my_idx];
+        work_queue * vic_queue = request_queues[victim_idx];
+        // Step 2:
+        request_queues[my_idx] = 0p;
+        // Step 3:
+        request_queues[victim_idx] = my_queue;
+        // Step 4:
+        request_queues[my_idx] = vic_queue;
+}
 \end{cfa}
 …
 // This routine is atomic
 bool CAS( work_queue ** ptr, work_queue ** old, work_queue * new ) {
     if ( *ptr != *old )
         return false;
     *ptr = new;
     return true;
+        if ( *ptr != *old )
+                return false;
+        *ptr = new;
+        return true;
+}
 bool try_swap_queues( worker & this, uint victim_idx, uint my_idx ) with(this) {
     // Step 0:
     // request_queues is the shared array of all sharded queues
     work_queue * my_queue = request_queues[my_idx];
     work_queue * vic_queue = request_queues[victim_idx];
     // Step 1:
     // If either queue is 0p then they are in the process of being stolen
     // 0p is CForAll's equivalent of C++'s nullptr
     if ( vic_queue == 0p ) return false;
     // Step 2:
     // Try to set thief's queue ptr to be 0p.
     // If this CAS fails someone stole thief's queue so return false
     if ( !CAS( &request_queues[my_idx], &my_queue, 0p ) )
         return false;
     // Step 3:
     // Try to set victim queue ptr to be thief's queue ptr.
     // If it fails someone stole the other queue, so fix up then return false
     if ( !CAS( &request_queues[victim_idx], &vic_queue, my_queue ) ) {
         request_queues[my_idx] = my_queue; // reset queue ptr back to prev val
         return false;
+    }
     // Step 4:
     // Successfully swapped.
     // Thief's ptr is 0p so no one will touch it
     // Write back without CAS is safe
     request_queues[my_idx] = vic_queue;
     return true;
+        // Step 0:
+        // request_queues is the shared array of all sharded queues
+        work_queue * my_queue = request_queues[my_idx];
+        work_queue * vic_queue = request_queues[victim_idx];
+        // Step 1:
+        // If the victim's queue ptr is 0p then a swap involving it is already in progress
+        // 0p is CForAll's equivalent of C++'s nullptr
+        if ( vic_queue == 0p ) return false;
+        // Step 2:
+        // Try to set thief's queue ptr to be 0p.
+        // If this CAS fails someone stole thief's queue so return false
+        if ( !CAS( &request_queues[my_idx], &my_queue, 0p ) )
+                return false;
+        // Step 3:
+        // Try to set victim queue ptr to be thief's queue ptr.
+        // If it fails someone stole the other queue, so fix up then return false
+        if ( !CAS( &request_queues[victim_idx], &vic_queue, my_queue ) ) {
+                request_queues[my_idx] = my_queue; // reset queue ptr back to prev val
+                return false;
+        }
+        // Step 4:
+        // Successfully swapped.
+        // Thief's ptr is 0p so no one will touch it
+        // Write back without CAS is safe
+        request_queues[my_idx] = vic_queue;
+        return true;
+}
 \end{cfa}\label{c:swap}
 …
 \label{t:StaticActorMessagePerformance}
 \begin{tabular}{*{5}{r|}r}
     & \multicolumn{1}{c|}{\CFA (100M)} & \multicolumn{1}{c|}{CAF (10M)} & \multicolumn{1}{c|}{Akka (100M)} & \multicolumn{1}{c|}{\uC (100M)} & \multicolumn{1}{c@{}}{ProtoActor (100M)} \\
+    \hline
     AMD         & \input{data/pykeSendStatic} \\
+    \hline
     Intel       & \input{data/nasusSendStatic}
+        & \multicolumn{1}{c|}{\CFA (100M)} & \multicolumn{1}{c|}{CAF (10M)} & \multicolumn{1}{c|}{Akka (100M)} & \multicolumn{1}{c|}{\uC (100M)} & \multicolumn{1}{c@{}}{ProtoActor (100M)} \\
+        \hline
+        AMD             & \input{data/pykeSendStatic} \\
+        \hline
+        Intel   & \input{data/nasusSendStatic}
 \end{tabular}
 …
 \begin{tabular}{*{5}{r|}r}
     & \multicolumn{1}{c|}{\CFA (20M)} & \multicolumn{1}{c|}{CAF (2M)} & \multicolumn{1}{c|}{Akka (2M)} & \multicolumn{1}{c|}{\uC (20M)} & \multicolumn{1}{c@{}}{ProtoActor (2M)} \\
+    \hline
     AMD         & \input{data/pykeSendDynamic} \\
+    \hline
     Intel       & \input{data/nasusSendDynamic}
+        & \multicolumn{1}{c|}{\CFA (20M)} & \multicolumn{1}{c|}{CAF (2M)} & \multicolumn{1}{c|}{Akka (2M)} & \multicolumn{1}{c|}{\uC (20M)} & \multicolumn{1}{c@{}}{ProtoActor (2M)} \\
+        \hline
+        AMD             & \input{data/pykeSendDynamic} \\
+        \hline
+        Intel   & \input{data/nasusSendDynamic}
 \end{tabular}
 \end{table}
 …
 In the static send benchmark all systems except CAF have static send costs that are in the same ballpark, only varying by $\approx$70ns.
 In the dynamic send benchmark all systems experience slower message sends, as expected due to the extra allocations.
 However,  Akka and ProtoActor, slow down by a more significant margin than the \uC and \CFA.
+However, Akka and ProtoActor slow down by a more significant margin than \uC and \CFA.
 This is likely a result of Akka and ProtoActor's garbage collection, which can suffer from hits in performance for allocation heavy workloads, whereas \uC and \CFA have explicit allocation/deallocation.
 …
 \begin{figure}
+    \centering
+    \begin{subfigure}{0.5\textwidth}
+        \centering
+        \scalebox{0.5}{\input{figures/nasusCFABalance-One.pgf}}
+        \subcaption{AMD \CFA Balance-One Benchmark}
+        \label{f:BalanceOneAMD}
+    \end{subfigure}\hfill
+    \begin{subfigure}{0.5\textwidth}
+        \centering
+        \scalebox{0.5}{\input{figures/pykeCFABalance-One.pgf}}
+        \subcaption{Intel \CFA Balance-One Benchmark}
+        \label{f:BalanceOneIntel}
+    \end{subfigure}
+    \caption{The balance-one benchmark comparing stealing heuristics (lower is better).}
+\end{figure}
+\begin{figure}
+    \centering
+    \begin{subfigure}{0.5\textwidth}
+        \centering
+        \scalebox{0.5}{\input{figures/nasusCFABalance-Multi.pgf}}
+        \subcaption{AMD \CFA Balance-Multi Benchmark}
+        \label{f:BalanceMultiAMD}
+    \end{subfigure}\hfill
+    \begin{subfigure}{0.5\textwidth}
+        \centering
+        \scalebox{0.5}{\input{figures/pykeCFABalance-Multi.pgf}}
+        \subcaption{Intel \CFA Balance-Multi Benchmark}
+        \label{f:BalanceMultiIntel}
+    \end{subfigure}
+    \caption{The balance-multi benchmark comparing stealing heuristics (lower is better).}
+        \centering
+        \subfloat[AMD \CFA Balance-One Benchmark]{
+                \resizebox{0.5\textwidth}{!}{\input{figures/nasusCFABalance-One.pgf}}
+                \label{f:BalanceOneAMD}
+        }
+        \subfloat[Intel \CFA Balance-One Benchmark]{
+                \resizebox{0.5\textwidth}{!}{\input{figures/pykeCFABalance-One.pgf}}
+                \label{f:BalanceOneIntel}
+        }
+        \caption{The balance-one benchmark comparing stealing heuristics (lower is better).}
+\end{figure}
+\begin{figure}
+        \centering
+        \subfloat[AMD \CFA Balance-Multi Benchmark]{
+                \resizebox{0.5\textwidth}{!}{\input{figures/nasusCFABalance-Multi.pgf}}
+                \label{f:BalanceMultiAMD}
+        }
+        \subfloat[Intel \CFA Balance-Multi Benchmark]{
+                \resizebox{0.5\textwidth}{!}{\input{figures/pykeCFABalance-Multi.pgf}}
+                \label{f:BalanceMultiIntel}
+        }
+        \caption{The balance-multi benchmark comparing stealing heuristics (lower is better).}
 \end{figure}
 …
 \begin{figure}
+    \centering
+    \begin{subfigure}{0.5\textwidth}
+        \centering
+        \scalebox{0.5}{\input{figures/nasusExecutor.pgf}}
+        \subcaption{AMD Executor Benchmark}
+        \label{f:ExecutorAMD}
+    \end{subfigure}\hfill
+    \begin{subfigure}{0.5\textwidth}
+        \centering
+        \scalebox{0.5}{\input{figures/pykeExecutor.pgf}}
+        \subcaption{Intel Executor Benchmark}
+        \label{f:ExecutorIntel}
+    \end{subfigure}
+    \caption{The executor benchmark comparing actor systems (lower is better).}
+        \centering
+        \subfloat[AMD Executor Benchmark]{
+                \resizebox{0.5\textwidth}{!}{\input{figures/nasusExecutor.pgf}}
+                \label{f:ExecutorAMD}
+        }
+        \subfloat[Intel Executor Benchmark]{
+                \resizebox{0.5\textwidth}{!}{\input{figures/pykeExecutor.pgf}}
+                \label{f:ExecutorIntel}
+        }
+        \caption{The executor benchmark comparing actor systems (lower is better).}
 \end{figure}
 …
 \begin{figure}
+    \centering
+    \begin{subfigure}{0.5\textwidth}
+        \centering
+        \scalebox{0.5}{\input{figures/nasusCFAExecutor.pgf}}
+        \subcaption{AMD \CFA Executor Benchmark}\label{f:cfaExecutorAMD}
+    \end{subfigure}\hfill
+    \begin{subfigure}{0.5\textwidth}
+        \centering
+        \scalebox{0.5}{\input{figures/pykeCFAExecutor.pgf}}
+        \subcaption{Intel \CFA Executor Benchmark}\label{f:cfaExecutorIntel}
+    \end{subfigure}
+    \caption{Executor benchmark comparing \CFA stealing heuristics (lower is better).}
+        \centering
+        \subfloat[AMD \CFA Executor Benchmark]{
+                \resizebox{0.5\textwidth}{!}{\input{figures/nasusCFAExecutor.pgf}}
+                \label{f:cfaExecutorAMD}
+        }
+        \subfloat[Intel \CFA Executor Benchmark]{
+                \resizebox{0.5\textwidth}{!}{\input{figures/pykeCFAExecutor.pgf}}
+                \label{f:cfaExecutorIntel}
+        }
+        \caption{Executor benchmark comparing \CFA stealing heuristics (lower is better).}
 \end{figure}
 …
 \begin{figure}
+    \centering
+    \begin{subfigure}{0.5\textwidth}
+        \centering
+        \scalebox{0.5}{\input{figures/nasusRepeat.pgf}}
+        \subcaption{AMD Repeat Benchmark}\label{f:RepeatAMD}
+    \end{subfigure}\hfill
+    \begin{subfigure}{0.5\textwidth}
+        \centering
+        \scalebox{0.5}{\input{figures/pykeRepeat.pgf}}
+        \subcaption{Intel Repeat Benchmark}\label{f:RepeatIntel}
+    \end{subfigure}
+    \caption{The repeat benchmark comparing actor systems (lower is better).}
+        \centering
+        \subfloat[AMD Repeat Benchmark]{
+                \resizebox{0.5\textwidth}{!}{\input{figures/nasusRepeat.pgf}}
+                \label{f:RepeatAMD}
+        }
+        \subfloat[Intel Repeat Benchmark]{
+                \resizebox{0.5\textwidth}{!}{\input{figures/pykeRepeat.pgf}}
+                \label{f:RepeatIntel}
+        }
+        \caption{The repeat benchmark comparing actor systems (lower is better).}
 \end{figure}
 …
 \begin{figure}
+    \centering
+    \begin{subfigure}{0.5\textwidth}
+        \centering
+        \scalebox{0.5}{\input{figures/nasusCFARepeat.pgf}}
+        \subcaption{AMD \CFA Repeat Benchmark}\label{f:cfaRepeatAMD}
+    \end{subfigure}\hfill
+    \begin{subfigure}{0.5\textwidth}
+        \centering
+        \scalebox{0.5}{\input{figures/pykeCFARepeat.pgf}}
+        \subcaption{Intel \CFA Repeat Benchmark}\label{f:cfaRepeatIntel}
+    \end{subfigure}
+    \caption{The repeat benchmark comparing \CFA stealing heuristics (lower is better).}
+        \centering
+        \subfloat[AMD \CFA Repeat Benchmark]{
+                \resizebox{0.5\textwidth}{!}{\input{figures/nasusCFARepeat.pgf}}
+                \label{f:cfaRepeatAMD}
+        }
+        \subfloat[Intel \CFA Repeat Benchmark]{
+                \resizebox{0.5\textwidth}{!}{\input{figures/pykeCFARepeat.pgf}}
+                \label{f:cfaRepeatIntel}
+        }
+        \caption{The repeat benchmark comparing \CFA stealing heuristics (lower is better).}
 \end{figure}
 …
 \begin{table}[t]
     \centering
     \setlength{\extrarowheight}{2pt}
     \setlength{\tabcolsep}{5pt}
     \caption{Executor Program Memory High Watermark}
     \label{t:ExecutorMemory}
     \begin{tabular}{*{5}{r|}r}
         & \multicolumn{1}{c|}{\CFA} & \multicolumn{1}{c|}{CAF} & \multicolumn{1}{c|}{Akka} & \multicolumn{1}{c|}{\uC} & \multicolumn{1}{c@{}}{ProtoActor} \\
+        \hline
         AMD             & \input{data/pykeExecutorMem} \\
+        \hline
         Intel   & \input{data/nasusExecutorMem}
     \end{tabular}
+        \centering
+        \setlength{\extrarowheight}{2pt}
+        \setlength{\tabcolsep}{5pt}
+        \caption{Executor Program Memory High Watermark}
+        \label{t:ExecutorMemory}
+        \begin{tabular}{*{5}{r|}r}
+                & \multicolumn{1}{c|}{\CFA} & \multicolumn{1}{c|}{CAF} & \multicolumn{1}{c|}{Akka} & \multicolumn{1}{c|}{\uC} & \multicolumn{1}{c@{}}{ProtoActor} \\
+                \hline
+                AMD             & \input{data/pykeExecutorMem} \\
+                \hline
+                Intel   & \input{data/nasusExecutorMem}
+        \end{tabular}
 \end{table}
 …
 \begin{figure}
     \centering
+    \begin{subfigure}{0.5\textwidth}
+        \centering
         \scalebox{0.5}{\input{figures/nasusMatrix.pgf}}
         \subcaption{AMD Matrix Benchmark}\label{f:MatrixAMD}
+    \end{subfigure}\hfill
     \begin{subfigure}{0.5\textwidth}
+        \centering
         \scalebox{0.5}{\input{figures/pykeMatrix.pgf}}
         \subcaption{Intel Matrix Benchmark}\label{f:MatrixIntel}
     \end{subfigure}
+    \caption{The matrix benchmark comparing actor systems (lower is better).}
 \end{figure}
+\begin{figure}
+    \centering
     \begin{subfigure}{0.5\textwidth}
+        \centering
+        \scalebox{0.5}{\input{figures/nasusCFAMatrix.pgf}}
         \subcaption{AMD \CFA Matrix Benchmark}\label{f:cfaMatrixAMD}
+    \end{subfigure}\hfill
     \begin{subfigure}{0.5\textwidth}
+        \centering
         \scalebox{0.5}{\input{figures/pykeCFAMatrix.pgf}}
+        \subcaption{Intel \CFA Matrix Benchmark}\label{f:cfaMatrixIntel}
+    \end{subfigure}
+    \caption{The matrix benchmark comparing \CFA stealing heuristics (lower is better).}
+\end{figure}
+        \centering
+        \subfloat[AMD Matrix Benchmark]{
+                \resizebox{0.5\textwidth}{!}{\input{figures/nasusMatrix.pgf}}
+                \label{f:MatrixAMD}
+        }
+        \subfloat[Intel Matrix Benchmark]{
+                \resizebox{0.5\textwidth}{!}{\input{figures/pykeMatrix.pgf}}
+                \label{f:MatrixIntel}
+        }
+        \caption{The matrix benchmark comparing actor systems (lower is better).}
+\end{figure}
+\begin{figure}
+        \centering
+        \subfloat[AMD \CFA Matrix Benchmark]{
+                \resizebox{0.5\textwidth}{!}{\input{figures/nasusCFAMatrix.pgf}}
+                \label{f:cfaMatrixAMD}
+        }
+        \subfloat[Intel \CFA Matrix Benchmark]{
+                \resizebox{0.5\textwidth}{!}{\input{figures/pykeCFAMatrix.pgf}}
+                \label{f:cfaMatrixIntel}
+        }
+        \caption{The matrix benchmark comparing \CFA stealing heuristics (lower is better).}
+\end{figure}
+% Local Variables: %
+% tab-width: 4 %
+% End: %

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset beabdf3

Legend:

doc/theses/colby_parsons_MMAth/text/actors.tex

Download in other formats: