Index: doc/theses/thierry_delisle_PhD/code/bts.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/bts.cpp	(revision 778452d93cf0c86a2356479f21e59dd07e85f21c)
+++ doc/theses/thierry_delisle_PhD/code/bts.cpp	(revision 778452d93cf0c86a2356479f21e59dd07e85f21c)
@@ -0,0 +1,279 @@
+#include <array>
+#include <iomanip>
+#include <iostream>
+#include <locale>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include <getopt.h>
+#include <unistd.h>
+#include <sys/sysinfo.h>
+
+#include "utils.hpp"
+
+// ================================================================================================
+//                        UTILS
+// ================================================================================================
+
+struct local_stat_t {
+	size_t cnt = 0;
+};
+
+struct global_stat_t {
+	std::atomic_size_t cnt = { 0 };
+};
+
+void atomic_max(std::atomic_size_t & target, size_t value) {
+	for(;;) {
+		size_t expect = target.load(std::memory_order_relaxed);
+		if(value <= expect) return;
+		bool success = target.compare_exchange_strong(expect, value);
+		if(success) return;
+	}
+}
+
+void atomic_min(std::atomic_size_t & target, size_t value) {
+	for(;;) {
+		size_t expect = target.load(std::memory_order_relaxed);
+		if(value >= expect) return;
+		bool success = target.compare_exchange_strong(expect, value);
+		if(success) return;
+	}
+}
+
+void tally_stats(global_stat_t & global, local_stat_t & local) {
+	global.cnt   += local.cnt;
+}
+
+void waitfor(double & duration, barrier_t & barrier, std::atomic_bool & done) {
+	std::cout << "Starting" << std::endl;
+	auto before = Clock::now();
+	barrier.wait(0);
+
+	while(true) {
+		usleep(100000);
+		auto now = Clock::now();
+		duration_t durr = now - before;
+		if( durr.count() > duration ) {
+			done = true;
+			break;
+		}
+		std::cout << "\r" << std::setprecision(4) << durr.count();
+		std::cout.flush();
+	}
+
+	barrier.wait(0);
+	auto after = Clock::now();
+	duration_t durr = after - before;
+	duration = durr.count();
+	std::cout << "\rClosing down" << std::endl;
+}
+
+void waitfor(double & duration, barrier_t & barrier, const std::atomic_size_t & count) {
+	std::cout << "Starting" << std::endl;
+	auto before = Clock::now();
+	barrier.wait(0);
+
+	while(true) {
+		usleep(100000);
+		size_t c = count.load();
+		if( c == 0 ) {
+			break;
+		}
+		std::cout << "\r" << c;
+		std::cout.flush();
+	}
+
+	barrier.wait(0);
+	auto after = Clock::now();
+	duration_t durr = after - before;
+	duration = durr.count();
+	std::cout << "\rClosing down" << std::endl;
+}
+
+void print_stats(double duration, unsigned nthread, global_stat_t & global) {
+	std::cout << "Done" << std::endl;
+
+	size_t ops = global.cnt;
+	size_t ops_sec = size_t(double(ops) / duration);
+	size_t ops_thread = ops_sec / nthread;
+	auto dur_nano = duration_cast<std::nano>(1.0);
+
+	std::cout << "Duration      : " << duration << "s\n";
+	std::cout << "ns/Op         : " << ( dur_nano / ops_thread )<< "\n";
+	std::cout << "Ops/sec/thread: " << ops_thread << "\n";
+	std::cout << "Ops/sec       : " << ops_sec << "\n";
+	std::cout << "Total ops     : " << ops << "\n";
+}
+
+static inline bool bts(std::atomic_size_t & target, size_t bit ) {
+	/*
+	int result = 0;
+	asm volatile(
+		"LOCK btsq %[bit], %[target]\n\t"
+		:"=@ccc" (result)
+		: [target] "m" (target), [bit] "r" (bit)
+	);
+ 	return result != 0;
+	/*/
+	size_t mask = 1ul << bit;
+	size_t ret = target.fetch_or(mask, std::memory_order_relaxed);
+	return (ret & mask) != 0;
+	//*/
+}
+
+static inline bool btr(std::atomic_size_t & target, size_t bit ) {
+	/*
+	int result = 0;
+	asm volatile(
+		"LOCK btrq %[bit], %[target]\n\t"
+		:"=@ccc" (result)
+		: [target] "m" (target), [bit] "r" (bit)
+	);
+ 	return result != 0;
+	/*/
+	size_t mask = 1ul << bit;
+	size_t ret = target.fetch_and(~mask, std::memory_order_relaxed);
+	return (ret & mask) != 0;
+	//*/
+}
+
+// ================================================================================================
+//                        EXPERIMENTS
+// ================================================================================================
+
+// ================================================================================================
+__attribute__((noinline)) void runPingPong_body(
+	std::atomic<bool>& done,
+	local_stat_t & local,
+	std::atomic_size_t & target,
+	size_t id
+) {
+	while(__builtin_expect(!done.load(std::memory_order_relaxed), true)) {
+
+		bool ret;
+		ret = bts(target, id);
+		assert(!ret);
+
+		// -----
+
+		ret = btr(target, id);
+		assert(ret);
+		local.cnt++;
+	}
+}
+
+void run(unsigned nthread, double duration) {
+	// Barrier for synchronization
+	barrier_t barrier(nthread + 1);
+
+	// Data to check everything is OK
+	global_stat_t global;
+
+	// Flag to signal termination
+	std::atomic_bool done  = { false };
+
+	std::cout << "Initializing ";
+	// List being tested
+	std::atomic_size_t word = { 0 };
+	{
+		std::thread * threads[nthread];
+		unsigned i = 1;
+		for(auto & t : threads) {
+			t = new std::thread([&done, &word, &barrier, &global](unsigned tid) {
+				local_stat_t local;
+
+				// affinity(tid);
+
+				barrier.wait(tid);
+
+				// EXPERIMENT START
+
+				runPingPong_body(done, local, word, tid - 1);
+
+				// EXPERIMENT END
+
+				barrier.wait(tid);
+
+				tally_stats(global, local);
+			}, i++);
+		}
+
+		waitfor(duration, barrier, done);
+
+		for(auto t : threads) {
+			t->join();
+			delete t;
+		}
+	}
+
+	print_stats(duration, nthread, global);
+}
+
+// ================================================================================================
+
+int main(int argc, char * argv[]) {
+
+	double duration   = 5.0;
+	unsigned nthreads = 2;
+
+	std::cout.imbue(std::locale(""));
+
+	for(;;) {
+		static struct option options[] = {
+			{"duration",  required_argument, 0, 'd'},
+			{"nthreads",  required_argument, 0, 't'},
+			{0, 0, 0, 0}
+		};
+
+		int idx = 0;
+		int opt = getopt_long(argc, argv, "d:t:", options, &idx);
+
+		std::string arg = optarg ? optarg : "";
+		size_t len = 0;
+		switch(opt) {
+			case -1:
+				if(optind != argc) {
+					std::cerr << "Too many arguments " << argc << " " << idx << std::endl;
+					goto usage;
+				}
+				goto run;
+			// Numeric Arguments
+			case 'd':
+				try {
+					duration = std::stod(optarg, &len);
+					if(len != arg.size()) { throw std::invalid_argument(""); }
+				} catch(std::invalid_argument &) {
+					std::cerr << "Duration must be a valid double, was " << arg << std::endl;
+					goto usage;
+				}
+				break;
+			case 't':
+				try {
+					nthreads = std::stoul(optarg, &len);
+					if(len != arg.size() || nthreads > (8 * sizeof(size_t))) { throw std::invalid_argument(""); }
+				} catch(std::invalid_argument &) {
+					std::cerr << "Number of threads must be a positive integer less than or equal to " << sizeof(size_t) * 8 << ", was " << arg << std::endl;
+					goto usage;
+				}
+				break;
+			// Other cases
+			default: /* ? */
+				std::cerr << opt << std::endl;
+			usage:
+				std::cerr << "Usage: " << argv[0] << ": [options]" << std::endl;
+				std::cerr << std::endl;
+				std::cerr << "  -d, --duration=DURATION  Duration of the experiment, in seconds" << std::endl;
+				std::cerr << "  -t, --nthreads=NTHREADS  Number of kernel threads" << std::endl;
+				std::exit(1);
+		}
+	}
+	run:
+
+	check_cache_line_size();
+
+	std::cout << "Running " << nthreads << " threads for " << duration << " seconds" << std::endl;
+	run(nthreads, duration);
+	return 0;
+}
Index: doc/theses/thierry_delisle_PhD/comp_II/comp_II.tex
===================================================================
--- doc/theses/thierry_delisle_PhD/comp_II/comp_II.tex	(revision c5fdebfa78715e593b97117e52e43a2217086c4f)
+++ doc/theses/thierry_delisle_PhD/comp_II/comp_II.tex	(revision 778452d93cf0c86a2356479f21e59dd07e85f21c)
@@ -56,7 +56,7 @@
 \CFA\cit is a modern, polymorphic, non-object-oriented, backwards-compatible extension of the C programming language. It aims to add high-productivity features while maintaning the predictible performance of C. As such, concurrency in \CFA\cit aims to offer simple and safe high-level tools while still allowing performant code. \CFA concurrrent code is written in the synchronous programming paradigm but uses \glspl{uthrd} in order to achieve the simplicity and maintainability of synchronous programming without sacrificing the efficiency of asynchronous programing. As such, the \CFA \emph{scheduler} is a preemptive user-level scheduler that maps \glspl{uthrd} onto \glspl{kthrd}.
 
-Scheduling occurs when execution switches from one thread to another, where the second thread is implicitly chosen by the scheduler. This scheduling is an indirect handoff, as opposed to generators and coroutines which explicitly switch to the next generator and coroutine respectively. The cost of switching between two threads for an indirect handoff has two components : the cost of actually context-switching, i.e., changing the relevant registers to move execution from one thread to the other, and the cost of scheduling, i.e., deciding which thread to run next among all the threads ready to run. The first cost is generally constant and fixed, while the scheduling cost can vary based on the system state\footnote{Affecting the context-switch cost is whether it is done in one step, after the scheduling, or in two steps, context-switching to a fixed third-thread before scheduling.}. Adding multiple \glspl{kthrd} does not fundamentally change the scheduler semantics or requirements, it simply adds new correctness requirements, i.e. \textit{linearizability}, and a new dimension to performance: scalability, where scheduling cost now also depends on contention.
-
-The more threads switch, the more the administrating cost of scheduling becomes noticeable. It is therefore important to build a scheduler with the lowest possible cost and latency. Another important consideration is \emph{fairness}. In principle, scheduling should give the illusion of perfect fairness, where all threads ready to run are running \emph{simultaneously}. While the illusion of simultaneity is easier to reason about, it can break down if the scheduler allows to much unfairness. Therefore, the scheduler should offer as much fairness as needed to guarantee eventual progress, but use unfairness to help performance. In practice, threads must wait in turn but there can be advantages to unfair scheduling, e.g., the express cash register at a grocery store.
+Scheduling occurs when execution switches from one thread to another, where the second thread is implicitly chosen by the scheduler. This scheduling is an indirect handoff, as opposed to generators and coroutines which explicitly switch to the next generator and coroutine respectively. The cost of switching between two threads for an indirect handoff has two components : the cost of actually context-switching, i.e., changing the relevant registers to move execution from one thread to the other, and the cost of scheduling, i.e., deciding which thread to run next among all the threads ready to run. The first cost is generally constant and fixed\footnote{Affecting the context-switch cost is whether it is done in one step, after the scheduling, or in two steps, context-switching to a fixed third-thread before scheduling.}, while the scheduling cost can vary based on the system state. Adding multiple \glspl{kthrd} does not fundamentally change the scheduler semantics or requirements, it simply adds new correctness requirements, i.e. \textit{linearizability}, and a new dimension to performance: scalability, where scheduling cost now also depends on contention.
+
+The more threads switch, the more the administration cost of scheduling becomes noticeable. It is therefore important to build a scheduler with the lowest possible cost and latency. Another important consideration is \emph{fairness}. In principle, scheduling should give the illusion of perfect fairness, where all threads ready to run are running \emph{simultaneously}. While the illusion of simultaneity is easier to reason about, it can break down if the scheduler allows to much unfairness. Therefore, the scheduler should offer as much fairness as needed to guarantee eventual progress, but use unfairness to help performance. In practice, threads must wait in turn but there can be advantages to unfair scheduling, similar to the the express cash register at a grocery store.
 
 The goal of this research is to produce a scheduler that is simple for programmers to understand and offers good performance. Here understandability does not refer to the API but to how much scheduling concerns programmers need to take into account when writing a \CFA concurrent package. Therefore, the main goal of this proposal is :
@@ -65,23 +65,23 @@
 \end{quote}
 
-For a general purpose scheduler, it is impossible to produce an optimal algorithm as it would require knowledge of the future behaviour of threads. As such, scheduling performance is generally either defined by the best case scenario, a workload to which the scheduler is tailored, or the worst case scenario, i.e., the scheduler behaves no worst than \emph{X}. For this proposal, the performance is evaluated using the second approach to allow \CFA programmers to rely on scheduling performance. A solution to this impossibility is to allow programmers to write their own scheduler, that is not the subject of this proposal, which considers only the default scheduler. As such, it is important that only programmers with exceptionally high performance requirements should need to write their own scheduler and replace the scheduler in this proposal.
-
-This objective includes producing a scheduling strategy with sufficient fairness guarantees, creating an abstraction layer over the operating system to handle kernel-threads spinning unnecessarily and hide blocking I/O operations, and writing sufficient library tools to allow developers to indirectly use the scheduler.
-
-% ===============================================================================
-% ===============================================================================
-
-\section{Scheduling for \CFA}
-While the \CFA concurrency package does not have any particular scheduling requirements beyond supporting \glspl{uthrd}. Therefore, the detailed requirements of the \CFA scheduler are :
+For a general purpose scheduler, it is impossible to produce an optimal algorithm as it would require knowledge of the future behaviour of threads. As such, scheduling performance is generally either defined by the best case scenario, i.e., a workload to which the scheduler is tailored, or the worst case scenario, i.e., the scheduler behaves no worst than \emph{X}. For this proposal, the performance is evaluated using the second approach to allow \CFA programmers to rely on scheduling performance. Be cause there is no optimal scheduler, ultimately \CFA may allow programmers to write their own scheduler; but that is not the subject of this proposal, which considers only the default scheduler. As such, it is important that only programmers with exceptionally high performance requirements should need to write their own scheduler and replace the scheduler in this proposal.
+
+Finally, the scheduling objective includes producing a scheduling strategy with sufficient fairness guarantees, creating an abstraction layer over the operating system to handle kernel-threads spinning unnecessarily, scheduling blocking I/O operations, and writing sufficient library tools to allow developers to indirectly use the scheduler.
+
+% ===============================================================================
+% ===============================================================================
+
+\section{\CFA Scheduling}
+To scheduler user-level threads across all workloads, the scheduler has a number of requirements:
 
 \paragraph{Correctness} As with any other concurrent data structure or algorithm, the correctness requirement is paramount. The scheduler cannot allow threads to be dropped from the ready-queue, i.e., scheduled but never run, or be executed multiple times when only being scheduled once. Since \CFA concurrency has no spurious wakeup, this definition of correctness also means the scheduler should have no spurious wakeup. The \CFA scheduler must be correct.
 
-\paragraph{Performance} The performance of a scheduler can generally be mesured in terms of scheduling cost, scalability and latency. Scheduling cost is the cost to switch from one thread to another, as mentioned above. For simple applications where a single kernel thread does most of the scheduling, it is generally the dominating cost. When adding many kernel threads, scalability can become an issue, effectively increasing the cost of context-switching when contention is high. Finally, a third axis of performance is tail latency. This measurement is related to fairness and mesures how long is needed for a thread to be run once scheduled but evaluated in the worst cases. The \CFA scheduler should offer good performance in all three metrics.
-
-\paragraph{Fairness} Like performance, this requirements has several aspect : eventual progress, predictability and performance reliablility. As a hard requirement, the \CFA scheduler must guarantee eventual progress, i.e., prevent starvation, otherwise the above mentioned illusion of simultaneous execution is broken and the scheduler becomes much more complext to reason about. Beyond this requirement, performance should be predictible and reliable, which means similar workloads achieve similar performance and programmer intuition is respected. An example of this is : a thread that yields agressively should not run more often then other tasks. While this is intuitive, it does not hold true for many work-stealing or feedback based schedulers. The \CFA scheduler must guarantee eventual progress and should be predictible and offer reliable performance.
+\paragraph{Performance} The performance of a scheduler can generally be mesured in terms of scheduling cost, scalability and latency. Scheduling cost is the cost to switch from one thread to another, as mentioned above. For simple applications where a single kernel thread does most of the scheduling, it is generally the dominating cost. When adding many kernel threads, scalability becomes an issue, effectively increasing the cost of context-switching when contention is high. Finally, a third axis of performance is tail latency. This measurement is related to fairness and mesures how long is needed for a thread to be run once scheduled and is evaluated in the worst cases. The \CFA scheduler should offer good performance in all three metrics.
+
+\paragraph{Fairness} Like performance, this requirements has several aspect : eventual progress, predictability and performance reliablility. As a hard requirement, the \CFA scheduler must guarantee eventual progress, i.e., prevent starvation, otherwise the above mentioned illusion of simultaneous execution is broken and the scheduler becomes much more complex to reason about. Beyond this requirement, performance should be predictible and reliable, which means similar workloads achieve similar performance and programmer intuition is respected. An example of this is : a thread that yields agressively should not run more often then other tasks. While this is intuitive, it does not hold true for many work-stealing or feedback based schedulers. The \CFA scheduler must guarantee eventual progress and should be predictible and offer reliable performance.
 
 \paragraph{Efficiency} Finally, efficient usage of CPU resources is also an important requirement. This issue is discussed more in depth towards the end of this proposal. It effectively refers to avoiding using CPU power when there are no threads to run, and conversely, use all CPUs available when the workload can benefit from it. Balancing these two states is where the complexity lies. The \CFA scheduler should be efficient with respect to the underlying (shared) computer.
 
-To achieve these requirements, I can reject two broad types of scheduling strategies : feedback-based and priority schedulers.
+\bigskip To achieve these requirements, I can reject two broad types of scheduling strategies : feedback-based and priority schedulers.
 
 \subsection{Feedback-Based Schedulers}
@@ -93,9 +93,9 @@
 \end{enumerate}
 
-While these two assumptions generally hold for operating systems, they may not for user-level threading. Since \CFA has the explicit goal of allowing many smaller threads, this can naturally lead to threads with much shorter lifetime, only being scheduled a few times. Scheduling strategies based on feedback cannot be effective in these cases because they do not have the opportunity to measure the metrics that underlie the algorithm. Note that the problem of feedback convergence (reacting too slowly to scheduling events) is not specific to short lived threads but can also occur with threads that show drastic changes in scheduling, e.g., threads running for long periods of time and then suddenly blocking and unblocking quickly and repeatedly.
-
-In the context of operating systems, these concerns can be overshadowed by a more pressing concern : security. When multiple users are involved, it is possible that some users are malevolent and try to exploit the scheduling strategy in order to achieve some nefarious objective. Security concerns mean that more precise and robust fairness metrics must be used to guarantee fairness across processes created by users as well as threads created within a process. In the case of the \CFA scheduler, every thread runs in the same user-space and are controlled by the same user. Fairness across users is therefore a given and it is then possible to safely ignore the possibility that threads are malevolent. This approach allows for a much simpler fairness metric and in this proposal ``fairness'' is considered as follows : when multiple threads are cycling through the system, the total ordering of threads being scheduled, i.e., pushed onto the ready-queue, should not differ much from the total ordering of threads being executed, i.e., popped from the ready-queue.
-
-Since feedback is not necessarily feasible within the lifetime of all threads and a simple fairness metric can be used, the scheduling strategy proposed for the \CFA runtime does not use per-threads feedback. Feedback in general is not rejected for secondary concerns like idle sleep, but no feedback is used to decide which thread to run next.
+While these two assumptions generally hold for operating systems, they may not for user-level threading. Since \CFA has the explicit goal of allowing many smaller threads, this can naturally lead to threads with much shorter lifetime, which are only scheduled a few times. Scheduling strategies based on feedback cannot be effective in these cases because they do not have the opportunity to measure the metrics that underlie the algorithm. Note that the problem of feedback convergence (reacting too slowly to scheduling events) is not specific to short lived threads but can also occur with threads that show drastic changes in scheduling, e.g., threads running for long periods of time and then suddenly blocking and unblocking quickly and repeatedly.
+
+In the context of operating systems, these concerns can be overshadowed by a more pressing concern : security. When multiple users are involved, it is possible that some users are malevolent and try to exploit the scheduling strategy in order to achieve some nefarious objective. Security concerns mean that more precise and robust fairness metrics must be used to guarantee fairness across processes created by users as well as threads created within a process. In the case of the \CFA scheduler, every thread runs in the same user-space and is controlled by the same user. Fairness across users is therefore a given and it is then possible to safely ignore the possibility that threads are malevolent. This approach allows for a much simpler fairness metric and in this proposal ``fairness'' is considered as follows : when multiple threads are cycling through the system, the total ordering of threads being scheduled, i.e., pushed onto the ready-queue, should not differ much from the total ordering of threads being executed, i.e., popped from the ready-queue.
+
+Since feedback is not necessarily feasible within the lifetime of all threads and a simple fairness metric can be used, the scheduling strategy proposed for the \CFA runtime does not use per-threads feedback. Feedback in general is not rejected for secondary concerns like idle sleep for kernel threads, but no feedback is used to decide which thread to run next.
 
 \subsection{Priority Schedulers}
@@ -104,14 +104,14 @@
 An important observation to make is that threads do not need to have explicit priorities for problems to occur. Indeed, any system with multiple ready-queues and attempts to exhaust one queue before accessing the other queues, can encounter starvation problems. A popular scheduling strategy that suffers from implicit priorities is work-stealing. Work-stealing is generally presented as follows, each processor has a list of ready threads.
 \begin{enumerate}
-	\item Run threads from ``this'' processor's list.
+	\item Run threads from ``this'' processor's list first.
 	\item If ``this'' processor's list is empty, run threads from some other processor's list.
 \end{enumerate}
 
-In a loaded system\footnote{A loaded system is a system where threads are being run at the same rate they are scheduled.}, if a thread does not yield, block or preempt for an extended period of time, threads on the same processor list starve if no other processors exhaust their list.
-
-Since priorities can be complex to handle for programmers, the scheduling strategy proposed for the \CFA runtime does not use a strategy with either implicit or explicit thread priorities.
+In a loaded system\footnote{A loaded system is a system where threads are being run at the same rate they are scheduled.}, if a thread does not yield, block or preempt for an extended period of time, threads on the same processor's list starve if no other processors exhaust their list.
+
+Since priorities can be complex for programmers to handle, the scheduling strategy proposed for the \CFA runtime does not use a strategy with either implicit or explicit thread priorities.
 
 \subsection{Schedulers without feedback or priorities}
-This proposal conjectures that is is possible to construct a default scheduler for the \CFA runtime that offers good scalability and a simple fairness guarantee that is easy for programmers to reason about. The simplest fairness guarantee is FIFO ordering, i.e., threads scheduled first run first. However, enforcing FIFO ordering generally conflicts with scalability across multiple processors because of the additionnal synchronization. Thankfully, strict FIFO is not needed for sufficient fairness. Since concurrency is inherently non-deterministic, fairness concerns in scheduling are only a problem if a thread repeatedly runs before another thread can run. This is because the non-determinism means that programmers must already handle ordering problems in order to produce correct code and already must rely on weak guarantees, for example that a specific thread will \emph{eventually} run. Since some reordering does not break correctness, the FIFO fairness guarantee can be significantly relaxed without causing problems. For this proposal, the target guarantee is that the \CFA scheduler provides \emph{probable} FIFO ordering, which allows reordering but makes it improbable that threads are reordered far from their position in total ordering.
+This proposal conjectures that is is possible to construct a default scheduler for the \CFA runtime that offers good scalability and a simple fairness guarantee that is easy for programmers to reason about. The simplest fairness guarantee is FIFO ordering, i.e., threads scheduled first run first. However, enforcing FIFO ordering generally conflicts with scalability across multiple processors because of the additionnal synchronization. Thankfully, strict FIFO is not needed for sufficient fairness. Since concurrency is inherently non-deterministic, fairness concerns in scheduling are only a problem if a thread repeatedly runs before another thread can run. This relaxation is possible because the non-determinism means that programmers must already handle ordering problems in order to produce correct code and already must rely on weak guarantees, for example that a specific thread will \emph{eventually} run. Since some reordering does not break correctness, the FIFO fairness guarantee can be significantly relaxed without causing problems. For this proposal, the target guarantee is that the \CFA scheduler provides \emph{probable} FIFO ordering, which allows reordering but makes it improbable that threads are reordered far from their position in total ordering.
 
 Scheduling is defined as follows :
@@ -127,5 +127,5 @@
 
 \subsection{Ready-Queue} \label{sec:queue}
-A simple ready-queue can be built from a FIFO queue, user-threads are pushed onto the queue when they are ready to run and processors (kernel-threads acting as virtual processors) pop the user-threads from the queue and execute them. Using Trevor's paper\cit as basis, it is simple to build a relaxed FIFO list that is fast and scalable for loaded or overloaded systems. The described queue uses an array of underlying strictly FIFO queues as shown in Figure~\ref{fig:base}\footnote{For this section, the number of underlying queues is assumed to be constant, Section~\ref{sec:resize} will discuss resizing the array.}. Pushing new data is done by selecting one of these underlying queues at random, recording a timestamp for the push and pushing to the selected queue. Popping is done by selecting two queues at random and popping from the queue for which the head has the oldest timestamp. A higher number of underlying queues leads to less contention on each queue and therefore better performance. In a loaded system, it is higly likely the queues are non-empty, i.e., several tasks are on each of the underlying queues. This means that selecting a queue at random to pop from is higly likely to yield a queue with available items. In Figure~\ref{fig:base}, ignoring the ellipsis, the chances of getting an empty queue is 2/7 per pick, meaning two randoms pick will yield an item approximately 9 times out of 10.
+A simple ready-queue can be built from a FIFO queue, where user-threads are pushed onto the queue when they are ready to run, and processors (kernel-threads acting as virtual processors) pop the user-threads from the queue and execute them. Using the paper\cite{alistarh2018relaxed} as a basis, it is simple to build a relaxed FIFO list that is fast and scalable for loaded or overloaded systems. The described queue uses an array of underlying strictly FIFO queues as shown in Figure~\ref{fig:base}\footnote{For this section, the number of underlying queues is assumed to be constant. Section~\ref{sec:resize} discusses resizing the array.}. Pushing new data is done by selecting one of these underlying queues at random, recording a timestamp for the operation and pushing to the selected queue. Popping is done by selecting two queues at random and popping from the queue with the oldest timestamp. A higher number of underlying queues leads to less contention on each queue and therefore better performance. In a loaded system, it is highly likely the queues are non-empty, i.e., several tasks are on each of the underlying queues. This means that selecting a queue at random to pop from is highly likely to yield a queue with available items. In Figure~\ref{fig:base}, ignoring the ellipsis, the chances of getting an empty queue is 2/7 per pick, meaning two random picks yield an item approximately 9 times out of 10.
 
 \begin{figure}
@@ -147,5 +147,5 @@
 \end{figure}
 
-When the ready queue is "more empty", i.e., several of the inner queues are empty, selecting a random queue for popping is less likely to yield a valid selection and more attempts need to be made, resulting in a performance degradation. Figure~\ref{fig:empty} shows an example with fewer elements where the chances of getting an empty queue is 5/7 per pick, meaning two randoms pick will yield an item only half the time. Since the overarching queue is not empty, the pop operation \emph{must} find an element before returning and therefore must retry. Overall performance is therefore influenced by the contention on the underlying queues and pop performance is influenced by the items density. This leads to four performance cases, as depicted in Table~\ref{tab:perfcases}.
+When the ready queue is \emph{more empty}, i.e., several of the queues are empty, selecting a random queue for popping is less likely to yield a valid selection and more attempts need to be made, resulting in a performance degradation. Figure~\ref{fig:empty} shows an example with fewer elements where the chances of getting an empty queue is 5/7 per pick, meaning two random picks yield an item only half the time. Since the ready queue is not empty, the pop operation \emph{must} find an element before returning and therefore must retry. Overall performance is therefore influenced by the contention on the underlying queues and pop performance is influenced by the item density. This leads to four performance cases, as depicted in Table~\ref{tab:perfcases}.
 
 \begin{table}
@@ -161,11 +161,21 @@
 		\end{tabular}
 	\end{center}
-	\caption{Performance of the relaxed FIFO list in different cases. The number of processors (many or few) refers to the number of kernel-threads \emph{actively} attempting to pop user-threads from the queues, not the total number of kernel-threads. The number of threads (many of few) refers to the number of user-threads ready to be run. Many threads means they outnumber processors significantly and most underlying queues have items, few threads means there are barely more threads than processors and most underlying queues are empty. Cases with fewer threads than processors are discussed in Section~\ref{sec:sleep}.}
+	\caption{Performance of the relaxed FIFO list in different cases. The number of processors (many or few) refers to the number of kernel-threads \emph{actively} attempting to pop user-threads from the queues, not the total number of kernel-threads. The number of threads (many or few) refers to the number of user-threads ready to be run. Many threads means they outnumber processors significantly and most underlying queues have items, few threads mean there are barely more threads than processors and most underlying queues are empty. Cases with fewer threads than processors are discussed in Section~\ref{sec:sleep}.}
 	\label{tab:perfcases}
 \end{table}
 
-Performance can be improved in case~D (Table~\ref{tab:perfcases}) by adding information to help processors find which inner queues are used. This aims to avoid the cost of retrying the pop operation but does not affect contention on the underlying queues and can incur some management cost for both push and pop operations.
-
-A bitmask can be used to identify which inner queues are currently in use, as shown in Figure~\ref{fig:emptybit}. This means that processors can often find user-threads in constant time, regardless of how many underlying queues are empty. Furthermore, modern x86 CPUs have an extension (BMI2) which allow using the bitmask with very little overhead compared to a filled readyqueue, offerring decent performance even in cases with many empty inner queues. However, this technique has its limits, with a single word\footnote{Word refers here to however many bits can be written atomicly.} bitmask, the total number of underlying queues in the overarching queue is limited to the number of bits in the word. With a multi-word bitmask, this maximum limit can be increased arbitrarily, but it is not possible to check if the queue is empty by reading the bitmask atomicly. A dense bitmap, i.e., either a single word bitmask or a multi word bitmask where all words are densely packed, also causes additionnal problems in case~C (Table~\ref{tab:perfcases}), which the increased contention on the bitmask both causes new performance degradation and means the accuracy of the bitmask is less reliable due to more hardware threads potentially racing to read and/or update that information.
+Table~\ref{tab:perfcases}
+
+Performance can be improved in case~D (Table~\ref{tab:perfcases}) by adding information to help processors find which inner queues are used. This addition aims to avoid the cost of retrying the pop operation but does not affect contention on the underlying queues and can incur some management cost for both push and pop operations. The approach used to encode this information can vary in density and be either global or local, where density means the information is either packed in few cachelines or spread across several cachelines, and local information means each thread uses an independent copy instead of a single global, i.e., common, source of information.
+
+For example, bitmask can be used to identify which inner queues are currently in use, as shown in Figure~\ref{fig:emptybit}. This means that processors can often find user-threads in constant time, regardless of how many underlying queues are empty. Furthermore, modern x86 CPUs have extended bit manipulation instructions (BMI2) which allow using the bitmask with very little overhead compared to the randomized selection approach for a filled readyqueue, offerring decent performance even in cases with many empty inner queues. However, this technique has its limits: with a single word\footnote{Word refers here to however many bits can be written atomicly.} bitmask, the total number of underlying queues in the ready queue is limited to the number of bits in the word. With a multi-word bitmask, this maximum limit can be increased arbitrarily, but it is not possible to check if the queue is empty by reading the bitmask atomicly.
+
+Finally, a dense bitmap, either single or multi-word, causes additional problems
+in case C (Table 1), because many processors are continuously scanning the
+bitmask to find the few available threads. This increased contention on the
+bitmask(s) reduces performance because of cache misses and the bitmask is
+updated more frequently by the scanning processors racing to read and/or update
+that information. This increased update frequency means the information in the
+bitmask will more often be stale before a processor can use it to find an item.
 
 \begin{figure}
@@ -177,5 +187,5 @@
 \end{figure}
 
-Another approach is to use a hiearchical data structure, for example Figure~\ref{fig:emptytree}. Creating a tree of nodes to reduce contention has been shown to work in similar cases\cit(SNZI: Scalable NonZero Indicators)\footnote{This particular paper seems to be patented in the US. How does that affect \CFA? Can I use it in my work?}. However, this approach may lead to poorer performance in case~B (Table~\ref{tab:perfcases}) due to the inherent pointer chasing cost and already low contention cost in that case.
+Another approach is to use a hiearchical data structure, for example Figure~\ref{fig:emptytree}. Creating a tree of nodes to reduce contention has been shown to work in similar cases\cite{ellen2007snzi}\footnote{This particular paper seems to be patented in the US. How does that affect \CFA? Can I use it in my work?}. However, this approach may lead to poorer performance in case~B (Table~\ref{tab:perfcases}) due to the inherent pointer chasing cost and already low contention cost in that case.
 
 \begin{figure}
@@ -187,7 +197,7 @@
 \end{figure}
 
-Finally, a third approach is to use dense information, similar to the bitmap, but have each thread keep its own independant copies of it. While this approach can offer good scalability \emph{and} low latency, the livelyness of the information can become a problem. In the simple cases, local copies of which underlying queues are empty can become stale and end-up not be useful when for the pop operation. A more serious problem is that reliable information is necessary for some parts of this algorithm to be correct. As mentionned in this section, processors must know \emph{reliably} whether the list is empty or not to decide if they can return \texttt{NULL} or if they must keep looking during a pop operation. Section~\ref{sec:sleep} discusses an other case where reliable information is required for the algorithm to be correct.
-
-There is a fundamental tradeoff among these approach. Dense global information about empty underlying queues will help zero-contention cases at the cost of high-contention case. Sparse global information will help high-contention cases but increase latency in zero-contention-cases, to read and ``aggregate'' the information\footnote{Hiearchical structures, e.g., binary search tree, effectively aggregate information but following pointer chains, learning information for each node. Similarly, other sparse schemes would need to read multiple cachelines to acquire all the information needed.}. Finally, dense local information has both the advantages of low latency in zero-contention cases and scalability in high-contention cases, however the information can become stale making it difficult to use to ensure correctness. The fact that these solutions have these fundamental limits suggest that that a more solution that combines these solutions in an interesting ways. The lock discussed in Section~\ref{sec:resize} also allows for solutions that adapt to the number of processors, which couls also prove useful.
+Finally, a third approach is to use dense information, similar to the bitmap, but have each thread keep its own independant copies of it. While this approach can offer good scalability \emph{and} low latency, the livelyness of the information can become a problem. In the simple cases, local copies of which underlying queues are empty can become stale and end-up not being useful for the pop operation. A more serious problem is that reliable information is necessary for some parts of this algorithm to be correct. As mentionned in this section, processors must know \emph{reliably} whether the list is empty or not to decide if they can return \texttt{NULL} or if they must keep looking during a pop operation. Section~\ref{sec:sleep} discusses another case where reliable information is required for the algorithm to be correct.
+
+There is a fundamental tradeoff among these approach. Dense global information about empty underlying queues helps zero-contention cases at the cost of high-contention case. Sparse global information helps high-contention cases but increases latency in zero-contention-cases, to read and ``aggregate'' the information\footnote{Hiearchical structures, e.g., binary search tree, effectively aggregate information but following pointer chains, learning information for each node. Similarly, other sparse schemes would need to read multiple cachelines to acquire all the information needed.}. Finally, dense local information has both the advantages of low latency in zero-contention cases and scalability in high-contention cases, however the information can become stale making it difficult to use to ensure correctness. The fact that these solutions have these fundamental limits suggest to me that a better solution combines these properties in an interesting ways. The lock discussed in Section~\ref{sec:resize} also allows for solutions that adapt to the number of processors, which could also prove useful.
 
 \paragraph{Objectives and Existing Work}
Index: doc/theses/thierry_delisle_PhD/comp_II/local.bib
===================================================================
--- doc/theses/thierry_delisle_PhD/comp_II/local.bib	(revision c5fdebfa78715e593b97117e52e43a2217086c4f)
+++ doc/theses/thierry_delisle_PhD/comp_II/local.bib	(revision 778452d93cf0c86a2356479f21e59dd07e85f21c)
@@ -221,2 +221,23 @@
   organization={ACM}
 }
+
+% ===============================================================================
+% MISC
+% ===============================================================================
+% Trevor's relaxed FIFO list
+@inproceedings{alistarh2018relaxed,
+  title={Relaxed schedulers can efficiently parallelize iterative algorithms},
+  author={Alistarh, Dan and Brown, Trevor and Kopinsky, Justin and Nadiradze, Giorgi},
+  booktitle={Proceedings of the 2018 ACM Symposium on Principles of Distributed Computing},
+  pages={377--386},
+  year={2018}
+}
+
+% Scalable counters which only support is !0
+@inproceedings{ellen2007snzi,
+  title={SNZI: Scalable nonzero indicators},
+  author={Ellen, Faith and Lev, Yossi and Luchangco, Victor and Moir, Mark},
+  booktitle={Proceedings of the twenty-sixth annual ACM symposium on Principles of distributed computing},
+  pages={13--22},
+  year={2007}
+}
Index: libcfa/src/exception.c
===================================================================
--- libcfa/src/exception.c	(revision c5fdebfa78715e593b97117e52e43a2217086c4f)
+++ libcfa/src/exception.c	(revision 778452d93cf0c86a2356479f21e59dd07e85f21c)
@@ -252,7 +252,4 @@
 }
 
-#pragma GCC push_options
-#pragma GCC optimize("O0")
-
 // This is our personality routine. For every stack frame annotated with
 // ".cfi_personality 0x3,__gcfa_personality_v0" this function will be called twice when unwinding.
@@ -413,4 +410,7 @@
 	return _URC_CONTINUE_UNWIND;
 }
+
+#pragma GCC push_options
+#pragma GCC optimize("O0")
 
 // Try statements are hoisted out see comments for details. While this could probably be unique
Index: tools/gdb/.gdbinit
===================================================================
--- tools/gdb/.gdbinit	(revision 778452d93cf0c86a2356479f21e59dd07e85f21c)
+++ tools/gdb/.gdbinit	(revision 778452d93cf0c86a2356479f21e59dd07e85f21c)
@@ -0,0 +1,9 @@
+# These signal are necessary for Cforall, so tell gdb to ignore them
+# and pass to through to the program.
+handle SIGALRM nostop noprint pass
+handle SIGUSR1 nostop noprint pass
+# Load macros to make gdb understand uC++ user-threads
+source /home/tdelisle/workspace/gdb/utils-gdb.gdb
+source /home/tdelisle/workspace/gdb/utils-gdb.py
+# Have gdb indent complex values to make them readable.
+set print pretty
Index: tools/gdb/README
===================================================================
--- tools/gdb/README	(revision 778452d93cf0c86a2356479f21e59dd07e85f21c)
+++ tools/gdb/README	(revision 778452d93cf0c86a2356479f21e59dd07e85f21c)
@@ -0,0 +1,19 @@
+These instructions assume the install <prefix> for uC++ is /usr/local/u++-7.0.0.
+If installed elsewhere, change <prefix>.
+
+Copy <prefix>/.gdbinit to your home directory. If installed elsewhere, change
+the <prefix> within .gdbinit. Thereafter, gdb automatically loads the .gdbinit
+file from the home directory making the following new gdb commands available.
+
+New commands:
+
+    clusters                        : print all clusters
+    processors                      : print all processors on all clusters
+    processors  <clusterName>       : print all processors on cluster
+    task                            : print userCluster tasks, application tasks only
+    task <clusterName>              : print cluster tasks, application tasks only
+    task all                        : print all clusters, all tasks
+    task <id>                       : switch stack to task id on userCluster
+    task 0x<address>	            : switch stack to task on any cluster
+    task <id> <clusterName>         : switch stack to task on specified cluster
+    prevtask                        : return to last switched task
Index: tools/gdb/utils-gdb.gdb
===================================================================
--- tools/gdb/utils-gdb.gdb	(revision 778452d93cf0c86a2356479f21e59dd07e85f21c)
+++ tools/gdb/utils-gdb.gdb	(revision 778452d93cf0c86a2356479f21e59dd07e85f21c)
@@ -0,0 +1,69 @@
+# 'server' keyword disables confirmation dialog when re-loading/re-defining
+# 'reset' restores the registers to the stack values at breakpoint to allow the movement commands to work correctly
+server define hook-continue
+reset
+end
+
+server define hook-next
+reset
+end
+
+server define hook-nexti
+reset
+end
+
+server define hook-step
+reset
+end
+
+server define hook-stepi
+reset
+end
+
+server define hook-finish
+reset
+end
+
+server define hook-advance
+reset
+end
+
+server define hook-jump
+reset
+end
+
+server define hook-signal
+reset
+end
+
+server define hook-until
+reset
+end
+
+server define hook-reverse-next
+reset
+end
+
+server define hook-reverse-step
+reset
+end
+
+server define hook-reverse-stepi
+reset
+end
+
+server define hook-reverse-continue
+reset
+end
+
+server define hook-reverse-finish
+reset
+end
+
+server define hook-run
+reset
+end
+
+server define hook-thread
+reset
+end
Index: tools/gdb/utils-gdb.py
===================================================================
--- tools/gdb/utils-gdb.py	(revision 778452d93cf0c86a2356479f21e59dd07e85f21c)
+++ tools/gdb/utils-gdb.py	(revision 778452d93cf0c86a2356479f21e59dd07e85f21c)
@@ -0,0 +1,767 @@
+#
+# Copyright (C) Lynn Tran, Jiachen Zhang 2018
+#
+# utils-gdb.py --
+#
+# Author           : Lynn Tran
+# Created On       : Mon Oct 1 22:06:09 2018
+# Last Modified By : Peter A. Buhr
+# Last Modified On : Sat Jan 19 14:16:10 2019
+# Update Count     : 11
+#
+
+"""
+To run this extension, the python name has to be as same as one of the loaded library
+Additionally, the file must exist in a folder which is in gdb's safe path
+"""
+import collections
+import gdb
+import re
+
+# set these signal handlers with some settings (nostop, noprint, pass)
+gdb.execute('handle SIGALRM nostop noprint pass')
+gdb.execute('handle SIGUSR1 nostop noprint pass')
+
+CfaTypes = collections.namedtuple('CfaTypes', 'cluster_ptr processor_ptr thread_ptr int_ptr thread_state')
+
+class ThreadInfo:
+    tid = 0
+    cluster = None
+    value = None
+
+    def __init__(self, cluster, value):
+        self.cluster = cluster
+        self.value = value
+
+    def is_system(self):
+        return False
+
+# A named tuple representing information about a stack
+StackInfo = collections.namedtuple('StackInfo', 'sp fp pc')
+
+# A global variable to keep track of stack information as one switches from one
+# task to another task
+STACK = []
+
+# A global variable to keep all system task name
+SysTask_Name = ["uLocalDebuggerReader", "uLocalDebugger", "uProcessorTask", "uBootTask", "uSystemTask",
+"uProcessorTask", "uPthread", "uProfiler"]
+
+not_supported_error_msg = "Not a supported command for this language"
+
+def is_cforall():
+    return True
+
+def get_cfa_types():
+    # GDB types for various structures/types in CFA
+    return CfaTypes(cluster_ptr = gdb.lookup_type('struct cluster').pointer(),
+                  processor_ptr = gdb.lookup_type('struct processor').pointer(),
+                     thread_ptr = gdb.lookup_type('struct $thread').pointer(),
+                        int_ptr = gdb.lookup_type('int').pointer(),
+                   thread_state = gdb.lookup_type('enum coroutine_state'))
+
+def get_addr(addr):
+    """
+    NOTE: sketchy solution to retrieve address. There is a better solution...
+    @addr: str of an address that can be in a format 0xfffff <type of the object
+    at this address>
+    Return: str of just the address
+    """
+    str_addr = str(addr)
+    ending_addr_index = str_addr.find('<')
+    if ending_addr_index == -1:
+        return str(addr)
+    return str_addr[:ending_addr_index].strip()
+
+def print_usage(obj):
+    print(obj.__doc__)
+
+def parse(args):
+    """
+    Split the argument list in string format, where each argument is separated
+    by whitespace delimiter, to a list of arguments like argv
+    @args: str of arguments
+    Return:
+        [] if args is an empty string
+        list if args is not empty
+    """
+    # parse the string format of arguments and return a list of arguments
+    argv = args.split(' ')
+    if len(argv) == 1 and argv[0] == '':
+        return []
+    return argv
+
+def get_cluster_root():
+    """
+    Return: gdb.Value of globalClusters.root (is an address)
+    """
+    cluster_root = gdb.parse_and_eval('_X11mainClusterPS7cluster_1')
+    if cluster_root.address == 0x0:
+        print('No clusters, program terminated')
+    return cluster_root
+
+def find_curr_thread():
+    # btstr = gdb.execute('bt', to_string = True).splitlines()
+    # if len(btstr) == 0:
+    #     print('error')
+    #     return None
+    # return btstr[0].split('this=',1)[1].split(',')[0].split(')')[0]
+    return None
+
+def lookup_cluster(name = None):
+    """
+    Look up a cluster given its ID
+    @name: str
+    Return: gdb.Value
+    """
+    if not is_cforall():
+        return None
+
+    root = get_cluster_root()
+    if root.address == 0x0:
+        return None
+
+    if not name:
+        return root
+
+    # lookup for the task associated with the id
+    cluster = None
+    curr = root
+    while True:
+        if curr['_X4namePKc_1'].string() == name:
+            cluster = curr.address
+            break
+        curr = curr['_X4nodeS26__cluster____dbg_node_cltr_1']['_X4nextPS7cluster_1']
+        if curr == root or curr == 0x0:
+            break
+
+    if not cluster:
+        print("Cannot find a cluster with the name: {}.".format(name))
+        return None
+
+    return cluster
+
+def lookup_threads_by_cluster(cluster):
+        # Iterate through a circular linked list of threads and accumulate them in an array
+        threads = []
+
+        cfa_t = get_cfa_types()
+        root = cluster['_X7threadsS8__dllist_S7$thread__1']['_X4headPY15__TYPE_generic__1'].cast(cfa_t.thread_ptr)
+
+        if root == 0x0 or root.address == 0x0:
+            print('There are no tasks for cluster: {}'.format(cluster))
+            return threads
+
+        curr = root
+        tid = 0
+        sid = -1
+
+        while True:
+            t = ThreadInfo(cluster, curr)
+            if t.is_system():
+                t.tid = sid
+                sid -= 1
+            else:
+                t.tid = tid
+                tid += 1
+
+            threads.append(t)
+
+            curr = curr['node']['next']
+            if curr == root or curr == 0x0:
+                break
+
+        return threads
+
+def system_thread(thread):
+    return False
+
+def adjust_stack(pc, fp, sp):
+    # pop sp, fp, pc from global stack
+    gdb.execute('set $pc = {}'.format(pc))
+    gdb.execute('set $rbp = {}'.format(fp))
+    gdb.execute('set $sp = {}'.format(sp))
+
+############################ COMMAND IMPLEMENTATION #########################
+
+class Clusters(gdb.Command):
+    """Cforall: Display currently known clusters
+Usage:
+    info clusters                 : print out all the clusters
+"""
+
+    def __init__(self):
+        super(Clusters, self).__init__('info clusters', gdb.COMMAND_USER)
+
+    def print_cluster(self, cluster_name, cluster_address):
+        print('{:>20}  {:>20}'.format(cluster_name, cluster_address))
+
+    #entry point from gdb
+    def invoke(self, arg, from_tty):
+        if not is_cforall():
+            return
+
+        if arg:
+            print("info clusters does not take arguments")
+            print_usage(self)
+            return
+
+        cluster_root = get_cluster_root()
+        if cluster_root.address == 0x0:
+            return
+
+        curr = cluster_root
+        self.print_cluster('Name', 'Address')
+
+        while True:
+            self.print_cluster(curr['_X4namePKc_1'].string(), str(curr))
+            curr = curr['_X4nodeS26__cluster____dbg_node_cltr_1']['_X4nextPS7cluster_1']
+            if curr == cluster_root:
+                break
+
+        print("")
+
+############
+class Processors(gdb.Command):
+    """Cforall: Display currently known processors
+Usage:
+    info processors                 : print out all the processors in the Main Cluster
+    info processors <cluster_name>  : print out all processors in a given cluster
+"""
+
+    def __init__(self):
+        super(Processors, self).__init__('info processors', gdb.COMMAND_USER)
+
+    def print_processor(self, name, status, pending, address):
+        print('{:>20}  {:>11}  {:>13}  {:>20}'.format(name, status, pending, address))
+
+    def iterate_procs(self, root, active):
+        if root == 0x0:
+            return
+
+        cfa_t = get_cfa_types()
+        curr = root
+
+        while True:
+            processor = curr
+            should_stop = processor['_X12do_terminateVb_1']
+            stop_count  = processor['_X10terminatedS9semaphore_1']['_X5counti_1']
+            if not should_stop:
+                status = 'Active' if active else 'Idle'
+            else:
+                status_str  = 'Last Thread' if stop_count >= 0 else 'Terminating'
+                status      = '{}({},{})'.format(status_str, should_stop, stop_count)
+
+            self.print_processor(processor['_X4namePKc_1'].string(),
+                    status, str(processor['_X18pending_preemptionb_1']), str(processor)
+                )
+
+            curr = curr['_X4nodeS28__processor____dbg_node_proc_1']['_X4nextPS9processor_1']
+
+            if curr == root or curr == 0x0:
+                break
+
+    #entry point from gdb
+    def invoke(self, arg, from_tty):
+        if not is_cforall():
+            return
+
+        cluster = lookup_cluster(arg if arg else None)
+
+        if not cluster:
+            print("No Cluster matching arguments found")
+            return
+
+        cfa_t = get_cfa_types()
+        print('Cluster: "{}"({})'.format(cluster['_X4namePKc_1'].string(), cluster.cast(cfa_t.cluster_ptr)))
+
+        active_root = cluster.cast(cfa_t.cluster_ptr) \
+                ['_X5procsS8__dllist_S9processor__1'] \
+                ['_X4headPY15__TYPE_generic__1'] \
+                .cast(cfa_t.processor_ptr)
+
+        idle_root = cluster.cast(cfa_t.cluster_ptr) \
+                ['_X5idlesS8__dllist_S9processor__1'] \
+                ['_X4headPY15__TYPE_generic__1'] \
+                .cast(cfa_t.processor_ptr)
+
+        if idle_root != 0x0 or active_root != 0x0:
+            self.print_processor('Name', 'Status', 'Pending Yield', 'Address')
+            self.iterate_procs(active_root, True)
+            self.iterate_procs(idle_root, False)
+        else:
+            print("No processors on cluster")
+
+        print()
+
+############
+class Threads(gdb.Command):
+    """Cforall: Display currently known threads
+Usage:
+    cfathreads                           : print Main Cluster threads, application threads only
+    cfathreads all                       : print all clusters, all threads
+    cfathreads <clusterName>             : print cluster threads, application threads only
+    """
+    def __init__(self):
+        # The first parameter of the line below is the name of the command. You
+        # can call it 'uc++ task'
+        super(Threads, self).__init__('info cfathreads', gdb.COMMAND_USER)
+
+    def print_formatted(self, marked, tid, name, state, address):
+        print('{:>1}  {:>4}  {:>20}  {:>10}  {:>20}'.format('*' if marked else ' ', tid, name, state, address))
+
+    def print_thread(self, thread, tid, marked):
+        cfa_t = get_cfa_types()
+        self.print_formatted(marked, tid, thread['self_cor']['name'].string(), str(thread['state'].cast(cfa_t.thread_state)), str(thread))
+
+    def print_formatted_cluster(self, str_format, cluster_name, cluster_addr):
+        print(str_format.format(cluster_name, cluster_addr))
+
+    def print_threads_by_cluster(self, cluster, print_system = False):
+        # Iterate through a circular linked list of tasks and print out its
+        # name along with address associated to each cluster
+        threads = lookup_threads_by_cluster(cluster)
+        if not threads:
+            return
+
+        running_thread = find_curr_thread()
+        if running_thread is None:
+            print('Could not identify current thread')
+
+        self.print_formatted(False, '', 'Name', 'State', 'Address')
+
+        for t in threads:
+            if not t.is_system() or print_system:
+                self.print_thread(t.value, t.tid, t.value == running_thread if running_thread else False)
+
+        print()
+
+    def print_all_threads(self):
+        print("Not implemented")
+
+    def invoke(self, arg, from_tty):
+        """
+        @arg: str
+        @from_tty: bool
+        """
+        if not is_cforall():
+            return
+
+        if not arg:
+            cluster = lookup_cluster()
+            if not cluster:
+                print("Could not find Main Cluster")
+                return
+
+            # only tasks and main
+            self.print_threads_by_cluster(cluster, False)
+
+        elif arg == 'all':
+            # all threads, all clusters
+            self.print_all_threads()
+
+        else:
+            cluster = lookup_cluster(arg)
+            if not cluster:
+                print("Could not find cluster '{}'".format(arg))
+                return
+
+            # all tasks, specified cluster
+            self.print_threads_by_cluster(cluster, True)
+
+
+############
+class Thread(gdb.Command):
+    def __init__(self):
+        # The first parameter of the line below is the name of the command. You
+        # can call it 'uc++ task'
+        super(Threads, self).__init__('cfathread', gdb.COMMAND_USER)
+
+    def print_usage(self):
+        print_usage("""
+    cfathread                            : print userCluster tasks, application tasks only
+    cfathread <clusterName>              : print cluster tasks, application tasks only
+    cfathread all                        : print all clusters, all tasks
+    cfathread <id>                       : switch stack to thread id on userCluster
+    cfathread 0x<address>	             : switch stack to thread on any cluster
+    cfathread <id> <clusterName>         : switch stack to thread on specified cluster
+    """)
+
+    ############################ AUXILIARY FUNCTIONS #########################
+
+    def print_formatted(self, marked, tid, name, state, address):
+        print('{:>1}  {:>4}  {:>20}  {:>10}  {:>20}'.format('*' if marked else ' ', tid, name, state, address))
+
+    def print_thread(self, thread, tid, marked):
+        cfa_t = get_cfa_types()
+        self.print_formatted(marked, tid, thread['self_cor']['name'].string(), str(thread['state'].cast(cfa_t.thread_state)), str(thread))
+
+    def print_formatted_cluster(self, str_format, cluster_name, cluster_addr):
+        print(str_format.format(cluster_name, cluster_addr))
+
+    def print_tasks_by_cluster_all(self, cluster_address):
+        """
+        Display a list of all info about all available tasks on a particular cluster
+        @cluster_address: gdb.Value
+        """
+        cluster_address = cluster_address.cast(uCPPTypes.ucluster_ptr)
+        task_root = cluster_address['tasksOnCluster']['root']
+
+        if task_root == 0x0 or task_root.address == 0x0:
+            print('There are no tasks for cluster at address: {}'.format(cluster_address))
+            return
+
+        self.print_formatted_task('', 'Task Name', 'Address', 'State')
+        curr = task_root
+        task_id = 0
+        systask_id = -1
+
+        breakpoint_addr = self.find_curr_breakpoint_addr()
+        if breakpoint_addr is None:
+            return
+
+        while True:
+            global SysTask_Name
+            if (curr['task_']['name'].string() in SysTask_Name):
+                self.print_formatted_tasks(systask_id, breakpoint_addr, curr)
+                systask_id -= 1
+            else:
+                self.print_formatted_tasks(task_id, breakpoint_addr, curr)
+                task_id += 1
+
+            curr = curr['next'].cast(uCPPTypes.uBaseTaskDL_ptr_type)
+            if curr == task_root:
+                break
+
+    def print_tasks_by_cluster_address_all(self, cluster_address):
+        """
+        Display a list of all info about all available tasks on a particular cluster
+        @cluster_address: str
+        """
+        # Iterate through a circular linked list of tasks and print out its
+        # name along with address associated to each cluster
+
+        # convert hex string to hex number
+        try:
+            hex_addr = int(cluster_address, 16)
+        except:
+            self.print_usage()
+            return
+
+        cluster_address = gdb.Value(hex_addr)
+        if not self.print_tasks_by_cluster_all(cluster_address):
+            return
+
+    def print_threads_by_cluster(self, cluster, print_system = False):
+        """
+        Display a list of limited info about all available threads on a particular cluster
+        @cluster: str
+        @print_system: bool
+        """
+        # Iterate through a circular linked list of tasks and print out its
+        # name along with address associated to each cluster
+
+        threads = self.threads_by_cluster(cluster)
+        if not threads:
+            return
+
+        running_thread = self.find_curr_thread()
+        if running_thread is None:
+            print('Could not identify current thread')
+
+        self.print_formatted(False, '', 'Name', 'State', 'Address')
+
+        for t in threads:
+            if not t.is_system() or print_system:
+                self.print_thread(t.value, t.tid, t.value == running_thread if running_thread else False)
+
+        print()
+
+    ############################ COMMAND FUNCTIONS #########################
+
+    def print_all_threads(self):
+        """Iterate through each cluster, iterate through all tasks and  print out info about all the tasks
+        in those clusters"""
+        uCPPTypes = None
+        try:
+            uCPPTypes = get_uCPP_types()
+        except gdb.error:
+            print(not_supported_error_msg)
+            print(gdb.error)
+            return
+
+        cluster_root = get_cluster_root()
+        if cluster_root.address == 0x0:
+            return
+
+        curr = cluster_root
+        self.print_formatted_cluster(self.cluster_str_format, 'Cluster Name', 'Address')
+
+        while True:
+            addr = str(curr['cluster_'].reference_value())[1:]
+            self.print_formatted_cluster(self.cluster_str_format, curr['cluster_']['name'].string(), addr)
+
+            self.print_tasks_by_cluster_address_all(addr)
+            curr = curr['next'].cast(uCPPTypes.uClusterDL_ptr_type)
+            if curr == cluster_root:
+                break
+
+    def switchto(self, thread):
+        """Change to a new task by switching to a different stack and manually
+        adjusting sp, fp and pc
+        @task_address: str
+            2 supported format:
+                in hex format
+                    <hex_address>: literal hexadecimal address
+                    Ex: 0xffffff
+                in name of the pointer to the task
+                    "task_name": pointer of the variable name of the cluster
+                        Ex: T* s -> task_name = s
+            Return: gdb.value of the cluster's address
+        """
+        # uCPPTypes = None
+        # try:
+        #     uCPPTypes = get_uCPP_types()
+        # except gdb.error:
+        #     print(not_supported_error_msg)
+        #     print(gdb.error)
+        #     return
+
+        # # Task address has a format "task_address", which implies that it is the
+        # # name of the variable, and it needs to be evaluated
+        # if task_address.startswith('"') and task_address.endswith('"'):
+        #     task = gdb.parse_and_eval(task_address.replace('"', ''))
+        # else:
+        # # Task address format does not include the quotation marks, which implies
+        # # that it is a hex address
+        #     # convert hex string to hex number
+        #     try:
+        #         hex_addr = int(task_address, 16)
+        #     except:
+        #         self.print_usage()
+        #         return
+        #     task_address = gdb.Value(hex_addr)
+        #     task = task_address.cast(uCPPTypes.uBaseTask_ptr_type)
+        try:
+            if not gdb.lookup_symbol('__cfactx_switch'):
+                print('__cfactx_switch symbol is unavailable')
+                return
+        except:
+            print('here 3')
+
+        cfa_t = get_cfa_types()
+
+        state = thread['state'].cast(cfa_t.thread_state)
+        try:
+            if state == gdb.parse_and_eval('Halted'):
+                print('Cannot switch to a terminated thread')
+                return
+
+            if state == gdb.parse_and_eval('Start'):
+                print('Cannjot switch to a thread not yet run')
+                return
+        except:
+            print("here 2")
+            return
+
+
+        context = thread['context']
+
+        # lookup for sp,fp and uSwitch
+        xsp = context['SP'] + 48
+        xfp = context['FP']
+
+        # convert string so we can strip out the address
+        try:
+            xpc = get_addr(gdb.parse_and_eval('__cfactx_switch').address + 28)
+        except:
+            print("here")
+            return
+
+        # must be at frame 0 to set pc register
+        gdb.execute('select-frame 0')
+
+        # push sp, fp, pc into a global stack
+        global STACK
+        sp = gdb.parse_and_eval('$sp')
+        fp = gdb.parse_and_eval('$fp')
+        pc = gdb.parse_and_eval('$pc')
+        stack_info = StackInfo(sp = sp, fp = fp, pc = pc)
+        STACK.append(stack_info)
+
+        # update registers for new task
+        print('switching to ')
+        gdb.execute('set $rsp={}'.format(xsp))
+        gdb.execute('set $rbp={}'.format(xfp))
+        gdb.execute('set $pc={}'.format(xpc))
+
+    def find_matching_gdb_thread_id():
+        """
+        Parse the str from info thread to get the number
+        """
+        info_thread_str = gdb.execute('info thread', to_string=True).splitlines()
+        for thread_str in info_thread_str:
+            if thread_str.find('this={}'.format(task)) != -1:
+                thread_id_pattern = r'^\*?\s+(\d+)\s+Thread'
+                # retrive gdb thread id
+                return re.match(thread_id_pattern, thread_str).group(1)
+
+            # check if the task is running or not
+            if task_state == gdb.parse_and_eval('uBaseTask::Running'):
+                # find the equivalent thread from info thread
+                gdb_thread_id = find_matching_gdb_thread_id()
+                if gdb_thread_id is None:
+                    print('cannot find the thread id to switch to')
+                    return
+                # switch to that thread based using thread command
+                gdb.execute('thread {}'.format(gdb_thread_id))
+
+    def switchto_id(self, tid, cluster):
+        """
+        @cluster: cluster object
+        @tid: int
+        """
+        threads = self.threads_by_cluster( cluster )
+
+        for t in threads:
+            if t.tid == tid:
+                self.switchto(t.value)
+                return
+
+        print("Cound not find thread by id '{}'".format(tid))
+
+    def invoke(self, arg, from_tty):
+        """
+        @arg: str
+        @from_tty: bool
+        """
+        if not is_cforall():
+            return
+
+        argv = parse(arg)
+        print(argv)
+        if len(argv) == 0:
+            """
+            Iterate only Main Thread, print only tasks and main
+            """
+            cluster = lookup_cluster()
+            if not cluster:
+                print("Could not find Main Cluster")
+                return
+
+            # only tasks and main
+            self.print_threads_by_cluster(cluster, False)
+
+        elif len(argv) == 1:
+            if argv[0] == 'help':
+                self.print_usage()
+            # push task
+            elif argv[0].isdigit():
+                cluster = lookup_cluster()
+                if not cluster:
+                    print("Could not find Main Cluster")
+                    return
+
+                try:
+                    tid = int(argv[0])
+                except:
+                    print("'{}' not a valid thread id".format(argv[0]))
+                    self.print_usage()
+                    return
+
+                 # by id, userCluster
+                self.switchto_id(tid, cluster)
+
+            elif argv[0].startswith('0x') or argv[0].startswith('0X'):
+                self.switchto(argv[0]) # by address, any cluster
+            # print tasks
+            elif argv[0] == 'all':
+                self.print_all_threads() # all tasks, all clusters
+            else:
+                """
+                Print out all the tasks available in the specified cluster
+                @cluster_name: str
+                """
+                print("cfathread by name")
+                cluster = lookup_cluster(argv[0])
+                if not cluster:
+                    return
+
+                # all tasks, specified cluster
+                self.print_threads_by_cluster(cluster, True)
+
+        elif len(argv) == 2:
+            # push task
+            self.pushtask_by_id(argv[0], argv[1]) # by id, specified cluster
+        else:
+            print('Invalid arguments')
+            self.print_usage()
+
+############
+class PrevThread(gdb.Command):
+    """Switch back to previous task on the stack"""
+    usage_msg = 'prevtask'
+
+    def __init__(self):
+        super(PrevThread, self).__init__('prevtask', gdb.COMMAND_USER)
+
+    def invoke(self, arg, from_tty):
+        """
+        @arg: str
+        @from_tty: bool
+        """
+        global STACK
+        if len(STACK) != 0:
+            # must be at frame 0 to set pc register
+            gdb.execute('select-frame 0')
+
+            # pop stack
+            stack_info = STACK.pop()
+            pc = get_addr(stack_info.pc)
+            sp = stack_info.sp
+            fp = stack_info.fp
+
+            # pop sp, fp, pc from global stack
+            adjust_stack(pc, fp, sp)
+
+            # must be at C++ frame to access C++ vars
+            gdb.execute('frame 1')
+        else:
+            print('empty stack')
+
+class ResetOriginFrame(gdb.Command):
+    """Reset to the origin frame prior to continue execution again"""
+    usage_msg = 'resetOriginFrame'
+    def __init__(self):
+        super(ResetOriginFrame, self).__init__('reset', gdb.COMMAND_USER)
+
+    def invoke(self, arg, from_tty):
+        """
+        @arg: str
+        @from_tty: bool
+        """
+        global STACK
+        if len(STACK) != 0:
+            stack_info = STACK.pop(0)
+            STACK.clear()
+            pc = get_addr(stack_info.pc)
+            sp = stack_info.sp
+            fp = stack_info.fp
+
+            # pop sp, fp, pc from global stack
+            adjust_stack(pc, fp, sp)
+
+            # must be at C++ frame to access C++ vars
+            gdb.execute('frame 1')
+        #else:
+            #print('reset: empty stack') #probably does not have to print msg
+
+Clusters()
+Processors()
+ResetOriginFrame()
+PrevThread()
+Threads()
+
+# Local Variables: #
+# mode: Python #
+# End: #
