Index: benchmark/readyQ/yield.cfa
===================================================================
--- benchmark/readyQ/yield.cfa	(revision 37ba6628ac22d4f11d3fb42a2c573168ac19bd03)
+++ benchmark/readyQ/yield.cfa	(revision 04b5cef29b60d0ea2bc49962adba72c424a3e55e)
@@ -22,5 +22,5 @@
 volatile unsigned long long global_counter;
 
-thread __attribute__((aligned(64))) Yielder {
+thread __attribute__((aligned(128))) Yielder {	// NOTE(review): 128-byte alignment presumably keeps each Yielder's counter off its neighbours' cache lines — confirm
 	unsigned long long counter;
 };
@@ -113,9 +113,9 @@
 			{
 				Yielder threads[nthreads];
+				bool is_tty = isatty(STDOUT_FILENO);
 				printf("Starting\n");
 				start = getTime();
 				run = true;
 
-				bool is_tty = isatty(STDOUT_FILENO);
 				for(i; nthreads) {
 					unpark( threads[i] __cfaabi_dbg_ctx2 );
Index: benchmark/readyQ/yield.cpp
===================================================================
--- benchmark/readyQ/yield.cpp	(revision 04b5cef29b60d0ea2bc49962adba72c424a3e55e)
+++ benchmark/readyQ/yield.cpp	(revision 04b5cef29b60d0ea2bc49962adba72c424a3e55e)
@@ -0,0 +1,167 @@
+#include <cassert>
+#include <cstdlib>
+#include <cstdio>
+#include <cstring>
+#include <climits>
+
+extern "C" {
+	#include <locale.h>
+	#include <getopt.h>
+}
+
+#include <unistd.h>
+
+#include <chrono>
+
+using Clock = std::chrono::high_resolution_clock;
+using duration_t = std::chrono::duration<double>;
+using std::chrono::nanoseconds;
+
+
+template<typename Ratio, typename T>
+T duration_cast(T seconds) {	// converts a value expressed in seconds into the equivalent count of Ratio-sized periods (e.g. Ratio = std::nano gives nanoseconds)
+	return std::chrono::duration_cast<std::chrono::duration<T, Ratio>>(std::chrono::duration<T>(seconds)).count();
+}
+
+volatile bool run = false;	// main sets this to true to start and to false to stop; fibres poll it with __atomic_load_n in their yield loop
+volatile unsigned long long global_counter;	// total of all per-fibre counts; each fibre adds its own count with __atomic_fetch_add when it leaves its loop
+
+#include "libfibre/fibre.h"
+
+FibreBarrier * barrier;	// allocated in main for nthreads + 1 parties; waited on by every fibre and by main
+struct __attribute__((aligned(128))) counter_t {	// per-fibre counter; NOTE(review): 128-byte alignment presumably avoids false sharing between adjacent array elements — confirm
+	int value = 0;	// incremented once per loop iteration in fibre_main
+};
+
+void fibre_main( counter_t * counter ) {	// fibre body: yields repeatedly while `run` is set, counting iterations in *counter
+	barrier->wait();	// wait on the shared barrier (main creates it for nthreads + 1 parties and waits on it too)
+	// /* paranoid */ assert( true == __atomic_load_n(&run, __ATOMIC_RELAXED) );
+
+	while(__atomic_load_n(&run, __ATOMIC_RELAXED)) {
+		Fibre::forceYield();
+		// fibre_yield();
+		counter->value++;	// one increment per forceYield() call
+	}
+	__atomic_fetch_add(&global_counter, counter->value, __ATOMIC_SEQ_CST);	// fold this fibre's count into the global total once the loop has ended
+}
+
+int main(int argc, char * argv[]) {	// yield benchmark driver: parse options, run the fibres for the requested time, print yield statistics
+	double duration = 5;	// -d: requested run time in seconds; later overwritten with the measured run time
+	int nprocs = 1;	// -p: value passed to addWorkers()
+	int nthreads = 1;	// -t: number of Fibre objects (and counters) created
+	// Adopt the environment's locale for both iostream and C stdio; the %' printf conversions below rely on it for digit grouping.
+	std::cout.imbue(std::locale(""));
+	setlocale(LC_ALL, "");
+
+	for(;;) {
+		static struct option options[] = {
+			{"duration",  required_argument, 0, 'd'},
+			{"nprocs",    required_argument, 0, 'p'},
+			{"nthreads",  required_argument, 0, 't'},
+			{0, 0, 0, 0}
+		};
+
+		int idx = 0;
+		int opt = getopt_long(argc, argv, "d:p:t:", options, &idx);
+
+		// optarg may be null; substitute "" so the strto* calls and the messages below always get a valid string
+		const char * arg = optarg ? optarg : "";
+		char * end;
+		switch(opt) {
+			case -1:	// no more options: leave the parsing loop
+				goto run;
+			// Numeric Arguments
+			case 'd':
+				duration = strtod(arg, &end);
+				if(*end != '\0') {
+					fprintf(stderr, "Duration must be a valid double, was %s\n", arg);
+					goto usage;
+				}
+				break;
+			case 't':
+				nthreads = strtoul(arg, &end, 10);
+				if(*end != '\0' || nthreads < 1) {
+					fprintf(stderr, "Number of threads must be a positive integer, was %s\n", arg);
+					goto usage;
+				}
+				break;
+			case 'p':
+				nprocs = strtoul(arg, &end, 10);
+				if(*end != '\0' || nprocs < 1) {
+					fprintf(stderr, "Number of processors must be a positive integer, was %s\n", arg);
+					goto usage;
+				}
+				break;
+			// Other cases
+			default: /* ? */
+				fprintf( stderr, "Unknown option '%c'\n", opt);
+			usage:	// reached by fall-through from default and by goto from the argument checks above
+				fprintf( stderr, "Usage: %s [options]\n", argv[0]);
+				fprintf( stderr, "\n" );
+				fprintf( stderr, "  -d, --duration=DURATION  Duration of the experiment, in seconds\n" );
+				fprintf( stderr, "  -t, --nthreads=NTHREADS  Number of threads (fibres)\n" );
+				fprintf( stderr, "  -p, --nprocs=NPROCS      Number of processors\n" );
+				exit(1);
+		}
+	}
+	run:
+
+	{
+		printf("Running %d threads on %d processors for %lf seconds\n", nthreads, nprocs, duration);
+
+		FibreInit();
+		barrier = new FibreBarrier(nthreads + 1);	// + 1 because main also waits on the barrier
+		{
+			Context::CurrCluster().addWorkers(nprocs);
+			{
+				counter_t counters[nthreads];
+				Fibre threads[nthreads];
+				for(int i = 0; i < nthreads; i++) {
+					threads[i].run(fibre_main, &counters[i]);	// each fibre gets its own counter
+				}
+				printf("Starting\n");
+				bool is_tty = isatty(STDOUT_FILENO);	// the in-place progress display is only drawn on a terminal
+				auto before = Clock::now();
+				run = true;	// must be set before main joins the barrier: fibres test it right after their own wait()
+
+				barrier->wait();
+				for(;;) {
+					usleep(500'000);	// re-check the elapsed time every 0.5 s
+					auto now = Clock::now();
+					duration_t durr = now - before;
+					if( durr.count() > duration ) {
+						break;
+					}
+					if(is_tty) {
+						std::cout << "\r" << std::setprecision(4) << durr.count();
+						std::cout.flush();
+					}
+				}
+
+				auto after = Clock::now();
+				duration_t durr = after - before;
+				duration = durr.count();	// from here on `duration` is the measured run time, not the requested one
+				run = false;	// fibres leave their loop and add their counts to global_counter
+				printf("\nDone\n");
+				for(auto & thread : threads) {
+					thread.join();
+				}
+
+				// for(const auto & counter : counters) {
+				// 	std::cout << counter.value << std::endl;
+				// }
+			}
+		}
+
+		auto dur_nano = duration_cast<std::nano>(duration);	// measured run time expressed in nanoseconds
+
+		std::cout << "Took " << duration << " s\n";
+		printf("Total yields        : %'15llu\n", global_counter );
+		printf("Yields per procs    : %'15llu\n", global_counter / nprocs );
+		printf("Yields per second   : %'18.2lf\n", ((double)global_counter) / duration );
+		printf("Yields/sec/procs    : %'18.2lf\n", (((double)global_counter) / nprocs) / duration );
+		printf("ns per yields       : %'18.2lf\n", dur_nano / global_counter );
+		printf("ns per yields/procs : %'18.2lf\n", dur_nano / (global_counter / nprocs) );
+
+	}
+}
Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision 37ba6628ac22d4f11d3fb42a2c573168ac19bd03)
+++ libcfa/src/concurrency/kernel.cfa	(revision 04b5cef29b60d0ea2bc49962adba72c424a3e55e)
@@ -342,4 +342,5 @@
 				/* paranoid */ verifyf( readyThread->state == Ready || readyThread->preempted != __NO_PREEMPTION, "state : %d, preempted %d\n", readyThread->state, readyThread->preempted);
 				/* paranoid */ verifyf( readyThread->link.next == 0p, "Expected null got %p", readyThread->link.next );
+				__builtin_prefetch( readyThread->context.SP );
 
 				// We found a thread run it
Index: libcfa/src/concurrency/ready_queue.cfa
===================================================================
--- libcfa/src/concurrency/ready_queue.cfa	(revision 37ba6628ac22d4f11d3fb42a2c573168ac19bd03)
+++ libcfa/src/concurrency/ready_queue.cfa	(revision 04b5cef29b60d0ea2bc49962adba72c424a3e55e)
@@ -24,4 +24,6 @@
 #include "math.hfa"
 
+#include <unistd.h>
+
 static const size_t cache_line_size = 64;
 
@@ -31,4 +33,6 @@
 	#define __CFA_MAX_PROCESSORS__ 1024
 #endif
+
+#define BIAS 64	// when defined, lane selection uses the current processor's own 4 lanes (id * 4 .. id * 4 + 3) in (BIAS - 1) out of BIAS picks, and any lane otherwise
 
 // returns the maximum number of processors the RWLock support
@@ -568,5 +572,23 @@
 	do {
 		// Pick the index of a lane
-		i = __tls_rand() % lanes.count;
+		#if defined(BIAS)
+			unsigned r = __tls_rand();
+			unsigned rlow  = r % BIAS;
+			unsigned rhigh = r / BIAS;
+			if(0 != (rlow % BIAS) && kernelTLS.this_processor) {
+				// (BIAS - 1) out of BIAS chances
+				// Use preferred queues
+				i = (kernelTLS.this_processor->id * 4) + (rhigh % 4);
+			}
+			else {
+				// 1 out of BIAS chances
+				// Use all queues
+				i = rhigh;
+			}
+		#else
+			i = __tls_rand();
+		#endif
+
+		i %= __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
 
 		#if !defined(__CFA_NO_STATISTICS__)
@@ -666,6 +688,22 @@
 	while( query(snzi) ) {
 		// Pick two lists at random
-		int i = __tls_rand() % __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
-		int j = __tls_rand() % __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
+		#if defined(BIAS)
+			unsigned i = __tls_rand();
+			unsigned j = __tls_rand();
+
+			if(0 == (i % BIAS)) {
+				i = i / BIAS;
+			}
+			else {
+				i = ((kernelTLS.this_processor->id * 4) + ((i / BIAS) % 4));
+				j = ((kernelTLS.this_processor->id * 4) + ((j / BIAS) % 4));
+			}
+		#else
+			unsigned i = __tls_rand();
+			unsigned j = __tls_rand();
+		#endif
+
+		i %= __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
+ 		j %= __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
 
 		// try popping from the 2 picked lists
Index: libcfa/src/containers/stackLockFree.hfa
===================================================================
--- libcfa/src/containers/stackLockFree.hfa	(revision 37ba6628ac22d4f11d3fb42a2c573168ac19bd03)
+++ libcfa/src/containers/stackLockFree.hfa	(revision 04b5cef29b60d0ea2bc49962adba72c424a3e55e)
@@ -1,9 +1,9 @@
-// 
+//
 // Cforall Version 1.0.0 Copyright (C) 2017 University of Waterloo
 // The contents of this file are covered under the licence agreement in the
 // file "LICENCE" distributed with Cforall.
 //
-// stackLockFree.hfa -- 
-// 
+// stackLockFree.hfa --
+//
 // Author           : Peter A. Buhr
 // Created On       : Wed May 13 20:58:58 2020
@@ -11,5 +11,5 @@
 // Last Modified On : Mon May 18 13:30:08 2020
 // Update Count     : 55
-// 
+//
 
 #pragma once
@@ -20,5 +20,5 @@
 union Link {
 	struct {									// 32/64-bit x 2
-		T * top;								// pointer to stack top
+		T * volatile top;								// pointer to stack top
 		uintptr_t count;						// count each push
 	};