Index: libcfa/src/concurrency/invoke.h
===================================================================
--- libcfa/src/concurrency/invoke.h	(revision d29255c47c5f62b1d3bcf2ca5a9677a99652b965)
+++ libcfa/src/concurrency/invoke.h	(revision 8834751449046f04c469e5691eabf7a075e5e70a)
@@ -48,5 +48,6 @@
 		extern __attribute__((aligned(128))) thread_local struct KernelThreadData {
 			struct $thread    * volatile this_thread;
-			struct processor      * volatile this_processor;
+			struct processor  * volatile this_processor;
+			struct __stats_t  * volatile this_stats;
 
 			struct {
Index: libcfa/src/concurrency/io.cfa
===================================================================
--- libcfa/src/concurrency/io.cfa	(revision d29255c47c5f62b1d3bcf2ca5a9677a99652b965)
+++ libcfa/src/concurrency/io.cfa	(revision 8834751449046f04c469e5691eabf7a075e5e70a)
@@ -135,26 +135,4 @@
 		void * ring_ptr;
 		size_t ring_sz;
-
-		// Statistics
-		#if !defined(__CFA_NO_STATISTICS__)
-			struct {
-				struct {
-					volatile unsigned long long int rdy;
-					volatile unsigned long long int csm;
-					volatile unsigned long long int avl;
-					volatile unsigned long long int cnt;
-				} submit_avg;
-				struct {
-					volatile unsigned long long int val;
-					volatile unsigned long long int cnt;
-					volatile unsigned long long int block;
-				} look_avg;
-				struct {
-					volatile unsigned long long int val;
-					volatile unsigned long long int cnt;
-					volatile unsigned long long int block;
-				} alloc_avg;
-			} stats;
-		#endif
 	};
 
@@ -177,15 +155,4 @@
 		void * ring_ptr;
 		size_t ring_sz;
-
-		// Statistics
-		#if !defined(__CFA_NO_STATISTICS__)
-			struct {
-				struct {
-					unsigned long long int val;
-					unsigned long long int slow_cnt;
-					unsigned long long int fast_cnt;
-				} completed_avg;
-			} stats;
-		#endif
 	};
 
@@ -331,21 +298,4 @@
 		(this.io->submit){ min(*sq.num, *cq.num) };
 
-		// Initialize statistics
-		#if !defined(__CFA_NO_STATISTICS__)
-			this.io->submit_q.stats.submit_avg.rdy = 0;
-			this.io->submit_q.stats.submit_avg.csm = 0;
-			this.io->submit_q.stats.submit_avg.avl = 0;
-			this.io->submit_q.stats.submit_avg.cnt = 0;
-			this.io->submit_q.stats.look_avg.val   = 0;
-			this.io->submit_q.stats.look_avg.cnt   = 0;
-			this.io->submit_q.stats.look_avg.block = 0;
-			this.io->submit_q.stats.alloc_avg.val   = 0;
-			this.io->submit_q.stats.alloc_avg.cnt   = 0;
-			this.io->submit_q.stats.alloc_avg.block = 0;
-			this.io->completion_q.stats.completed_avg.val = 0;
-			this.io->completion_q.stats.completed_avg.slow_cnt = 0;
-			this.io->completion_q.stats.completed_avg.fast_cnt = 0;
-		#endif
-
 		if(!main_cluster) {
 			__kernel_io_finish_start( this );
@@ -437,58 +387,4 @@
 			__kernel_io_prepare_stop( this );
 		}
-
-		// print statistics
-		#if !defined(__CFA_NO_STATISTICS__)
-			if(this.print_stats) {
-				with(this.io->submit_q.stats, this.io->completion_q.stats) {
-					double avgrdy = ((double)submit_avg.rdy) / submit_avg.cnt;
-					double avgcsm = ((double)submit_avg.csm) / submit_avg.cnt;
-					double avgavl = ((double)submit_avg.avl) / submit_avg.cnt;
-
-					double lavgv = 0;
-					double lavgb = 0;
-					if(look_avg.cnt != 0) {
-						lavgv = ((double)look_avg.val  ) / look_avg.cnt;
-						lavgb = ((double)look_avg.block) / look_avg.cnt;
-					}
-
-					double aavgv = 0;
-					double aavgb = 0;
-					if(alloc_avg.cnt != 0) {
-						aavgv = ((double)alloc_avg.val  ) / alloc_avg.cnt;
-						aavgb = ((double)alloc_avg.block) / alloc_avg.cnt;
-					}
-
-					__cfaabi_bits_print_safe( STDOUT_FILENO,
-						"----- I/O uRing Stats -----\n"
-						"- total submit calls     : %'15llu\n"
-						"- avg ready entries      : %'18.2lf\n"
-						"- avg submitted entries  : %'18.2lf\n"
-						"- avg available entries  : %'18.2lf\n"
-						"- total ready search     : %'15llu\n"
-						"- avg ready search len   : %'18.2lf\n"
-						"- avg ready search block : %'18.2lf\n"
-						"- total alloc search     : %'15llu\n"
-						"- avg alloc search len   : %'18.2lf\n"
-						"- avg alloc search block : %'18.2lf\n"
-						"- total wait calls       : %'15llu   (%'llu slow, %'llu fast)\n"
-						"- avg completion/wait    : %'18.2lf\n",
-						submit_avg.cnt,
-						avgrdy,
-						avgcsm,
-						avgavl,
-						look_avg.cnt,
-						lavgv,
-						lavgb,
-						alloc_avg.cnt,
-						aavgv,
-						aavgb,
-						completed_avg.slow_cnt + completed_avg.fast_cnt,
-						completed_avg.slow_cnt,  completed_avg.fast_cnt,
-						((double)completed_avg.val) / (completed_avg.slow_cnt + completed_avg.fast_cnt)
-					);
-				}
-			}
-		#endif
 
 		// Shutdown the io rings
@@ -578,8 +474,8 @@
 		// update statistics
 		#if !defined(__CFA_NO_STATISTICS__)
-			ring.submit_q.stats.submit_avg.rdy += to_submit;
-			ring.submit_q.stats.submit_avg.csm += ret;
-			ring.submit_q.stats.submit_avg.avl += avail;
-			ring.submit_q.stats.submit_avg.cnt += 1;
+			__tls_stats()->io.submit_q.stats.submit_avg.rdy += to_submit;
+			__tls_stats()->io.submit_q.stats.submit_avg.csm += ret;
+			__tls_stats()->io.submit_q.stats.submit_avg.avl += avail;
+			__tls_stats()->io.submit_q.stats.submit_avg.cnt += 1;
 		#endif
 
@@ -655,6 +551,6 @@
 				// Update statistics
 				#if !defined(__CFA_NO_STATISTICS__)
-					ring.completion_q.stats.completed_avg.val += count;
-					ring.completion_q.stats.completed_avg.slow_cnt += 1;
+					__tls_stats()->io.complete_q.stats.completed_avg.val += count;
+					__tls_stats()->io.complete_q.stats.completed_avg.slow_cnt += 1;
 				#endif
 
@@ -675,6 +571,6 @@
 				// Update statistics
 				#if !defined(__CFA_NO_STATISTICS__)
-					ring.completion_q.stats.completed_avg.val += count;
-					ring.completion_q.stats.completed_avg.slow_cnt += 1;
+					__tls_stats()->io.complete_q.stats.completed_avg.val += count;
+					__tls_stats()->io.complete_q.stats.completed_avg.slow_cnt += 1;
 				#endif
 			}
@@ -708,6 +604,6 @@
 			// Update statistics
 			#if !defined(__CFA_NO_STATISTICS__)
-				this.ring->completion_q.stats.completed_avg.val += count;
-				this.ring->completion_q.stats.completed_avg.fast_cnt += 1;
+				__tls_stats()->io.complete_q.stats.completed_avg.val += count;
+				__tls_stats()->io.complete_q.stats.completed_avg.fast_cnt += 1;
 			#endif
 
@@ -792,7 +688,7 @@
 					// update statistics
 					#if !defined(__CFA_NO_STATISTICS__)
-						__atomic_fetch_add( &ring.submit_q.stats.alloc_avg.val,   len,   __ATOMIC_RELAXED );
-						__atomic_fetch_add( &ring.submit_q.stats.alloc_avg.block, block, __ATOMIC_RELAXED );
-						__atomic_fetch_add( &ring.submit_q.stats.alloc_avg.cnt,   1,     __ATOMIC_RELAXED );
+						__tls_stats()->io.submit_q.stats.alloc_avg.val   += len;
+						__tls_stats()->io.submit_q.stats.alloc_avg.block += block;
+						__tls_stats()->io.submit_q.stats.alloc_avg.cnt   += 1;
 					#endif
 
@@ -847,7 +743,7 @@
 			// update statistics
 			#if !defined(__CFA_NO_STATISTICS__)
-				__atomic_fetch_add( &ring.submit_q.stats.look_avg.val,   len,   __ATOMIC_RELAXED );
-				__atomic_fetch_add( &ring.submit_q.stats.look_avg.block, block, __ATOMIC_RELAXED );
-				__atomic_fetch_add( &ring.submit_q.stats.look_avg.cnt,   1,     __ATOMIC_RELAXED );
+				__tls_stats()->io.submit_q.stats.look_avg.val   += len;
+				__tls_stats()->io.submit_q.stats.look_avg.block += block;
+				__tls_stats()->io.submit_q.stats.look_avg.cnt   += 1;
 			#endif
 
@@ -876,6 +772,6 @@
 			// update statistics
 			#if !defined(__CFA_NO_STATISTICS__)
-				ring.submit_q.stats.submit_avg.csm += 1;
-				ring.submit_q.stats.submit_avg.cnt += 1;
+				__tls_stats()->io.submit_q.stats.submit_avg.csm += 1;
+				__tls_stats()->io.submit_q.stats.submit_avg.cnt += 1;
 			#endif
 
Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision d29255c47c5f62b1d3bcf2ca5a9677a99652b965)
+++ libcfa/src/concurrency/kernel.cfa	(revision 8834751449046f04c469e5691eabf7a075e5e70a)
@@ -130,4 +130,7 @@
 KERNEL_STORAGE(__stack_t,            mainThreadCtx);
 KERNEL_STORAGE(__scheduler_RWLock_t, __scheduler_lock);
+#if !defined(__CFA_NO_STATISTICS__)
+KERNEL_STORAGE(__stats_t, mainProcStats);
+#endif
 
 cluster              * mainCluster;
@@ -146,4 +149,5 @@
 thread_local struct KernelThreadData kernelTLS __attribute__ ((tls_model ( "initial-exec" ))) = {
 	NULL,												// cannot use 0p
+	NULL,
 	NULL,
 	{ 1, false, false },
@@ -268,4 +272,6 @@
 	#if !defined(__CFA_NO_STATISTICS__)
 		print_stats = false;
+		stats = alloc();
+		__init_stats( stats );
 	#endif
 
@@ -281,4 +287,11 @@
 void ^?{}(cluster & this) {
 	__kernel_io_shutdown( this, &this == mainCluster );
+
+	#if !defined(__CFA_NO_STATISTICS__)
+		if(this.print_stats) {
+			__print_stats( this.stats );
+		}
+		free( this.stats );
+	#endif
 
 	unregister(this);
@@ -357,6 +370,4 @@
 
 	__cfadbg_print_safe(runtime_core, "Kernel : core %p terminated\n", this);
-
-	stats_tls_tally(this->cltr);
 }
 
@@ -479,4 +490,10 @@
 // It effectively constructs a coroutine by stealing the pthread stack
 static void * __invoke_processor(void * arg) {
+	#if !defined( __CFA_NO_STATISTICS__ )
+		__stats_t local_stats;
+		__init_stats( &local_stats );
+		kernelTLS.this_stats = &local_stats;
+	#endif
+
 	processor * proc = (processor *) arg;
 	kernelTLS.this_processor = proc;
@@ -509,4 +526,8 @@
 	// Main routine of the core returned, the core is now fully terminated
 	__cfadbg_print_safe(runtime_core, "Kernel : core %p main ended (%p)\n", proc, &proc->runner);
+
+	#if !defined(__CFA_NO_STATISTICS__)
+		__tally_stats(proc->cltr->stats, &local_stats);
+	#endif
 
 	return 0p;
@@ -794,4 +815,9 @@
 	kernelTLS.this_thread    = mainThread;
 
+	#if !defined( __CFA_NO_STATISTICS__ )
+		kernelTLS.this_stats = (__stats_t *)& storage_mainProcStats;
+		__init_stats( kernelTLS.this_stats );
+	#endif
+
 	// Enable preemption
 	kernel_start_preemption();
@@ -874,34 +900,34 @@
 //=============================================================================================
 static $thread * __halt(processor * this) with( *this ) {
-	if( do_terminate ) return 0p;
-
-	// First, lock the cluster idle
-	lock( cltr->idle_lock __cfaabi_dbg_ctx2 );
-
-	// Check if we can find a thread
-	if( $thread * found = __next_thread( cltr ) ) {
-		unlock( cltr->idle_lock );
-		return found;
-	}
-
-	// Move this processor from the active list to the idle list
-	move_to_front(cltr->procs, cltr->idles, *this);
-
-	// Unlock the idle lock so we don't go to sleep with a lock
-	unlock    (cltr->idle_lock);
-
-	// We are ready to sleep
-	__cfadbg_print_safe(runtime_core, "Kernel : Processor %p ready to sleep\n", this);
-	wait( idle );
-
-	// We have woken up
-	__cfadbg_print_safe(runtime_core, "Kernel : Processor %p woke up and ready to run\n", this);
-
-	// Get ourself off the idle list
-	with( *cltr ) {
-		lock  (idle_lock __cfaabi_dbg_ctx2);
-		move_to_front(idles, procs, *this);
-		unlock(idle_lock);
-	}
+	// if( do_terminate ) return 0p;
+
+	// // First, lock the cluster idle
+	// lock( cltr->idle_lock __cfaabi_dbg_ctx2 );
+
+	// // Check if we can find a thread
+	// if( $thread * found = __next_thread( cltr ) ) {
+	// 	unlock( cltr->idle_lock );
+	// 	return found;
+	// }
+
+	// // Move this processor from the active list to the idle list
+	// move_to_front(cltr->procs, cltr->idles, *this);
+
+	// // Unlock the idle lock so we don't go to sleep with a lock
+	// unlock    (cltr->idle_lock);
+
+	// // We are ready to sleep
+	// __cfadbg_print_safe(runtime_core, "Kernel : Processor %p ready to sleep\n", this);
+	// wait( idle );
+
+	// // We have woken up
+	// __cfadbg_print_safe(runtime_core, "Kernel : Processor %p woke up and ready to run\n", this);
+
+	// // Get ourself off the idle list
+	// with( *cltr ) {
+	// 	lock  (idle_lock __cfaabi_dbg_ctx2);
+	// 	move_to_front(idles, procs, *this);
+	// 	unlock(idle_lock);
+	// }
 
 	// Don't check the ready queue again, we may not be in a position to run a thread
@@ -911,34 +937,38 @@
 // Wake a thread from the front if there are any
 static bool __wake_one(cluster * this) {
-	// First, lock the cluster idle
-	lock( this->idle_lock __cfaabi_dbg_ctx2 );
-
-	// Check if there is someone to wake up
-	if( !this->idles.head ) {
-		// Nope unlock and return false
-		unlock( this->idle_lock );
-		return false;
-	}
-
-	// Wake them up
-	__cfadbg_print_safe(runtime_core, "Kernel : waking Processor %p\n", this->idles.head);
-	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
-	post( this->idles.head->idle );
-
-	// Unlock and return true
-	unlock( this->idle_lock );
-	return true;
+	// // First, lock the cluster idle
+	// lock( this->idle_lock __cfaabi_dbg_ctx2 );
+
+	// // Check if there is someone to wake up
+	// if( !this->idles.head ) {
+	// 	// Nope unlock and return false
+	// 	unlock( this->idle_lock );
+	// 	return false;
+	// }
+
+	// // Wake them up
+	// __cfadbg_print_safe(runtime_core, "Kernel : waking Processor %p\n", this->idles.head);
+	// /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	// post( this->idles.head->idle );
+
+	// // Unlock and return true
+	// unlock( this->idle_lock );
+	// return true;
+
+	return false;
 }
 
 // Unconditionnaly wake a thread
 static bool __wake_proc(processor * this) {
-	__cfadbg_print_safe(runtime_core, "Kernel : waking Processor %p\n", this);
-
-	disable_interrupts();
-		/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
-		bool ret = post( this->idle );
-	enable_interrupts( __cfaabi_dbg_ctx );
-
-	return ret;
+	// __cfadbg_print_safe(runtime_core, "Kernel : waking Processor %p\n", this);
+
+	// disable_interrupts();
+	// 	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	// 	bool ret = post( this->idle );
+	// enable_interrupts( __cfaabi_dbg_ctx );
+
+	// return ret;
+
+	return false;
 }
 
Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision d29255c47c5f62b1d3bcf2ca5a9677a99652b965)
+++ libcfa/src/concurrency/kernel.hfa	(revision 8834751449046f04c469e5691eabf7a075e5e70a)
@@ -50,4 +50,8 @@
 struct __processor_id_t {
 	unsigned id;
+
+	#if !defined(__CFA_NO_STATISTICS__)
+		struct __stats_t * stats;
+	#endif
 };
 
@@ -165,43 +169,4 @@
 		volatile size_t count;
 	} lanes;
-
-	// Statistics
-	#if !defined(__CFA_NO_STATISTICS__)
-		struct __attribute__((aligned(64))) {
-			struct {
-				// Push statistic
-				struct {
-					// number of attemps at pushing something
-					volatile size_t attempt;
-
-					// number of successes at pushing
-					volatile size_t success;
-				} push;
-
-				// Pop statistic
-				struct {
-					// number of reads of the mask
-					// picking an empty __cfa_readyQ_mask_t counts here
-					// but not as an attempt
-					volatile size_t maskrds;
-
-					// number of attemps at poping something
-					volatile size_t attempt;
-
-					// number of successes at poping
-					volatile size_t success;
-				} pop;
-			} pick;
-
-			// stats on the "used" struct of the queue
-			// tracks average number of queues that are not empty
-			// when pushing / poping
-			struct {
-				volatile size_t value;
-				volatile size_t count;
-			} used;
-		} global_stats;
-
-	#endif
 };
 
@@ -242,4 +207,5 @@
 	#if !defined(__CFA_NO_STATISTICS__)
 		bool print_stats;
+		struct __stats_t * stats;
 	#endif
 };
Index: libcfa/src/concurrency/kernel_private.hfa
===================================================================
--- libcfa/src/concurrency/kernel_private.hfa	(revision d29255c47c5f62b1d3bcf2ca5a9677a99652b965)
+++ libcfa/src/concurrency/kernel_private.hfa	(revision 8834751449046f04c469e5691eabf7a075e5e70a)
@@ -20,4 +20,5 @@
 
 #include "alarm.hfa"
+#include "stats.hfa"
 
 
@@ -237,7 +238,9 @@
 // Statics call at the end of each thread to register statistics
 #if !defined(__CFA_NO_STATISTICS__)
-void stats_tls_tally(struct cluster * cltr);
-#else
-static inline void stats_tls_tally(struct cluster * cltr) {}
+static inline struct __stats_t * __tls_stats() {
+	/* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+	/* paranoid */ verify( kernelTLS.this_stats );
+	return kernelTLS.this_stats;
+}
 #endif
 
Index: libcfa/src/concurrency/preemption.cfa
===================================================================
--- libcfa/src/concurrency/preemption.cfa	(revision d29255c47c5f62b1d3bcf2ca5a9677a99652b965)
+++ libcfa/src/concurrency/preemption.cfa	(revision 8834751449046f04c469e5691eabf7a075e5e70a)
@@ -269,4 +269,7 @@
 // reserved for future use
 static void timeout( struct __processor_id_t * id, $thread * this ) {
+	#if !defined( __CFA_NO_STATISTICS__ )
+		kernelTLS.this_stats = this->curr_cluster->stats;
+	#endif
 	__unpark( id, this __cfaabi_dbg_ctx2 );
 }
Index: libcfa/src/concurrency/ready_queue.cfa
===================================================================
--- libcfa/src/concurrency/ready_queue.cfa	(revision d29255c47c5f62b1d3bcf2ca5a9677a99652b965)
+++ libcfa/src/concurrency/ready_queue.cfa	(revision 8834751449046f04c469e5691eabf7a075e5e70a)
@@ -535,33 +535,4 @@
 //=======================================================================
 
-// Thread local mirror of ready queue statistics
-#if !defined(__CFA_NO_STATISTICS__)
-static __attribute__((aligned(128))) thread_local struct {
-	struct {
-		struct {
-			size_t attempt;
-			size_t success;
-		} push;
-		struct {
-			size_t maskrds;
-			size_t attempt;
-			size_t success;
-		} pop;
-	} pick;
-	struct {
-		size_t value;
-		size_t count;
-	} used;
-} tls = {
-	/* pick */{
-		/* push */{ 0, 0 },
-		/* pop  */{ 0, 0, 0 },
-	},
-	/* used */{ 0, 0 }
-};
-#endif
-
-//-----------------------------------------------------------------------
-
 void ?{}(__ready_queue_t & this) with (this) {
 
@@ -572,15 +543,4 @@
 	lanes.count = 4;
 	snzi{ log2( lanes.count / 8 ) };
-
-	#if !defined(__CFA_NO_STATISTICS__)
-		global_stats.pick.push.attempt = 0;
-		global_stats.pick.push.success = 0;
-		global_stats.pick.pop .maskrds = 0;
-		global_stats.pick.pop .attempt = 0;
-		global_stats.pick.pop .success = 0;
-
-		global_stats.used.value = 0;
-		global_stats.used.count = 0;
-	#endif
 }
 
@@ -611,5 +571,5 @@
 
 		#if !defined(__CFA_NO_STATISTICS__)
-			tls.pick.push.attempt++;
+			__tls_stats()->ready.pick.push.attempt++;
 		#endif
 
@@ -638,5 +598,5 @@
 	// Update statistics
 	#if !defined(__CFA_NO_STATISTICS__)
-		tls.pick.push.success++;
+		__tls_stats()->ready.pick.push.success++;
 	#endif
 
@@ -649,5 +609,5 @@
 static struct $thread * try_pop(struct cluster * cltr, unsigned i, unsigned j) with (cltr->ready_queue) {
 	#if !defined(__CFA_NO_STATISTICS__)
-		tls.pick.pop.attempt++;
+		__tls_stats()->ready.pick.pop.attempt++;
 	#endif
 
@@ -692,5 +652,5 @@
 	// Update statistics
 	#if !defined(__CFA_NO_STATISTICS__)
-		tls.pick.pop.success++;
+		__tls_stats()->ready.pick.pop.success++;
 	#endif
 
@@ -895,17 +855,2 @@
 	ready_mutate_unlock( last_size );
 }
-
-//-----------------------------------------------------------------------
-
-#if !defined(__CFA_NO_STATISTICS__)
-void stats_tls_tally(struct cluster * cltr) with (cltr->ready_queue) {
-	__atomic_fetch_add( &global_stats.pick.push.attempt, tls.pick.push.attempt, __ATOMIC_SEQ_CST );
-	__atomic_fetch_add( &global_stats.pick.push.success, tls.pick.push.success, __ATOMIC_SEQ_CST );
-	__atomic_fetch_add( &global_stats.pick.pop .maskrds, tls.pick.pop .maskrds, __ATOMIC_SEQ_CST );
-	__atomic_fetch_add( &global_stats.pick.pop .attempt, tls.pick.pop .attempt, __ATOMIC_SEQ_CST );
-	__atomic_fetch_add( &global_stats.pick.pop .success, tls.pick.pop .success, __ATOMIC_SEQ_CST );
-
-	__atomic_fetch_add( &global_stats.used.value, tls.used.value, __ATOMIC_SEQ_CST );
-	__atomic_fetch_add( &global_stats.used.count, tls.used.count, __ATOMIC_SEQ_CST );
-}
-#endif
Index: libcfa/src/concurrency/stats.cfa
===================================================================
--- libcfa/src/concurrency/stats.cfa	(revision d29255c47c5f62b1d3bcf2ca5a9677a99652b965)
+++ libcfa/src/concurrency/stats.cfa	(revision 8834751449046f04c469e5691eabf7a075e5e70a)
@@ -0,0 +1,128 @@
+#include <stdint.h>
+#include <stdlib.hfa>
+
+#include <unistd.h>								// STDERR_FILENO
+#include "bits/debug.hfa"
+#include "stats.hfa"
+
+#if !defined(__CFA_NO_STATISTICS__)
+	void __init_stats( struct __stats_t * stats ) {
+		stats->ready.pick.push.attempt = 0;
+		stats->ready.pick.push.success = 0;
+		stats->ready.pick.pop .probe   = 0;
+		stats->ready.pick.pop .attempt = 0;
+		stats->ready.pick.pop .success = 0;
+
+		#if defined(HAVE_LINUX_IO_URING_H)
+			stats->io.submit_q.submit_avg.rdy = 0;
+			stats->io.submit_q.submit_avg.csm = 0;
+			stats->io.submit_q.submit_avg.avl = 0;
+			stats->io.submit_q.submit_avg.cnt = 0;
+			stats->io.submit_q.look_avg.val   = 0;
+			stats->io.submit_q.look_avg.cnt   = 0;
+			stats->io.submit_q.look_avg.block = 0;
+			stats->io.submit_q.alloc_avg.val   = 0;
+			stats->io.submit_q.alloc_avg.cnt   = 0;
+			stats->io.submit_q.alloc_avg.block = 0;
+			stats->io.complete_q.completed_avg.val = 0;
+			stats->io.complete_q.completed_avg.slow_cnt = 0;
+			stats->io.complete_q.completed_avg.fast_cnt = 0;
+		#endif
+	}
+
+	void __tally_stats( struct __stats_t * cltr, struct __stats_t * proc ) {
+		__atomic_fetch_add( &cltr->ready.pick.push.attempt, proc->ready.pick.push.attempt, __ATOMIC_SEQ_CST );
+		__atomic_fetch_add( &cltr->ready.pick.push.success, proc->ready.pick.push.success, __ATOMIC_SEQ_CST );
+		__atomic_fetch_add( &cltr->ready.pick.pop .probe  , proc->ready.pick.pop .probe  , __ATOMIC_SEQ_CST );
+		__atomic_fetch_add( &cltr->ready.pick.pop .attempt, proc->ready.pick.pop .attempt, __ATOMIC_SEQ_CST );
+		__atomic_fetch_add( &cltr->ready.pick.pop .success, proc->ready.pick.pop .success, __ATOMIC_SEQ_CST );
+
+		#if defined(HAVE_LINUX_IO_URING_H)
+			__atomic_fetch_add( &cltr->io.submit_q.submit_avg.rdy          , proc->io.submit_q.submit_avg.rdy          , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.submit_avg.csm          , proc->io.submit_q.submit_avg.csm          , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.submit_avg.avl          , proc->io.submit_q.submit_avg.avl          , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.submit_avg.cnt          , proc->io.submit_q.submit_avg.cnt          , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.look_avg.val            , proc->io.submit_q.look_avg.val            , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.look_avg.cnt            , proc->io.submit_q.look_avg.cnt            , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.look_avg.block          , proc->io.submit_q.look_avg.block          , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.alloc_avg.val           , proc->io.submit_q.alloc_avg.val           , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.alloc_avg.cnt           , proc->io.submit_q.alloc_avg.cnt           , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.submit_q.alloc_avg.block         , proc->io.submit_q.alloc_avg.block         , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.complete_q.completed_avg.val     , proc->io.complete_q.completed_avg.val     , __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.complete_q.completed_avg.slow_cnt, proc->io.complete_q.completed_avg.slow_cnt, __ATOMIC_SEQ_CST );
+			__atomic_fetch_add( &cltr->io.complete_q.completed_avg.fast_cnt, proc->io.complete_q.completed_avg.fast_cnt, __ATOMIC_SEQ_CST );
+		#endif
+	}
+
+	void __print_stats( struct __stats_t * stats ) {
+
+		double push_sur = (100.0 * ((double)stats->ready.pick.push.success) / stats->ready.pick.push.attempt);
+		double pop_sur  = (100.0 * ((double)stats->ready.pick.pop .success) / stats->ready.pick.pop .attempt);
+
+		double push_len = ((double)stats->ready.pick.push.attempt) / stats->ready.pick.push.success;
+		double pop_len  = ((double)stats->ready.pick.pop .attempt) / stats->ready.pick.pop .success;
+
+		#if defined(HAVE_LINUX_IO_URING_H)
+			double avgrdy = ((double)submit_avg.rdy) / submit_avg.cnt;
+			double avgcsm = ((double)submit_avg.csm) / submit_avg.cnt;
+			double avgavl = ((double)submit_avg.avl) / submit_avg.cnt;
+
+			double lavgv = 0;
+			double lavgb = 0;
+			if(look_avg.cnt != 0) {
+				lavgv = ((double)look_avg.val  ) / look_avg.cnt;
+				lavgb = ((double)look_avg.block) / look_avg.cnt;
+			}
+
+			double aavgv = 0;
+			double aavgb = 0;
+			if(alloc_avg.cnt != 0) {
+				aavgv = ((double)alloc_avg.val  ) / alloc_avg.cnt;
+				aavgb = ((double)alloc_avg.block) / alloc_avg.cnt;
+			}
+		#endif
+
+		__cfaabi_bits_print_safe( STDOUT_FILENO,
+			"----- Ready Q Stats -----\n"
+			"- total threads run      : %'15lu\n"
+			"- total threads scheduled: %'15lu\n"
+			"- push average probe len : %'18.2lf, %'18.2lf%% (%'15lu attempts)\n"
+			"- pop  average probe len : %'18.2lf, %'18.2lf%% (%'15lu attempts)\n"
+			#if defined(HAVE_LINUX_IO_URING_H)
+				"\n"
+				"----- I/O Stats -----\n"
+				"- total submit calls     : %'15llu\n"
+				"- avg ready entries      : %'18.2lf\n"
+				"- avg submitted entries  : %'18.2lf\n"
+				"- avg available entries  : %'18.2lf\n"
+				"- total ready search     : %'15llu\n"
+				"- avg ready search len   : %'18.2lf\n"
+				"- avg ready search block : %'18.2lf\n"
+				"- total alloc search     : %'15llu\n"
+				"- avg alloc search len   : %'18.2lf\n"
+				"- avg alloc search block : %'18.2lf\n"
+				"- total wait calls       : %'15llu   (%'llu slow, %'llu fast)\n"
+				"- avg completion/wait    : %'18.2lf\n"
+			#endif
+			, stats->ready.pick.pop.success
+			, stats->ready.pick.push.success
+			, push_len, push_sur, stats->ready.pick.push.attempt
+			, pop_len , pop_sur , stats->ready.pick.pop .attempt
+			#if defined(HAVE_LINUX_IO_URING_H)
+				, submit_avg.cnt
+				, avgrdy
+				, avgcsm
+				, avgavl
+				, look_avg.cnt
+				, lavgv
+				, lavgb
+				, alloc_avg.cnt
+				, aavgv
+				, aavgb
+				, completed_avg.slow_cnt + completed_avg.fast_cnt
+				, completed_avg.slow_cnt,  completed_avg.fast_cnt
+				, ((double)completed_avg.val) / (completed_avg.slow_cnt + completed_avg.fast_cnt)
+			#endif
+		);
+	}
+#endif
Index: libcfa/src/concurrency/stats.hfa
===================================================================
--- libcfa/src/concurrency/stats.hfa	(revision 8834751449046f04c469e5691eabf7a075e5e70a)
+++ libcfa/src/concurrency/stats.hfa	(revision 8834751449046f04c469e5691eabf7a075e5e70a)
@@ -0,0 +1,79 @@
+#pragma once
+
+#include <stdint.h>
+
+#if defined(__CFA_NO_STATISTICS__)
+	struct __stats_t;
+	static inline void __init_stats( struct __stats_t * ) {}
+	static inline void __tally_stats( struct __stats_t *, struct __stats_t * ) {}
+	static inline void __print_stats( struct __stats_t * ) {}
+#else
+	struct __attribute__((aligned(64))) __stats_readQ_t {
+		struct {
+			// Push statistic
+			struct {
+				// number of attemps at pushing something
+				volatile uint64_t attempt;
+
+				// number of successes at pushing
+				volatile uint64_t success;
+			} push;
+
+			// Pop statistic
+			struct {
+				// number of reads of the mask
+				// picking an empty __cfa_readyQ_mask_t counts here
+				// but not as an attempt
+				volatile uint64_t probe;
+
+				// number of attemps at poping something
+				volatile uint64_t attempt;
+
+				// number of successes at poping
+				volatile uint64_t success;
+			} pop;
+		} pick;
+	};
+
+	#if defined(HAVE_LINUX_IO_URING_H)
+		struct __attribute__((aligned(64))) __stats_io_t{
+			struct {
+				struct {
+					volatile uint64_t rdy;
+					volatile uint64_t csm;
+					volatile uint64_t avl;
+					volatile uint64_t cnt;
+				} submit_avg;
+				struct {
+					volatile uint64_t val;
+					volatile uint64_t cnt;
+					volatile uint64_t block;
+				} look_avg;
+				struct {
+					volatile uint64_t val;
+					volatile uint64_t cnt;
+					volatile uint64_t block;
+				} alloc_avg;
+			} submit_q;
+			struct {
+				struct {
+					unsigned long long int val;
+					unsigned long long int slow_cnt;
+					unsigned long long int fast_cnt;
+				} completed_avg;
+			} complete_q;
+		};
+	#endif
+
+	struct __stats_t {
+		__stats_readQ_t ready;
+		#if defined(HAVE_LINUX_IO_URING_H)
+			__stats_io_t    io;
+		#endif
+	};
+
+	void __init_stats ( struct __stats_t * );
+	void __tally_stats( struct __stats_t *, struct __stats_t * );
+	void __print_stats( struct __stats_t * );
+#endif
+
