Index: libcfa/src/concurrency/ready_queue.cfa
===================================================================
--- libcfa/src/concurrency/ready_queue.cfa	(revision 9e2341b4e08bb9d51946f964b2a5490705dcc76e)
+++ libcfa/src/concurrency/ready_queue.cfa	(revision f55d54d8755e2f432d8d2b6071ba7c1f8432b2d8)
@@ -398,13 +398,13 @@
 
 		if(proc->rdq.target == -1u) {
-			proc->rdq.target = __tls_rand() % lanes.count;
-			unsigned it1  = proc->rdq.itr;
-			unsigned it2  = proc->rdq.itr + 1;
-			unsigned idx1 = proc->rdq.id + (it1 % READYQ_SHARD_FACTOR);
-			unsigned idx2 = proc->rdq.id + (it2 % READYQ_SHARD_FACTOR);
+			_Static_assert(READYQ_SHARD_FACTOR == 2);
+			unsigned idx1 = proc->rdq.id + 0;
+			unsigned idx2 = proc->rdq.id + 1;
 			unsigned long long tsc1 = ts(lanes.data[idx1]);
 			unsigned long long tsc2 = ts(lanes.data[idx2]);
-			proc->rdq.cutoff = min(tsc1, tsc2);
-			if(proc->rdq.cutoff == 0) proc->rdq.cutoff = -1ull;
+			proc->rdq.target = __tls_rand() % lanes.count;
+
+			// WARNING: min is polymorphic and therefore causes a 500% slowdown instead of the expected 2%, so the minimum is computed by hand below
+			proc->rdq.cutoff = tsc1 < tsc2 ? tsc1 : tsc2; 
 		}
 		else {
@@ -418,5 +418,5 @@
 
 		for(READYQ_SHARD_FACTOR) {
-			unsigned i = proc->rdq.id + (--proc->rdq.itr % READYQ_SHARD_FACTOR);
+			unsigned i = proc->rdq.id + (proc->rdq.itr++ % READYQ_SHARD_FACTOR);
 			if($thread * t = try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.local))) return t;
 		}
