Index: libcfa/src/Makefile.am
===================================================================
--- libcfa/src/Makefile.am	(revision b77f0e1fb94f6a4a2617000cc28b1371637e1fb8)
+++ libcfa/src/Makefile.am	(revision 63be33872128e89a0dc1fe9bdb8448d00934a453)
@@ -62,6 +62,5 @@
 	containers/array.hfa \
 	containers/list.hfa \
-	containers/queueLockFree.hfa \
-	containers/stackLockFree.hfa \
+	containers/lockfree.hfa \
 	containers/string_sharectx.hfa \
 	containers/vector2.hfa \
@@ -112,4 +111,5 @@
 	concurrency/invoke.h \
 	concurrency/future.hfa \
+	concurrency/once.hfa \
 	concurrency/kernel/fwd.hfa \
 	concurrency/mutex_stmt.hfa
@@ -127,4 +127,5 @@
 
 thread_libsrc = ${inst_thread_headers_src} ${inst_thread_headers_src:.hfa=.cfa} \
+	interpose_thread.cfa \
 	bits/signal.hfa \
 	concurrency/clib/cfathread.cfa \
@@ -145,5 +146,6 @@
 	concurrency/stats.cfa \
 	concurrency/stats.hfa \
-	concurrency/stats.hfa
+	concurrency/stats.hfa \
+	concurrency/pthread.cfa
 
 else
Index: libcfa/src/bits/containers.hfa
===================================================================
--- libcfa/src/bits/containers.hfa	(revision b77f0e1fb94f6a4a2617000cc28b1371637e1fb8)
+++ libcfa/src/bits/containers.hfa	(revision 63be33872128e89a0dc1fe9bdb8448d00934a453)
@@ -152,4 +152,5 @@
 
 		void append( __queue(T) & this, T * val ) with(this) {
+			verify(get_next( *val ) == 0p);
 			verify(this.tail != 0p);
 			verify(*this.tail == 1p);
Index: libcfa/src/bits/defs.hfa
===================================================================
--- libcfa/src/bits/defs.hfa	(revision b77f0e1fb94f6a4a2617000cc28b1371637e1fb8)
+++ libcfa/src/bits/defs.hfa	(revision 63be33872128e89a0dc1fe9bdb8448d00934a453)
@@ -30,8 +30,6 @@
 #ifdef __cforall
 #define __cfa_anonymous_object(x) inline struct x
-#define __cfa_dlink(x) inline dlink(x)
 #else
 #define __cfa_anonymous_object(x) struct x __cfa_anonymous_object
-#define __cfa_dlink(x) struct { struct x * next; struct x * back; } __dlink_substitute
 #endif
 
Index: libcfa/src/concurrency/clib/cfathread.cfa
===================================================================
--- libcfa/src/concurrency/clib/cfathread.cfa	(revision b77f0e1fb94f6a4a2617000cc28b1371637e1fb8)
+++ libcfa/src/concurrency/clib/cfathread.cfa	(revision 63be33872128e89a0dc1fe9bdb8448d00934a453)
@@ -172,9 +172,9 @@
 
 		pthread_attr_t attr;
-		if (int ret = pthread_attr_init(&attr); 0 != ret) {
+		if (int ret = __cfaabi_pthread_attr_init(&attr); 0 != ret) {
 			abort | "failed to create master epoll thread attr: " | ret | strerror(ret);
 		}
 
-		if (int ret = pthread_create(&master_poller, &attr, master_epoll, 0p); 0 != ret) {
+		if (int ret = __cfaabi_pthread_create(&master_poller, &attr, master_epoll, 0p); 0 != ret) {
 			abort | "failed to create master epoll thread: " | ret | strerror(ret);
 		}
Index: libcfa/src/concurrency/invoke.h
===================================================================
--- libcfa/src/concurrency/invoke.h	(revision b77f0e1fb94f6a4a2617000cc28b1371637e1fb8)
+++ libcfa/src/concurrency/invoke.h	(revision 63be33872128e89a0dc1fe9bdb8448d00934a453)
@@ -146,9 +146,20 @@
 
 	// Link lists fields
-	// instrusive link field for threads
+	// intrusive link field for threads in the ready-queue
 	struct __thread_desc_link {
 		struct thread$ * next;
 		volatile unsigned long long ts;
 	};
+
+	// Link lists fields
+	// intrusive link field for threads in the user_link/cltr_link
+	struct __thread_user_link {
+		#ifdef __cforall
+			inline dlink(thread$);
+		#else
+			struct thread$ * next; struct thread$ * back;
+		#endif
+	};
+	_Static_assert(sizeof(struct __thread_user_link) == 2 * sizeof(struct thread$ *), "__thread_user_link should be consistent in C and Cforall");
 
 	struct thread$ {
@@ -159,5 +170,5 @@
 		// Link lists fields
 		// instrusive link field for threads
-		struct __thread_desc_link link;
+		struct __thread_desc_link rdy_link;
 
 		// current execution status for coroutine
@@ -195,11 +206,11 @@
 		struct __monitor_group_t monitors;
 
-		// used to put threads on dlist data structure
-		__cfa_dlink(thread$);
-
-		struct {
-			struct thread$ * next;
-			struct thread$ * prev;
-		} node;
+		// intrusive link fields, used for locks, monitors and any user defined data structure
+		// default link fields for dlist
+		struct __thread_user_link user_link;
+
+		// secondary intrusive link fields, used for global cluster list
+		// non-default link fields for dlist, selected via struct __thread_user_link
+		struct __thread_user_link cltr_link;
 
 		// used to store state between clh lock/unlock
@@ -214,10 +225,9 @@
 
 		#if defined( __CFA_WITH_VERIFY__ )
+			struct processor * volatile executing;
 			void * canary;
 		#endif
 	};
-	#ifdef __cforall
-		P9_EMBEDDED( thread$, dlink(thread$) )
-	#endif
+
 	// Wrapper for gdb
 	struct cfathread_thread_t { struct thread$ debug; };
@@ -231,12 +241,26 @@
 	#ifdef __cforall
 	extern "Cforall" {
+		static inline thread$ * volatile & ?`next ( thread$ * this ) {
+			return this->user_link.next;
+		}
 
 		static inline thread$ *& get_next( thread$ & this ) __attribute__((const)) {
-			return this.link.next;
-		}
-
-		static inline [thread$ *&, thread$ *& ] __get( thread$ & this ) __attribute__((const)) {
-			return this.node.[next, prev];
-		}
+			return this.user_link.next;
+		}
+
+		static inline tytagref( dlink(thread$), dlink(thread$) ) ?`inner( thread$ & this ) {
+			dlink(thread$) & b = this.user_link;
+			tytagref( dlink(thread$), dlink(thread$) ) result = { b };
+			return result;
+		}
+
+		static inline tytagref(struct __thread_user_link, dlink(thread$)) ?`inner( struct thread$ & this ) {
+			struct __thread_user_link & ib = this.cltr_link;
+			dlink(thread$) & b = ib`inner;
+			tytagref(struct __thread_user_link, dlink(thread$)) result = { b };
+			return result;
+		}
+
+		P9_EMBEDDED(struct __thread_user_link, dlink(thread$))
 
 		static inline void ?{}(__monitor_group_t & this) {
Index: libcfa/src/concurrency/io.cfa
===================================================================
--- libcfa/src/concurrency/io.cfa	(revision b77f0e1fb94f6a4a2617000cc28b1371637e1fb8)
+++ libcfa/src/concurrency/io.cfa	(revision 63be33872128e89a0dc1fe9bdb8448d00934a453)
@@ -610,5 +610,5 @@
 		if( we ) {
 			sigval_t value = { PREEMPT_IO };
-			pthread_sigqueue(ctx->proc->kernel_thread, SIGUSR1, value);
+			__cfaabi_pthread_sigqueue(ctx->proc->kernel_thread, SIGUSR1, value);
 		}
 
@@ -639,91 +639,3 @@
 		}
 	}
-
-	#if defined(CFA_WITH_IO_URING_IDLE)
-		bool __kernel_read(struct processor * proc, io_future_t & future, iovec & iov, int fd) {
-			io_context$ * ctx = proc->io.ctx;
-			/* paranoid */ verify( ! __preemption_enabled() );
-			/* paranoid */ verify( proc == __cfaabi_tls.this_processor );
-			/* paranoid */ verify( ctx );
-
-			__u32 idx;
-			struct io_uring_sqe * sqe;
-
-			// We can proceed to the fast path
-			if( !__alloc(ctx, &idx, 1) ) {
-				/* paranoid */ verify( false ); // for now check if this happens, next time just abort the sleep.
-				return false;
-			}
-
-			// Allocation was successful
-			__fill( &sqe, 1, &idx, ctx );
-
-			sqe->user_data = (uintptr_t)&future;
-			sqe->flags = 0;
-			sqe->fd = fd;
-			sqe->off = 0;
-			sqe->ioprio = 0;
-			sqe->fsync_flags = 0;
-			sqe->__pad2[0] = 0;
-			sqe->__pad2[1] = 0;
-			sqe->__pad2[2] = 0;
-
-			#if defined(CFA_HAVE_IORING_OP_READ)
-				sqe->opcode = IORING_OP_READ;
-				sqe->addr = (uint64_t)iov.iov_base;
-				sqe->len = iov.iov_len;
-			#elif defined(CFA_HAVE_READV) && defined(CFA_HAVE_IORING_OP_READV)
-				sqe->opcode = IORING_OP_READV;
-				sqe->addr = (uintptr_t)&iov;
-				sqe->len = 1;
-			#else
-				#error CFA_WITH_IO_URING_IDLE but none of CFA_HAVE_READV, CFA_HAVE_IORING_OP_READV or CFA_HAVE_IORING_OP_READ defined
-			#endif
-
-			asm volatile("": : :"memory");
-
-			/* paranoid */ verify( sqe->user_data == (uintptr_t)&future );
-			__submit_only( ctx, &idx, 1 );
-
-			/* paranoid */ verify( proc == __cfaabi_tls.this_processor );
-			/* paranoid */ verify( ! __preemption_enabled() );
-
-			return true;
-		}
-
-		void __cfa_io_idle( struct processor * proc ) {
-			iovec iov;
-			__atomic_acquire( &proc->io.ctx->cq.lock );
-
-			__attribute__((used)) volatile bool was_reset = false;
-
-			with( proc->idle_wctx) {
-
-				// Do we already have a pending read
-				if(available(*ftr)) {
-					// There is no pending read, we need to add one
-					reset(*ftr);
-
-					iov.iov_base = rdbuf;
-					iov.iov_len  = sizeof(eventfd_t);
-					__kernel_read(proc, *ftr, iov, evfd );
-					ftr->result = 0xDEADDEAD;
-					*((eventfd_t *)rdbuf) = 0xDEADDEADDEADDEAD;
-					was_reset = true;
-				}
-			}
-
-			if( !__atomic_load_n( &proc->do_terminate, __ATOMIC_SEQ_CST ) ) {
-				__ioarbiter_flush( *proc->io.ctx );
-				proc->idle_wctx.sleep_time = rdtscl();
-				ioring_syscsll( *proc->io.ctx, 1, IORING_ENTER_GETEVENTS);
-			}
-
-			ready_schedule_lock();
-			__cfa_do_drain( proc->io.ctx, proc->cltr );
-			ready_schedule_unlock();
-
-			asm volatile ("" :: "m" (was_reset));
-		}
-	#endif
 #endif
Index: libcfa/src/concurrency/io/setup.cfa
===================================================================
--- libcfa/src/concurrency/io/setup.cfa	(revision b77f0e1fb94f6a4a2617000cc28b1371637e1fb8)
+++ libcfa/src/concurrency/io/setup.cfa	(revision 63be33872128e89a0dc1fe9bdb8448d00934a453)
@@ -34,5 +34,4 @@
 	bool __cfa_io_flush( processor * proc ) { return false; }
 	bool __cfa_io_drain( processor * proc ) __attribute__((nonnull (1))) { return false; }
-	void __cfa_io_idle ( processor * ) __attribute__((nonnull (1))) {}
 	void __cfa_io_stop ( processor * proc ) {}
 
@@ -317,45 +316,4 @@
 	}
 
-//=============================================================================================
-// I/O Context Sleep
-//=============================================================================================
-	// static inline void __epoll_ctl(io_context$ & ctx, int op, const char * error) {
-	// 	struct epoll_event ev;
-	// 	ev.events = EPOLLIN | EPOLLONESHOT;
-	// 	ev.data.u64 = (__u64)&ctx;
-	// 	int ret = epoll_ctl(iopoll.epollfd, op, ctx.efd, &ev);
-	// 	if (ret < 0) {
-	// 		abort( "KERNEL ERROR: EPOLL %s - (%d) %s\n", error, (int)errno, strerror(errno) );
-	// 	}
-	// }
-
-	// static void __epoll_register(io_context$ & ctx) {
-	// 	__epoll_ctl(ctx, EPOLL_CTL_ADD, "ADD");
-	// }
-
-	// static void __epoll_unregister(io_context$ & ctx) {
-	// 	// Read the current epoch so we know when to stop
-	// 	size_t curr = __atomic_load_n(&iopoll.epoch, __ATOMIC_SEQ_CST);
-
-	// 	// Remove the fd from the iopoller
-	// 	__epoll_ctl(ctx, EPOLL_CTL_DEL, "REMOVE");
-
-	// 	// Notify the io poller thread of the shutdown
-	// 	iopoll.run = false;
-	// 	sigval val = { 1 };
-	// 	pthread_sigqueue( iopoll.thrd, SIGUSR1, val );
-
-	// 	// Make sure all this is done
-	// 	__atomic_thread_fence(__ATOMIC_SEQ_CST);
-
-	// 	// Wait for the next epoch
-	// 	while(curr == iopoll.epoch && !iopoll.stopped) Pause();
-	// }
-
-	// void __ioctx_prepare_block(io_context$ & ctx) {
-	// 	__cfadbg_print_safe(io_core, "Kernel I/O - epoll : Re-arming io poller %d (%p)\n", ctx.fd, &ctx);
-	// 	__epoll_ctl(ctx, EPOLL_CTL_MOD, "REARM");
-	// }
-
 
 //=============================================================================================
Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision b77f0e1fb94f6a4a2617000cc28b1371637e1fb8)
+++ libcfa/src/concurrency/kernel.cfa	(revision 63be33872128e89a0dc1fe9bdb8448d00934a453)
@@ -138,9 +138,5 @@
 extern bool __cfa_io_drain( processor * proc ) __attribute__((nonnull (1)));
 extern bool __cfa_io_flush( processor * ) __attribute__((nonnull (1)));
-extern void __cfa_io_idle( processor * ) __attribute__((nonnull (1)));
-
-#if defined(CFA_WITH_IO_URING_IDLE)
-	extern bool __kernel_read(processor * proc, io_future_t & future, iovec &, int fd);
-#endif
+
 
 extern void __disable_interrupts_hard();
@@ -162,11 +158,4 @@
 	verify(this);
 
-	/* paranoid */ verify( this->idle_wctx.ftr   != 0p );
-	/* paranoid */ verify( this->idle_wctx.rdbuf != 0p );
-
-	// used for idle sleep when io_uring is present
-	// mark it as already fulfilled so we know if there is a pending request or not
-	this->idle_wctx.ftr->self.ptr = 1p;
-
 	__cfadbg_print_safe(runtime_core, "Kernel : core %p starting\n", this);
 	#if !defined(__CFA_NO_STATISTICS__)
@@ -291,5 +280,5 @@
 	/* paranoid */ verify( ! __preemption_enabled() );
 	/* paranoid */ verifyf( thrd_dst->state == Ready || thrd_dst->preempted != __NO_PREEMPTION, "state : %d, preempted %d\n", thrd_dst->state, thrd_dst->preempted);
-	/* paranoid */ verifyf( thrd_dst->link.next == 0p, "Expected null got %p", thrd_dst->link.next );
+	/* paranoid */ verifyf( thrd_dst->rdy_link.next == 0p, "Expected null got %p", thrd_dst->rdy_link.next );
 	__builtin_prefetch( thrd_dst->context.SP );
 
@@ -321,4 +310,5 @@
 		/* paranoid */ verifyf( ((uintptr_t)thrd_dst->context.SP) < ((uintptr_t)__get_stack(thrd_dst->curr_cor)->base ) || thrd_dst->curr_cor == proc_cor || thrd_dst->corctx_flag, "ERROR : Destination thread$ %p has been corrupted.\n StackPointer too small.\n", thrd_dst ); // add escape condition if we are setting up the processor
 		/* paranoid */ verifyf( ((uintptr_t)thrd_dst->context.SP) > ((uintptr_t)__get_stack(thrd_dst->curr_cor)->limit) || thrd_dst->curr_cor == proc_cor || thrd_dst->corctx_flag, "ERROR : Destination thread$ %p has been corrupted.\n StackPointer too large.\n", thrd_dst ); // add escape condition if we are setting up the processor
+		/* paranoid */ verify( __atomic_exchange_n( &thrd_dst->executing, this, __ATOMIC_SEQ_CST) == 0p );
 		/* paranoid */ verify( 0x0D15EA5E0D15EA5Ep == thrd_dst->canary );
 
@@ -332,8 +322,9 @@
 
 		/* paranoid */ verify( 0x0D15EA5E0D15EA5Ep == thrd_dst->canary );
+		/* paranoid */ verify( __atomic_exchange_n( &thrd_dst->executing, 0p, __ATOMIC_SEQ_CST) == this );
 		/* paranoid */ verifyf( ((uintptr_t)thrd_dst->context.SP) > ((uintptr_t)__get_stack(thrd_dst->curr_cor)->limit) || thrd_dst->corctx_flag, "ERROR : Destination thread$ %p has been corrupted.\n StackPointer too large.\n", thrd_dst );
 		/* paranoid */ verifyf( ((uintptr_t)thrd_dst->context.SP) < ((uintptr_t)__get_stack(thrd_dst->curr_cor)->base ) || thrd_dst->corctx_flag, "ERROR : Destination thread$ %p has been corrupted.\n StackPointer too small.\n", thrd_dst );
+		/* paranoid */ verify( thrd_dst->state != Halted );
 		/* paranoid */ verify( thrd_dst->context.SP );
-		/* paranoid */ verify( thrd_dst->curr_cluster == this->cltr );
 		/* paranoid */ verify( kernelTLS().this_thread == thrd_dst );
 		/* paranoid */ verify( ! __preemption_enabled() );
@@ -452,5 +443,5 @@
 					"Error preempted thread marked as not currently running, state %d, preemption %d\n", thrd->state, thrd->preempted );
 	/* paranoid */ #endif
-	/* paranoid */ verifyf( thrd->link.next == 0p, "Expected null got %p", thrd->link.next );
+	/* paranoid */ verifyf( thrd->rdy_link.next == 0p, "Expected null got %p", thrd->rdy_link.next );
 	/* paranoid */ verify( 0x0D15EA5E0D15EA5Ep == thrd->canary );
 
@@ -600,8 +591,10 @@
 		/* paranoid */ verifyf( ((uintptr_t)thrd->context.SP) < ((uintptr_t)__get_stack(thrd->curr_cor)->base ), "ERROR : thread$ %p has been corrupted.\n StackPointer too small.\n", thrd );
 
-		thrd->state = Halting;
 		if( TICKET_RUNNING != thrd->ticket ) { abort( "Thread terminated with pending unpark" ); }
 		if( thrd != this->owner ) { abort( "Thread internal monitor has incorrect owner" ); }
 		if( this->recursion != 1) { abort( "Thread internal monitor has unbalanced recursion" ); }
+
+		thrd->state = Halting;
+		thrd->ticket = TICKET_DEAD;
 
 		// Leave the thread
@@ -624,5 +617,5 @@
 		// If that is the case, abandon the preemption.
 		bool preempted = false;
-		if(thrd->link.next == 0p) {
+		if(thrd->rdy_link.next == 0p) {
 			preempted = true;
 			thrd->preempted = reason;
@@ -726,38 +719,34 @@
 
 
-	#if !defined(CFA_WITH_IO_URING_IDLE)
-		#if !defined(__CFA_NO_STATISTICS__)
-			if(this->print_halts) {
-				__cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 0\n", this->unique_id, rdtscl());
+	#if !defined(__CFA_NO_STATISTICS__)
+		if(this->print_halts) {
+			__cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 0\n", this->unique_id, rdtscl());
+		}
+	#endif
+
+	__cfadbg_print_safe(runtime_core, "Kernel : core %p waiting on eventfd %d\n", this, this->idle_fd);
+
+	{
+		eventfd_t val;
+		ssize_t ret = read( this->idle_wctx.evfd, &val, sizeof(val) );
+		if(ret < 0) {
+			switch((int)errno) {
+			case EAGAIN:
+			#if EAGAIN != EWOULDBLOCK
+				case EWOULDBLOCK:
+			#endif
+			case EINTR:
+				// No need to do anything special here, just assume it's a legitimate wake-up
+				break;
+			default:
+				abort( "KERNEL : internal error, read failure on idle eventfd, error(%d) %s.", (int)errno, strerror( (int)errno ) );
 			}
-		#endif
-
-		__cfadbg_print_safe(runtime_core, "Kernel : core %p waiting on eventfd %d\n", this, this->idle_fd);
-
-		{
-			eventfd_t val;
-			ssize_t ret = read( this->idle_wctx.evfd, &val, sizeof(val) );
-			if(ret < 0) {
-				switch((int)errno) {
-				case EAGAIN:
-				#if EAGAIN != EWOULDBLOCK
-					case EWOULDBLOCK:
-				#endif
-				case EINTR:
-					// No need to do anything special here, just assume it's a legitimate wake-up
-					break;
-				default:
-					abort( "KERNEL : internal error, read failure on idle eventfd, error(%d) %s.", (int)errno, strerror( (int)errno ) );
-				}
-			}
-		}
-
-		#if !defined(__CFA_NO_STATISTICS__)
-			if(this->print_halts) {
-				__cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 1\n", this->unique_id, rdtscl());
-			}
-		#endif
-	#else
-		__cfa_io_idle( this );
+		}
+	}
+
+	#if !defined(__CFA_NO_STATISTICS__)
+		if(this->print_halts) {
+			__cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 1\n", this->unique_id, rdtscl());
+		}
 	#endif
 }
@@ -775,4 +764,5 @@
 		insert_first(this.idles, proc);
 
+		// update the pointer to the head wait context, which should now point to this proc.
 		__atomic_store_n(&this.fdw, &proc.idle_wctx, __ATOMIC_SEQ_CST);
 	unlock( this );
@@ -791,4 +781,5 @@
 
 		{
+			// update the pointer to the head wait context
 			struct __fd_waitctx * wctx = 0;
 			if(!this.idles`isEmpty) wctx = &this.idles`first.idle_wctx;
Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision b77f0e1fb94f6a4a2617000cc28b1371637e1fb8)
+++ libcfa/src/concurrency/kernel.hfa	(revision 63be33872128e89a0dc1fe9bdb8448d00934a453)
@@ -64,4 +64,5 @@
 	// 1 - means the proc should wake-up immediately
 	// FD - means the proc is going asleep and should be woken by writing to the FD.
+	//      The FD value should always be the evfd field just below.
 	volatile int sem;
 
@@ -69,12 +70,5 @@
 	int evfd;
 
-	// buffer into which the proc will read from evfd
-	// unused if not using io_uring for idle sleep
-	void * rdbuf;
-
-	// future use to track the read of the eventfd
-	// unused if not using io_uring for idle sleep
-	io_future_t * ftr;
-
+	// Used for debugging, should be removed eventually.
 	volatile unsigned long long wake__time;
 	volatile unsigned long long sleep_time;
@@ -160,7 +154,7 @@
 // P9_EMBEDDED( processor, dlink(processor) )
 static inline tytagref( dlink(processor), dlink(processor) ) ?`inner( processor & this ) {
-    dlink(processor) & b = this.link;
-    tytagref( dlink(processor), dlink(processor) ) result = { b };
-    return result;
+	dlink(processor) & b = this.link;
+	tytagref( dlink(processor), dlink(processor) ) result = { b };
+	return result;
 }
 
@@ -256,5 +250,5 @@
 	// List of threads
 	__spinlock_t thread_list_lock;
-	__dllist_t(struct thread$) threads;
+	dlist(struct thread$, struct __thread_user_link) threads;
 	unsigned int nthreads;
 
@@ -269,4 +263,9 @@
 		io_context_params params;
 	} io;
+
+	struct {
+		struct processor ** procs;
+		unsigned cnt;
+	} managed;
 
 	#if !defined(__CFA_NO_STATISTICS__)
@@ -298,4 +297,8 @@
 static inline struct cluster   * active_cluster  () { return publicTLS_get( this_processor )->cltr; }
 
+// set the number of internal processors
+// these processors are in addition to any explicitly declared processors
+unsigned set_concurrency( cluster & this, unsigned new_count );
+
 #if !defined(__CFA_NO_STATISTICS__)
 	void print_stats_now( cluster & this, int flags );
Index: libcfa/src/concurrency/kernel/cluster.cfa
===================================================================
--- libcfa/src/concurrency/kernel/cluster.cfa	(revision b77f0e1fb94f6a4a2617000cc28b1371637e1fb8)
+++ libcfa/src/concurrency/kernel/cluster.cfa	(revision 63be33872128e89a0dc1fe9bdb8448d00934a453)
@@ -483,11 +483,11 @@
 
 	// We add a boat-load of assertions here because the anchor code is very fragile
-	/* paranoid */ _Static_assert( offsetof( thread$, link ) == nested_offsetof(__intrusive_lane_t, l.anchor) );
-	/* paranoid */ verify( offsetof( thread$, link ) == nested_offsetof(__intrusive_lane_t, l.anchor) );
-	/* paranoid */ verify( ((uintptr_t)( mock_head(this) ) + offsetof( thread$, link )) == (uintptr_t)(&this.l.anchor) );
-	/* paranoid */ verify( &mock_head(this)->link.next == &this.l.anchor.next );
-	/* paranoid */ verify( &mock_head(this)->link.ts   == &this.l.anchor.ts   );
-	/* paranoid */ verify( mock_head(this)->link.next == 0p );
-	/* paranoid */ verify( mock_head(this)->link.ts   == MAX );
+	/* paranoid */ _Static_assert( offsetof( thread$, rdy_link ) == nested_offsetof(__intrusive_lane_t, l.anchor) );
+	/* paranoid */ verify( offsetof( thread$, rdy_link ) == nested_offsetof(__intrusive_lane_t, l.anchor) );
+	/* paranoid */ verify( ((uintptr_t)( mock_head(this) ) + offsetof( thread$, rdy_link )) == (uintptr_t)(&this.l.anchor) );
+	/* paranoid */ verify( &mock_head(this)->rdy_link.next == &this.l.anchor.next );
+	/* paranoid */ verify( &mock_head(this)->rdy_link.ts   == &this.l.anchor.ts   );
+	/* paranoid */ verify( mock_head(this)->rdy_link.next == 0p );
+	/* paranoid */ verify( mock_head(this)->rdy_link.ts   == MAX );
 	/* paranoid */ verify( mock_head(this) == this.l.prev );
 	/* paranoid */ verify( __alignof__(__intrusive_lane_t) == 64 );
Index: libcfa/src/concurrency/kernel/private.hfa
===================================================================
--- libcfa/src/concurrency/kernel/private.hfa	(revision b77f0e1fb94f6a4a2617000cc28b1371637e1fb8)
+++ libcfa/src/concurrency/kernel/private.hfa	(revision 63be33872128e89a0dc1fe9bdb8448d00934a453)
@@ -20,4 +20,6 @@
 #endif
 
+#include <signal.h>
+
 #include "kernel.hfa"
 #include "thread.hfa"
@@ -39,14 +41,4 @@
 }
 
-// Defines whether or not we *want* to use io_uring_enter as the idle_sleep blocking call
-// #define CFA_WANT_IO_URING_IDLE
-
-// Defines whether or not we *can* use io_uring_enter as the idle_sleep blocking call
-#if defined(CFA_WANT_IO_URING_IDLE) && defined(CFA_HAVE_LINUX_IO_URING_H)
-	#if defined(CFA_HAVE_IORING_OP_READ) || (defined(CFA_HAVE_READV) && defined(CFA_HAVE_IORING_OP_READV))
-		#define CFA_WITH_IO_URING_IDLE
-	#endif
-#endif
-
 // #define READYQ_USE_LINEAR_AVG
 #define READYQ_USE_LOGDBL_AVG
@@ -63,4 +55,16 @@
 #endif
 
+extern "C" {
+	__attribute__((visibility("protected"))) int __cfaabi_pthread_create(pthread_t *_thread, const pthread_attr_t *attr, void *(*start_routine) (void *), void *arg);
+	__attribute__((visibility("protected"))) int __cfaabi_pthread_join(pthread_t _thread, void **retval);
+	__attribute__((visibility("protected"))) pthread_t __cfaabi_pthread_self(void);
+	__attribute__((visibility("protected"))) int __cfaabi_pthread_attr_init(pthread_attr_t *attr);
+	__attribute__((visibility("protected"))) int __cfaabi_pthread_attr_destroy(pthread_attr_t *attr);
+	__attribute__((visibility("protected"))) int __cfaabi_pthread_attr_setstack( pthread_attr_t *attr, void *stackaddr, size_t stacksize );
+	__attribute__((visibility("protected"))) int __cfaabi_pthread_attr_getstacksize( const pthread_attr_t *attr, size_t *stacksize );
+	__attribute__((visibility("protected"))) int __cfaabi_pthread_sigqueue(pthread_t _thread, int sig, const union sigval value);
+	__attribute__((visibility("protected"))) int __cfaabi_pthread_sigmask( int how, const sigset_t *set, sigset_t *oset);
+}
+
 //-----------------------------------------------------------------------------
 // Scheduler
@@ -153,4 +157,5 @@
 #define TICKET_RUNNING ( 0) // thread is running
 #define TICKET_UNBLOCK ( 1) // thread should ignore next block
+#define TICKET_DEAD    (0xDEAD) // thread should never be unparked
 
 //-----------------------------------------------------------------------------
Index: libcfa/src/concurrency/kernel/startup.cfa
===================================================================
--- libcfa/src/concurrency/kernel/startup.cfa	(revision b77f0e1fb94f6a4a2617000cc28b1371637e1fb8)
+++ libcfa/src/concurrency/kernel/startup.cfa	(revision 63be33872128e89a0dc1fe9bdb8448d00934a453)
@@ -16,4 +16,6 @@
 #define __cforall_thread__
 #define _GNU_SOURCE
+
+// #define __CFA_DEBUG_PRINT_RUNTIME_CORE__
 
 // C Includes
@@ -113,7 +115,4 @@
 KERNEL_STORAGE(thread$,	             mainThread);
 KERNEL_STORAGE(__stack_t,            mainThreadCtx);
-// KERNEL_STORAGE(__scheduler_RWLock_t, __scheduler_lock);
-KERNEL_STORAGE(eventfd_t,            mainIdleEventFd);
-KERNEL_STORAGE(io_future_t,          mainIdleFuture);
 #if !defined(__CFA_NO_STATISTICS__)
 KERNEL_STORAGE(__stats_t, mainProcStats);
@@ -222,5 +221,5 @@
 		( this.runner ){};
 		init( this, "Main Processor", *mainCluster, 0p );
-		kernel_thread = pthread_self();
+		kernel_thread = __cfaabi_pthread_self();
 
 		runner{ &this };
@@ -232,8 +231,4 @@
 	mainProcessor = (processor *)&storage_mainProcessor;
 	(*mainProcessor){};
-
-	mainProcessor->idle_wctx.rdbuf = &storage_mainIdleEventFd;
-	mainProcessor->idle_wctx.ftr   = (io_future_t*)&storage_mainIdleFuture;
-	/* paranoid */ verify( sizeof(storage_mainIdleEventFd) == sizeof(eventfd_t) );
 
 	__cfa_io_start( mainProcessor );
@@ -283,6 +278,15 @@
 }
 
+extern "C"{
+	void pthread_delete_kernel_threads_();
+}
+
+
 static void __kernel_shutdown(void) {
 	if(!cfa_main_returned) return;
+
+	// delete kernel threads for pthread_concurrency
+	pthread_delete_kernel_threads_();
+
 	/* paranoid */ verify( __preemption_enabled() );
 	disable_interrupts();
@@ -327,5 +331,5 @@
 
 		/* paranoid */ verify( this.do_terminate == true );
-		__cfaabi_dbg_print_safe("Kernel : destroyed main processor context %p\n", &runner);
+		__cfadbg_print_safe(runtime_core, "Kernel : destroyed main processor context %p\n", &runner);
 	}
 
@@ -373,11 +377,4 @@
 	register_tls( proc );
 
-	// used for idle sleep when io_uring is present
-	io_future_t future;
-	eventfd_t idle_buf;
-	proc->idle_wctx.ftr = &future;
-	proc->idle_wctx.rdbuf = &idle_buf;
-
-
 	// SKULLDUGGERY: We want to create a context for the processor coroutine
 	// which is needed for the 2-step context switch. However, there is no reason
@@ -388,5 +385,5 @@
 	(proc->runner){ proc, &info };
 
-	__cfaabi_dbg_print_safe("Coroutine : created stack %p\n", get_coroutine(proc->runner)->stack.storage);
+	__cfadbg_print_safe(runtime_core, "Coroutine : created stack %p\n", get_coroutine(proc->runner)->stack.storage);
 
 	//Set global state
@@ -514,15 +511,14 @@
 	self_mon.recursion = 1;
 	self_mon_p = &self_mon;
-	link.next = 0p;
-	link.ts   = MAX;
+	rdy_link.next = 0p;
+	rdy_link.ts   = MAX;
 	preferred = ready_queue_new_preferred();
 	last_proc = 0p;
 	random_state = __global_random_mask ? __global_random_prime : __global_random_prime ^ rdtscl();
 	#if defined( __CFA_WITH_VERIFY__ )
+		executing = 0p;
 		canary = 0x0D15EA5E0D15EA5Ep;
 	#endif
 
-	node.next = 0p;
-	node.prev = 0p;
 	doregister(curr_cluster, this);
 
@@ -647,8 +643,11 @@
 	#endif
 
-	threads{ __get };
+	threads{};
 
 	io.arbiter = create();
 	io.params = io_params;
+
+	managed.procs = 0p;
+	managed.cnt = 0;
 
 	doregister(this);
@@ -667,4 +666,6 @@
 
 void ^?{}(cluster & this) libcfa_public {
+	set_concurrency( this, 0 );
+
 	destroy(this.io.arbiter);
 
@@ -722,5 +723,5 @@
 	lock      (cltr->thread_list_lock __cfaabi_dbg_ctx2);
 	cltr->nthreads += 1;
-	push_front(cltr->threads, thrd);
+	insert_first(cltr->threads, thrd);
 	unlock    (cltr->thread_list_lock);
 }
@@ -728,6 +729,10 @@
 void unregister( cluster * cltr, thread$ & thrd ) {
 	lock  (cltr->thread_list_lock __cfaabi_dbg_ctx2);
-	remove(cltr->threads, thrd );
-	cltr->nthreads -= 1;
+	{
+		tytagref( dlink(thread$), dlink(thread$) ) ?`inner( thread$ & this ) = void;
+		with( DLINK_VIA( thread$, struct __thread_user_link ) )
+			remove( thrd );
+		cltr->nthreads -= 1;
+	}
 	unlock(cltr->thread_list_lock);
 }
@@ -777,5 +782,5 @@
 	pthread_attr_t attr;
 
-	check( pthread_attr_init( &attr ), "pthread_attr_init" ); // initialize attribute
+	check( __cfaabi_pthread_attr_init( &attr ), "pthread_attr_init" ); // initialize attribute
 
 	size_t stacksize = max( PTHREAD_STACK_MIN, DEFAULT_STACK_SIZE );
@@ -804,11 +809,11 @@
 	#endif
 
-	check( pthread_attr_setstack( &attr, stack, stacksize ), "pthread_attr_setstack" );
-	check( pthread_create( pthread, &attr, start, arg ), "pthread_create" );
+	check( __cfaabi_pthread_attr_setstack( &attr, stack, stacksize ), "pthread_attr_setstack" );
+	check( __cfaabi_pthread_create( pthread, &attr, start, arg ), "pthread_create" );
 	return stack;
 }
 
 void __destroy_pthread( pthread_t pthread, void * stack, void ** retval ) {
-	int err = pthread_join( pthread, retval );
+	int err = __cfaabi_pthread_join( pthread, retval );
 	if( err != 0 ) abort("KERNEL ERROR: joining pthread %p caused error %s\n", (void*)pthread, strerror(err));
 
@@ -816,9 +821,9 @@
 		pthread_attr_t attr;
 
-		check( pthread_attr_init( &attr ), "pthread_attr_init" ); // initialize attribute
+		check( __cfaabi_pthread_attr_init( &attr ), "pthread_attr_init" ); // initialize attribute
 
 		size_t stacksize;
 		// default stack size, normally defined by shell limit
-		check( pthread_attr_getstacksize( &attr, &stacksize ), "pthread_attr_getstacksize" );
+		check( __cfaabi_pthread_attr_getstacksize( &attr, &stacksize ), "pthread_attr_getstacksize" );
 		assert( stacksize >= PTHREAD_STACK_MIN );
 		stacksize += __page_size;
@@ -838,4 +843,29 @@
 }
 
+unsigned set_concurrency( cluster & this, unsigned new ) libcfa_public {
+	unsigned old = this.managed.cnt;
+
+	__cfadbg_print_safe(runtime_core, "Kernel : resizing cluster from %u to %u\n", old, (unsigned)new);
+
+	// Delete all the old unneeded procs
+	if(old > new) for(i; (unsigned)new ~ old) {
+		__cfadbg_print_safe(runtime_core, "Kernel : destroying %u\n", i);
+		delete( this.managed.procs[i] );
+	}
+
+	// Allocate new array (uses realloc and memcpies the data)
+	this.managed.procs = alloc( new, this.managed.procs`realloc );
+	this.managed.cnt = new;
+
+	// Create the desired new procs
+	if(old < new) for(i; old ~ new) {
+		__cfadbg_print_safe(runtime_core, "Kernel : constructing %u\n", i);
+		(*(this.managed.procs[i] = alloc())){ this };
+	}
+
+	// return the old count
+	return old;
+}
+
 #if defined(__CFA_WITH_VERIFY__)
 static bool verify_fwd_bck_rng(void) {
Index: libcfa/src/concurrency/locks.hfa
===================================================================
--- libcfa/src/concurrency/locks.hfa	(revision b77f0e1fb94f6a4a2617000cc28b1371637e1fb8)
+++ libcfa/src/concurrency/locks.hfa	(revision 63be33872128e89a0dc1fe9bdb8448d00934a453)
@@ -21,5 +21,5 @@
 
 #include "bits/weakso_locks.hfa"
-#include "containers/queueLockFree.hfa"
+#include "containers/lockfree.hfa"
 #include "containers/list.hfa"
 
@@ -498,5 +498,5 @@
 }
 
-static inline size_t on_wait(simple_owner_lock & this) with(this) { 
+static inline size_t on_wait(simple_owner_lock & this) with(this) {
 	lock( lock __cfaabi_dbg_ctx2 );
 	/* paranoid */ verifyf( owner != 0p, "Attempt to release lock %p that isn't held", &this );
Index: libcfa/src/concurrency/monitor.cfa
===================================================================
--- libcfa/src/concurrency/monitor.cfa	(revision b77f0e1fb94f6a4a2617000cc28b1371637e1fb8)
+++ libcfa/src/concurrency/monitor.cfa	(revision 63be33872128e89a0dc1fe9bdb8448d00934a453)
@@ -122,7 +122,7 @@
 
 		// Some one else has the monitor, wait in line for it
-		/* paranoid */ verify( thrd->link.next == 0p );
+		/* paranoid */ verify( thrd->user_link.next == 0p );
 		append( this->entry_queue, thrd );
-		/* paranoid */ verify( thrd->link.next == 1p );
+		/* paranoid */ verify( thrd->user_link.next == 1p );
 
 		unlock( this->lock );
@@ -233,7 +233,7 @@
 
 		// Some one else has the monitor, wait in line for it
-		/* paranoid */ verify( thrd->link.next == 0p );
+		/* paranoid */ verify( thrd->user_link.next == 0p );
 		append( this->entry_queue, thrd );
-		/* paranoid */ verify( thrd->link.next == 1p );
+		/* paranoid */ verify( thrd->user_link.next == 1p );
 		unlock( this->lock );
 
@@ -791,5 +791,5 @@
 	thread$ * new_owner = pop_head( this->entry_queue );
 	/* paranoid */ verifyf( !this->owner || active_thread() == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this );
-	/* paranoid */ verify( !new_owner || new_owner->link.next == 0p );
+	/* paranoid */ verify( !new_owner || new_owner->user_link.next == 0p );
 	__set_owner( this, new_owner );
 
@@ -935,9 +935,17 @@
 	__queue_t(thread$) & entry_queue = monitors[0]->entry_queue;
 
+	#if defined( __CFA_WITH_VERIFY__ )
+		thread$ * last = 0p;
+	#endif
 	// For each thread in the entry-queue
 	for(	thread$ ** thrd_it = &entry_queue.head;
 		(*thrd_it) != 1p;
-		thrd_it = &(*thrd_it)->link.next
+		thrd_it = &get_next(**thrd_it)
 	) {
+		thread$ * curr = *thrd_it;
+
+		/* paranoid */ verifyf( !last || last->user_link.next == curr, "search not making progress, from %p (%p) to %p", last, last->user_link.next, curr );
+		/* paranoid */ verifyf( curr != last, "search not making progress, from %p to %p", last, curr );
+
 		// For each acceptable check if it matches
 		int i = 0;
@@ -946,5 +954,5 @@
 		for( __acceptable_t * it = begin; it != end; it++, i++ ) {
 			// Check if we have a match
-			if( *it == (*thrd_it)->monitors ) {
+			if( *it == curr->monitors ) {
 
 				// If we have a match return it
@@ -953,4 +961,8 @@
 			}
 		}
+
+		#if defined( __CFA_WITH_VERIFY__ )
+			last = curr;
+		#endif
 	}
 
@@ -1025,7 +1037,7 @@
 
 		// Some one else has the monitor, wait in line for it
-		/* paranoid */ verify( thrd->link.next == 0p );
+		/* paranoid */ verify( thrd->user_link.next == 0p );
 		append( this->entry_queue, thrd );
-		/* paranoid */ verify( thrd->link.next == 1p );
+		/* paranoid */ verify( thrd->user_link.next == 1p );
 
 		unlock( this->lock );
Index: libcfa/src/concurrency/once.hfa
===================================================================
--- libcfa/src/concurrency/once.hfa	(revision 63be33872128e89a0dc1fe9bdb8448d00934a453)
+++ libcfa/src/concurrency/once.hfa	(revision 63be33872128e89a0dc1fe9bdb8448d00934a453)
@@ -0,0 +1,106 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// once.hfa -- Algorithms to prevent concurrent calls to cause duplicate calls
+//
+// Author           : Thierry Delisle
+// Created On       : Thu Oct 11:40:47 2022
+// Last Modified By :
+// Last Modified On :
+// Update Count     :
+//
+
+#pragma once
+
+#include "containers/lockfree.hfa"
+#include "kernel/fwd.hfa"
+
+enum once_state {
+	ARMED = 0,
+	IN_PROGRESS,
+	READY
+};
+
+struct once_flag {
+	volatile int state;	// holds a once_state value; the constructor sets it to ARMED
+	poison_list( thread$ ) waiters;	// threads parked by once_wait$; poisoned by once_call$ after func has run
+};
+
+static inline {
+	void ?{}(once_flag & this) { this.state = ARMED; }
+
+	void once_wait$(once_flag & this) { // enqueue the caller as a waiter and park, unless the list is already poisoned
+		// just push the thread to the list
+		if(push( this.waiters, active_thread() )) {
+			// the list wasn't poisoned, push was successful, just park.
+			park();
+		}
+	}
+
+	void once_call$( once_flag & this, void (*func)(void) ) { // winner's path: run func, publish READY, poison the list, wake every waiter
+		/* paranoid */ verify( once_state.IN_PROGRESS == __atomic_load_n(&this.state, __ATOMIC_RELAXED) );
+		/* paranoid */ verify( ! is_poisoned(this.waiters) );
+
+		// call the thing we are here for!
+		func();
+
+		/* paranoid */ verify( ! is_poisoned(this.waiters) );
+		/* paranoid */ verify( once_state.IN_PROGRESS == __atomic_load_n(&this.state, __ATOMIC_RELAXED) );
+
+		// Mark the call as being done: READY is what the fast path in call_once tests for.
+		__atomic_store_n( &this.state, (int)once_state.READY, __ATOMIC_SEQ_CST );
+
+		// wake up the sleepers and make sure no new sleeper arrives
+		thread$ * sleeper = poison( this.waiters );
+
+		/* paranoid */ verify( is_poisoned(this.waiters) );
+		/* paranoid */ verify( once_state.READY == __atomic_load_n(&this.state, __ATOMIC_RELAXED) );
+
+		while(sleeper != 0p) {
+			// find the next thread now because unpark invalidates the pointer
+			thread$ * next = advance(sleeper);
+
+			// wake-up the thread, invalidates pointer
+			unpark( sleeper );
+
+			// update the current
+			sleeper = next;
+		}
+	}
+
+	bool call_once( once_flag & this, void (*func)(void) ) { // returns true only to the single caller that ran func
+		// fast path: is the call already done?
+		if(likely(once_state.READY == __atomic_load_n(&this.state, __ATOMIC_RELAXED))) {
+			/* paranoid */ verify( is_poisoned(this.waiters) );
+			return false;
+		}
+
+		// Try to CAS ourself as the thread that will actually call the function
+		int expected = ARMED;	// the exchange below only succeeds while state is still ARMED
+		if( __atomic_compare_exchange_n( &this.state, &expected, (int)once_state.IN_PROGRESS, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST) ) {
+
+			// we won the race, call the function
+			once_call$( this, func );
+
+			/* paranoid */ verify( is_poisoned(this.waiters) );
+			/* paranoid */ verify( once_state.READY == __atomic_load_n(&this.state, __ATOMIC_RELAXED) );
+
+			// in case someone cares, this call did do the underlying call
+			return true;
+		}
+		else {
+
+			// someone else claimed the call; once_wait$ parks us unless the waiter list is already poisoned
+			once_wait$( this );
+
+			/* paranoid */ verify( is_poisoned(this.waiters) );
+			/* paranoid */ verify( once_state.READY == __atomic_load_n(&this.state, __ATOMIC_RELAXED) );
+
+			// in case someone cares, someone else did the call
+			return false;
+		}
+	}
+}
Index: libcfa/src/concurrency/preemption.cfa
===================================================================
--- libcfa/src/concurrency/preemption.cfa	(revision b77f0e1fb94f6a4a2617000cc28b1371637e1fb8)
+++ libcfa/src/concurrency/preemption.cfa	(revision 63be33872128e89a0dc1fe9bdb8448d00934a453)
@@ -352,5 +352,5 @@
 	sigset_t oldset;
 	int ret;
-	ret = pthread_sigmask(0, ( const sigset_t * ) 0p, &oldset);  // workaround trac#208: cast should be unnecessary
+	ret = __cfaabi_pthread_sigmask(0, ( const sigset_t * ) 0p, &oldset);  // workaround trac#208: cast should be unnecessary
 	if(ret != 0) { abort("ERROR sigprocmask returned %d", ret); }
 
@@ -385,5 +385,5 @@
 	sigaddset( &mask, sig );
 
-	if ( pthread_sigmask( SIG_UNBLOCK, &mask, 0p ) == -1 ) {
+	if ( __cfaabi_pthread_sigmask( SIG_UNBLOCK, &mask, 0p ) == -1 ) {
 	    abort( "internal error, pthread_sigmask" );
 	}
@@ -396,5 +396,5 @@
 	sigaddset( &mask, sig );
 
-	if ( pthread_sigmask( SIG_BLOCK, &mask, 0p ) == -1 ) {
+	if ( __cfaabi_pthread_sigmask( SIG_BLOCK, &mask, 0p ) == -1 ) {
 		abort( "internal error, pthread_sigmask" );
 	}
@@ -404,5 +404,5 @@
 static void preempt( processor * this ) {
 	sigval_t value = { PREEMPT_NORMAL };
-	pthread_sigqueue( this->kernel_thread, SIGUSR1, value );
+	__cfaabi_pthread_sigqueue( this->kernel_thread, SIGUSR1, value );
 }
 
@@ -415,5 +415,5 @@
 	sigset_t oldset;
 	int ret;
-	ret = pthread_sigmask(0, ( const sigset_t * ) 0p, &oldset);  // workaround trac#208: cast should be unnecessary
+	ret = __cfaabi_pthread_sigmask(0, ( const sigset_t * ) 0p, &oldset);  // workaround trac#208: cast should be unnecessary
 	if(ret != 0) { abort("ERROR sigprocmask returned %d", ret); }
 
@@ -434,5 +434,5 @@
 	sigset_t oldset;
 	int ret;
-	ret = pthread_sigmask(0, ( const sigset_t * ) 0p, &oldset);  // workaround trac#208: cast should be unnecessary
+	ret = __cfaabi_pthread_sigmask(0, ( const sigset_t * ) 0p, &oldset);  // workaround trac#208: cast should be unnecessary
 	if(ret != 0) { abort("ERROR sigprocmask returned %d", ret); }
 
@@ -505,5 +505,5 @@
 	sigval val;
 	val.sival_int = 0;
-	pthread_sigqueue( alarm_thread, SIGALRM, val );
+	__cfaabi_pthread_sigqueue( alarm_thread, SIGALRM, val );
 
 	// Wait for the preemption thread to finish
@@ -579,5 +579,5 @@
 	static_assert( sizeof( sigset_t ) == sizeof( cxt->uc_sigmask ), "Expected cxt->uc_sigmask to be of sigset_t" );
 	#endif
-	if ( pthread_sigmask( SIG_SETMASK, (sigset_t *)&(cxt->uc_sigmask), 0p ) == -1 ) {
+	if ( __cfaabi_pthread_sigmask( SIG_SETMASK, (sigset_t *)&(cxt->uc_sigmask), 0p ) == -1 ) {
 		abort( "internal error, sigprocmask" );
 	}
@@ -607,5 +607,5 @@
 	sigset_t mask;
 	sigfillset(&mask);
-	if ( pthread_sigmask( SIG_BLOCK, &mask, 0p ) == -1 ) {
+	if ( __cfaabi_pthread_sigmask( SIG_BLOCK, &mask, 0p ) == -1 ) {
 	    abort( "internal error, pthread_sigmask" );
 	}
Index: libcfa/src/concurrency/pthread.cfa
===================================================================
--- libcfa/src/concurrency/pthread.cfa	(revision 63be33872128e89a0dc1fe9bdb8448d00934a453)
+++ libcfa/src/concurrency/pthread.cfa	(revision 63be33872128e89a0dc1fe9bdb8448d00934a453)
@@ -0,0 +1,920 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2019 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// pthread.cfa -- pthread API implemented on top of Cforall user-level threads and locks
+//
+// Author           : Zhenyan Zhu
+// Created On       : Sat Aug 6 16:29:18 2022
+// Last Modified By :
+// Last Modified On :
+// Update Count     :
+//
+
+#define __cforall_thread__
+#define _GNU_SOURCE
+
+#include <signal.h>
+#include <pthread.h>
+#include <errno.h>
+#include "locks.hfa"
+#include "bits/stack.hfa"
+
+
+#define check_nonnull(x) asm("": "+rm"(x)); if( x == 0p ) return EINVAL;
+
+/* pthread key, pthread once inner routine mutual exclusion */
+static simple_owner_lock once_lock,key_lock,magic_mutex_check, concurrency_lock;
+
+//######################### Local Storage Helpers #########################
+
+enum { PTHREAD_KEYS_MAX = 1024 };
+
+struct pthread_values{
+	inline Seqable;
+	void* value;
+	bool in_use;
+};
+
+static inline {
+	pthread_values *& Back( pthread_values * n ) {
+		return (pthread_values *)Back( (Seqable *)n );
+	}
+
+	pthread_values *& Next( pthread_values * n ) {
+		return (pthread_values *)Next( (Colable *)n );
+	}
+}
+
+struct pthread_keys {
+	bool in_use;
+	void (*destructor)( void * );
+	Sequence(pthread_values) threads;
+};
+
+static void ?{}(pthread_keys& k){
+	k.threads{};
+}
+
+// Create storage separately to ensure no constructors are called.
+static pthread_keys cfa_pthread_keys_storage[PTHREAD_KEYS_MAX] __attribute__((aligned (16)));
+
+static void init_pthread_storage(){
+	for (int i = 0; i < PTHREAD_KEYS_MAX; i++){
+		cfa_pthread_keys_storage[i]{};
+	}
+}
+
+#define cfa_pthread_keys ((pthread_keys *)cfa_pthread_keys_storage)
+
+/* Controlling the iterations of destructors for thread-specific data.  */
+#define _POSIX_THREAD_DESTRUCTOR_ITERATIONS	4
+/* Number of iterations this implementation does.  */
+#define PTHREAD_DESTRUCTOR_ITERATIONS	_POSIX_THREAD_DESTRUCTOR_ITERATIONS
+
+//######################### Parallelism Helpers #########################
+
+struct Pthread_kernel_threads{
+	inline Colable;
+	processor p;
+};
+
+Pthread_kernel_threads *& Next( Pthread_kernel_threads * n ) {
+	return (Pthread_kernel_threads *)Next( (Colable *)n );
+}
+
+static Stack(Pthread_kernel_threads) cfa_pthreads_kernel_threads;
+static bool cfa_pthreads_kernel_threads_zero = false;	// set to zero ?
+static int cfa_pthreads_no_kernel_threads = 1;	// number of kernel threads
+
+
+//######################### Cond Helpers #########################
+
+typedef pthread_cond_var(simple_owner_lock) cfa2pthr_cond_var_t;
+
+/* condvar helper routines */
+static void init(pthread_cond_t* pcond){ // construct a cfa2pthr_cond_var_t in place inside the pthread_cond_t storage
+	static_assert(sizeof(pthread_cond_t) >= sizeof(cfa2pthr_cond_var_t),"sizeof(pthread_cond_t) < sizeof(cfa2pthr_cond_var_t)");
+	cfa2pthr_cond_var_t* _cond = (cfa2pthr_cond_var_t*)pcond;
+	?{}(*_cond);
+}
+
+static cfa2pthr_cond_var_t* get(pthread_cond_t* pcond){ // reinterpret the pthread_cond_t storage as the CFA condition variable
+	static_assert(sizeof(pthread_cond_t) >= sizeof(cfa2pthr_cond_var_t),"sizeof(pthread_cond_t) < sizeof(cfa2pthr_cond_var_t)");
+	return (cfa2pthr_cond_var_t*)pcond;
+}
+
+static void destroy(pthread_cond_t* cond){ // run the destructor of the condition variable stored inside the pthread_cond_t
+	static_assert(sizeof(pthread_cond_t) >= sizeof(cfa2pthr_cond_var_t),"sizeof(pthread_cond_t) < sizeof(cfa2pthr_cond_var_t)");
+	^?{}(*get(cond));
+}
+
+
+//######################### Mutex Helper #########################
+
+/* mutex helper routines */
+static void mutex_check(pthread_mutex_t* t){ // initialize the mutex on first use when its __lock word still reads 0
+	// Use double check to improve performance: test, take magic_mutex_check, test again.
+	// Check is safe on x86; NOTE(review): the casts below drop the volatile qualifier -- confirm the reads cannot be reordered
+	volatile pthread_mutex_t *const mutex_ = t;
+
+	// SKULLDUGGERY: not a portable way to access the __lock field, /usr/include/x86_64-linux-gnu/bits/pthreadtypes.h
+	int _lock_val = ((pthread_mutex_t *)mutex_)->__data.__lock;
+
+	// if pthread_mutex_t is initialized by PTHREAD_MUTEX_INITIALIZER, _lock_val should be 0
+	if ( _lock_val == 0 ) {
+		lock(magic_mutex_check);
+		_lock_val = ((pthread_mutex_t *)mutex_)->__data.__lock;	// second read, now under the lock
+		if ( _lock_val == 0 ) {
+			pthread_mutex_init( t, NULL );	// presumably leaves __lock non-zero so later calls skip this path -- TODO confirm
+		}
+		unlock(magic_mutex_check);
+	}
+} // mutex_check
+
+
+static void init(pthread_mutex_t* plock){
+	static_assert(sizeof(pthread_mutex_t) >= sizeof(simple_owner_lock),"sizeof(pthread_mutex_t) < sizeof(simple_owner_lock)");
+	simple_owner_lock* _lock = (simple_owner_lock*)plock;
+	?{}(*_lock);
+}
+
+static simple_owner_lock* get(pthread_mutex_t* plock){
+	static_assert(sizeof(pthread_mutex_t) >= sizeof(simple_owner_lock),"sizeof(pthread_mutex_t) < sizeof(simple_owner_lock)");
+	return (simple_owner_lock*)plock;
+}
+
+static void destroy(pthread_mutex_t* plock){
+	static_assert(sizeof(pthread_mutex_t) >= sizeof(simple_owner_lock),"sizeof(pthread_mutex_t) < sizeof(simple_owner_lock)");
+	^?{}(*get(plock));
+}
+
+//######################### Attr helpers #########################
+struct cfaPthread_attr_t {								// thread attributes
+		int contentionscope;
+		int detachstate;
+		size_t stacksize;
+		void *stackaddr;
+		int policy;
+		int inheritsched;
+		struct sched_param param;
+} typedef cfaPthread_attr_t;
+
+static const cfaPthread_attr_t default_attrs{
+	0,
+	0,
+	(size_t)65000,
+	(void *)NULL,
+	0,
+	0,
+	{0}
+};
+
+static cfaPthread_attr_t* get(const pthread_attr_t* attr){
+	static_assert(sizeof(pthread_attr_t) >= sizeof(cfaPthread_attr_t),"sizeof(pthread_attr_t) < sizeof(cfaPthread_attr_t)");
+	return (cfaPthread_attr_t*)attr;
+}
+
+
+//######################### Threads Helper #########################
+
+// exception for cancel_stack in pthread_exit
+exception pthread_exit_exp {};
+static vtable(pthread_exit_exp) exp_vt;
+
+thread cfaPthread{
+	cfaPthread_attr_t attr;
+	pthread_t pthreadId;
+
+	// pthreads return value
+	void *joinval;
+
+	// pthread attributes
+	pthread_attr_t pthread_attr;
+
+	void *(*start_routine)(void *);
+	void *start_arg;
+
+	// thread local data
+	pthread_values* pthreadData;
+
+	// flag used for tryjoin
+	bool isTerminated;
+};
+
+/* thread part routines */
+//  cfaPthread entry point
+void main(cfaPthread& _thread) with(_thread){
+	joinval =  start_routine(start_arg);
+	isTerminated = true;
+}
+
+static cfaPthread *lookup( pthread_t p ){
+	static_assert(sizeof(pthread_t) >= sizeof(cfaPthread*),"sizeof(pthread_t) < sizeof(cfaPthread*)");
+	return (cfaPthread*)p;
+}
+
+static void pthread_deletespecific_( pthread_values* values )  { // run key destructors over a thread's value table, then free it (see uMachContext::invokeTask)
+	pthread_values* value;
+	pthread_keys* key;
+	bool destcalled = true;
+	if (values != NULL){
+		for ( int attempts = 0; attempts < PTHREAD_DESTRUCTOR_ITERATIONS && destcalled ; attempts += 1 ) {
+			destcalled = false;	// another pass is made only if this pass ran at least one destructor
+			lock(key_lock);
+			for (int i = 0; i < PTHREAD_KEYS_MAX; i++){
+				// for each slot of this thread that currently holds a value
+				if ( values[i].in_use){
+					value = &values[i];
+					key = &cfa_pthread_keys[i];
+					value->in_use = false;
+					remove(key->threads, *value);
+					// If a key has a non-NULL destructor pointer and the thread has a non-NULL value associated with that key,
+					// the destructor is called with that value as its sole argument; the slot's value is set to NULL afterwards.
+					if (value->value != NULL && key->destructor != NULL){
+						unlock(key_lock);	// key_lock is released for the duration of the user destructor
+						key->destructor(value->value); // run destructor
+						lock(key_lock);
+						destcalled = true;
+					}   // if
+					value->value = NULL;
+				}   // if
+			}   // for
+			unlock(key_lock);
+		}   // for
+		free(values);
+	}   // if
+}
+
+static void ^?{}(cfaPthread & mutex t){
+	// delete pthread local storage
+	pthread_values * values = t.pthreadData;
+	pthread_deletespecific_(values);
+}
+
+static void ?{}(cfaPthread &t, pthread_t* _thread, const pthread_attr_t * _attr,void *(*start_routine)(void *), void * arg) {
+	static_assert(sizeof(pthread_t) >= sizeof(cfaPthread*), "pthread_t too small to hold a pointer: sizeof(pthread_t) < sizeof(cfaPthread*)");
+	// NOTE(review): joinval and isTerminated are not assigned in this constructor -- confirm isTerminated starts out false
+	// set up user thread stackSize; get() is a plain cast, so attr is NULL exactly when _attr is NULL
+	cfaPthread_attr_t * attr = get(_attr);
+	((thread&)t){ attr ? attr->stacksize: DEFAULT_STACK_SIZE };
+
+	// the thread id is the address of the cfaPthread object; it is stored in t and written through _thread
+	*_thread = t.pthreadId = (pthread_t)(&t);
+
+	// if attr null, self attr will be set as default_attrs; else set to attr
+	t.attr = (attr != NULL ? *attr : default_attrs);
+
+	// init start routine and arguments
+	t.start_routine = start_routine;
+	t.start_arg = arg;
+	t.pthreadData = NULL;	// the value table is allocated on the first pthread_setspecific call
+}
+
+
+extern "C"{
+	//######################### Pthread Attrs #########################
+
+	int pthread_attr_init(pthread_attr_t *attr) libcfa_public __THROW {
+		cfaPthread_attr_t* _attr = get(attr);
+		?{}(*_attr, default_attrs);
+		return 0;
+	}
+	int pthread_attr_destroy(pthread_attr_t *attr) libcfa_public __THROW {
+		^?{}(*get(attr));
+		return 0;
+	}
+
+	int pthread_attr_setscope( pthread_attr_t *attr, int contentionscope ) libcfa_public __THROW {
+		get( attr )->contentionscope = contentionscope;
+		return 0;
+	} // pthread_attr_setscope
+
+	int pthread_attr_getscope( const pthread_attr_t *attr, int *contentionscope ) libcfa_public __THROW {
+		*contentionscope = get( attr )->contentionscope;
+		return 0;
+	} // pthread_attr_getscope
+
+	int pthread_attr_setdetachstate( pthread_attr_t *attr, int detachstate ) libcfa_public __THROW {
+		get( attr )->detachstate = detachstate;
+		return 0;
+	} // pthread_attr_setdetachstate
+
+	int pthread_attr_getdetachstate( const pthread_attr_t *attr, int *detachstate ) libcfa_public __THROW {
+		*detachstate = get( attr )->detachstate;
+		return 0;
+	} // pthread_attr_getdetachstate
+
+	int pthread_attr_setstacksize( pthread_attr_t *attr, size_t stacksize ) libcfa_public __THROW {
+		get( attr )->stacksize = stacksize;
+		return 0;
+	} // pthread_attr_setstacksize
+
+	int pthread_attr_getstacksize( const pthread_attr_t *attr, size_t *stacksize ) libcfa_public __THROW {
+		*stacksize = get( attr )->stacksize;
+		return 0;
+	} // pthread_attr_getstacksize
+
+	int pthread_attr_getguardsize( const pthread_attr_t * /* attr */, size_t * /* guardsize */ ) libcfa_public __THROW {
+		return 0;
+	} // pthread_attr_getguardsize
+
+	int pthread_attr_setguardsize( pthread_attr_t * /* attr */, size_t /* guardsize */ ) libcfa_public __THROW {
+		return 0;
+	} // pthread_attr_setguardsize
+
+	int pthread_attr_setstackaddr( pthread_attr_t *attr, void *stackaddr ) libcfa_public __THROW {
+		get( attr )->stackaddr = stackaddr;
+		return 0;
+	} // pthread_attr_setstackaddr
+
+	int pthread_attr_getstackaddr( const pthread_attr_t *attr, void **stackaddr ) libcfa_public __THROW {
+		*stackaddr = get( attr )->stackaddr;
+		return 0;
+	} // pthread_attr_getstackaddr
+
+	int pthread_attr_setstack( pthread_attr_t *attr, void *stackaddr, size_t stacksize ) libcfa_public __THROW {
+		get( attr )->stackaddr = stackaddr;
+		get( attr )->stacksize = stacksize;
+		return 0;
+	} // pthread_attr_setstack
+
+	int pthread_attr_getstack( const pthread_attr_t *attr, void **stackaddr, size_t *stacksize ) libcfa_public __THROW {
+		*stackaddr = get( attr )->stackaddr;
+		*stacksize = get( attr )->stacksize;
+		return 0;
+	} // pthread_attr_getstack
+
+	// Initialize thread attribute *attr with attributes corresponding to the
+	// already running thread threadID. It shall be called on uninitialized attr
+	// and destroyed with pthread_attr_destroy when no longer needed.
+	int pthread_getattr_np( pthread_t threadID, pthread_attr_t *attr ) libcfa_public __THROW { // GNU extension
+		check_nonnull(attr);
+
+		// copy all fields
+		*get(attr) = lookup( threadID )->attr;
+
+		return 0;
+	} // pthread_getattr_np
+
+
+	//######################### Threads #########################
+
+	int pthread_create(pthread_t * _thread, const pthread_attr_t * attr, void *(*start_routine)(void *), void * arg) libcfa_public __THROW {
+		cfaPthread *t = alloc();	// heap object; pthread_join and pthread_tryjoin_np delete it
+		(*t){_thread, attr, start_routine, arg};	// the constructor also writes the new id through _thread
+		return 0;	// no failure path: 0 is the only value returned
+	}
+
+
+	int pthread_join(pthread_t _thread, void **value_ptr) libcfa_public __THROW {
+		// reject a null id and a thread trying to join itself
+		if (_thread == NULL) return EINVAL;
+		if (_thread == pthread_self()) return EDEADLK;
+
+		// the id is the address of the target's cfaPthread object
+		cfaPthread* p = lookup(_thread);
+		try {
+			join(*p);
+		}
+		// pthread_exit ends the thread through cancel_stack; the empty handler lets the join complete normally
+		catchResume (ThreadCancelled(cfaPthread) * cancel) {}
+
+		// hand back the value recorded by main() or pthread_exit, if the caller asked for it
+		if (value_ptr != NULL ) *value_ptr = p->joinval;
+		delete(p);	// releases the object allocated in pthread_create
+		return 0;
+	}
+
+	int pthread_tryjoin_np(pthread_t _thread, void **value_ptr) libcfa_public __THROW {
+		// reject a null id and a thread trying to join itself
+		if (_thread == NULL) return EINVAL;
+		if (_thread == pthread_self()) return EDEADLK;
+
+		cfaPthread* p = lookup(_thread);
+
+		// isTerminated is set by main() after start_routine returns and by pthread_exit; otherwise report busy
+		if (!p->isTerminated) return EBUSY;
+
+		join( *p );	// NOTE(review): unlike pthread_join, no catchResume for ThreadCancelled here -- confirm joining a pthread_exit'ed thread is safe
+
+		if (value_ptr != NULL ) *value_ptr = p->joinval;
+		delete(p);	// releases the object allocated in pthread_create
+		return 0;
+	}
+
+	pthread_t pthread_self(void) libcfa_public __THROW { // derives the caller's cfaPthread address, which is what pthread_create hands out as the id
+		return (pthread_t)((uintptr_t)active_thread() - (sizeof(cfaPthread) - sizeof(thread$))); // presumably thread$ sits at the end of cfaPthread and the caller is a cfaPthread -- TODO confirm
+	}
+
+	void pthread_exit(void * status) libcfa_public __THROW {
+		// the pthread id doubles as the address of the calling thread's cfaPthread object
+		cfaPthread* caller = (cfaPthread*)pthread_self();
+		caller->joinval = status;	// value a later join hands back
+		caller->isTerminated = true;	// lets pthread_tryjoin_np treat the thread as finished
+		cancel_stack((pthread_exit_exp){&exp_vt});	// leave the thread by raising the exit exception
+	}   // pthread_exit
+
+	int pthread_yield( void ) __THROW {			// GNU extension
+		yield();
+		return 0;
+	}
+
+
+	//######################### Mutex #########################
+
+	int pthread_mutex_init(pthread_mutex_t *_mutex, const pthread_mutexattr_t *attr) libcfa_public __THROW {
+		check_nonnull(_mutex);
+		init(_mutex);
+		return 0;
+	}   //pthread_mutex_init_
+
+
+	int pthread_mutex_destroy(pthread_mutex_t *_mutex) libcfa_public __THROW {
+		check_nonnull(_mutex);
+		simple_owner_lock* _lock = get(_mutex);
+		if (_lock->owner != NULL){
+			return EBUSY;
+		}
+		destroy(_mutex);
+		return 0;
+	}   //pthread_mutex_destroy_
+
+	int pthread_mutex_lock(pthread_mutex_t *_mutex) libcfa_public __THROW {
+		check_nonnull(_mutex);
+		mutex_check(_mutex);
+		simple_owner_lock* _lock = get(_mutex);
+		lock(*_lock);
+		return 0;
+	}   //pthread_mutex_lock_
+
+	int pthread_mutex_unlock(pthread_mutex_t *_mutex) libcfa_public __THROW {
+		check_nonnull(_mutex);
+		simple_owner_lock* _lock = get(_mutex);
+		if (_lock->owner != active_thread()){
+			return EPERM;
+		} // current thread does not hold the mutex
+		unlock(*_lock);
+		return 0;
+	}   //pthread_mutex_unlock_
+
+	int pthread_mutex_trylock(pthread_mutex_t *_mutex) libcfa_public __THROW {
+		check_nonnull(_mutex);
+		simple_owner_lock* _lock = get(_mutex);
+		if (_lock->owner != active_thread() && _lock->owner != NULL){
+			return EBUSY;
+		}   // mutex is owned by another thread
+		lock(*_lock);	// NOTE(review): owner was read without holding the lock; if another thread acquires it in between, this call may block -- confirm
+		return 0;
+	}   //pthread_mutex_trylock_
+
+	//######################### Conditional Variable #########################
+
+	/* conditional variable routines */
+	int pthread_cond_init(pthread_cond_t *cond, const pthread_condattr_t *attr) libcfa_public __THROW {
+		check_nonnull(cond);
+		init(cond);
+		return 0;
+	}  //pthread_cond_init
+
+	int pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *_mutex) libcfa_public __THROW {
+		check_nonnull(_mutex);
+		check_nonnull(cond);
+		wait(*get(cond), *get(_mutex));
+		return 0;
+	} // pthread_cond_wait
+
+	int pthread_cond_timedwait(pthread_cond_t * cond, pthread_mutex_t * _mutex, const struct timespec * abstime) libcfa_public __THROW {
+		check_nonnull(_mutex);
+		check_nonnull(cond);
+		wait(*get(cond), *get(_mutex), *abstime);	// abstime is dereferenced without a null check
+		return 0;	// NOTE(review): always 0, so an expired timeout is not reported as ETIMEDOUT -- confirm this is intended
+	} // pthread_cond_timedwait
+
+	int pthread_cond_signal(pthread_cond_t *cond) libcfa_public __THROW {
+		check_nonnull(cond);
+		return notify_one(*get(cond));
+	} // pthread_cond_signal
+
+	int pthread_cond_broadcast(pthread_cond_t *cond) libcfa_public __THROW {
+		check_nonnull(cond);
+		return notify_all(*get(cond));
+	} // pthread_cond_broadcast
+
+	int pthread_cond_destroy(pthread_cond_t *cond) libcfa_public __THROW {
+		check_nonnull(cond);
+		destroy(cond);
+		return 0;
+	} // pthread_cond_destroy
+
+
+
+	//######################### Local storage #########################
+
+	int pthread_once(pthread_once_t *once_control, void (*init_routine)(void)) libcfa_public __THROW {
+		static_assert(sizeof(pthread_once_t) >= sizeof(int),"sizeof(pthread_once_t) < sizeof(int)");
+		check_nonnull(once_control);
+		check_nonnull(init_routine);
+		lock(once_lock);	// a single file-scope lock guards every once_control
+		if ( *((int *)once_control) == 0 ) {	// 0 is treated as "init_routine has not run yet"
+			init_routine();	// called while once_lock is held
+			*((int *)once_control) = 1;
+		} // if
+		unlock(once_lock);
+		return 0;
+	} // pthread_once
+
+	int pthread_key_create( pthread_key_t *key, void (*destructor)( void * ) ) libcfa_public __THROW {
+		lock(key_lock);
+		for ( int slot = 0; slot < PTHREAD_KEYS_MAX; slot += 1 ) {
+			pthread_keys & entry = cfa_pthread_keys[slot];
+			if ( entry.in_use ) continue;	// taken, keep scanning for the first free slot
+			entry.in_use = true;
+			entry.destructor = destructor;
+			unlock( key_lock );
+			*key = slot;	// the slot index is the key handed back to the caller
+			return 0;
+		} // for
+		unlock(key_lock);
+		return EAGAIN;	// every slot of the key table is in use
+	}   // pthread_key_create
+
+	int pthread_key_delete( pthread_key_t key ) libcfa_public __THROW {
+		lock(key_lock);
+		if ( key >= PTHREAD_KEYS_MAX || ! cfa_pthread_keys[key].in_use ) {
+			unlock( key_lock );
+			return EINVAL;
+		} // out of range, or the key was never created / already deleted
+		cfa_pthread_keys[key].in_use = false;
+		cfa_pthread_keys[key].destructor = NULL;
+
+		// Unlink every thread's entry for this key and mark it unused; no destructor is called here.
+		pthread_values& p;
+		Sequence(pthread_values)& head = cfa_pthread_keys[key].threads;
+		for ( SeqIter(pthread_values) iter = { head }; iter | p; ) {
+			remove(head, p);
+			p.in_use = false;
+		}
+		unlock(key_lock);
+		return 0;
+	}   // pthread_key_delete
+
+	int pthread_setspecific( pthread_key_t key, const void *value ) libcfa_public __THROW {
+		// get current thread; presumably the caller must be a thread made by pthread_create -- TODO confirm
+		cfaPthread* t = lookup(pthread_self());
+		// on the thread's first call, allocate its value table with one slot per possible key
+		pthread_values* values;
+		if (t->pthreadData == NULL){
+			values = anew( PTHREAD_KEYS_MAX);
+			t->pthreadData = values;
+			for (int i = 0;i < PTHREAD_KEYS_MAX; i++){
+				t->pthreadData[i].in_use = false;
+			}   // for
+		}   else {
+			values = t->pthreadData;
+		}   // if
+		// find corresponding key and set value
+		lock(key_lock);
+		// invalid key: out of range or not currently created (the table allocated above is kept regardless)
+		if ( key >= PTHREAD_KEYS_MAX || ! cfa_pthread_keys[key].in_use ) {
+			unlock( key_lock );
+			return EINVAL;
+		} // if
+		pthread_values &entry = values[key];
+		if ( ! entry.in_use ) {
+			entry.in_use = true;
+			add(cfa_pthread_keys[key].threads, entry);	// link the slot into the key's list so pthread_key_delete can find it
+		} // if
+		entry.value = (void *)value;
+		unlock(key_lock);
+		return 0;
+	} //pthread_setspecific
+
+	void* pthread_getspecific(pthread_key_t key) libcfa_public __THROW {
+		if (key >= PTHREAD_KEYS_MAX || ! cfa_pthread_keys[key].in_use) return NULL;	// NOTE(review): key table is read here before key_lock is taken
+
+		// get current thread
+		cfaPthread* t = lookup(pthread_self());
+		if (t->pthreadData == NULL) return NULL;	// this thread has never called pthread_setspecific
+		lock(key_lock);
+		pthread_values &entry = ((pthread_values *)t->pthreadData)[key];
+		if ( ! entry.in_use ) {
+			unlock( key_lock );
+			return NULL;
+		} // no value set by this thread for this key
+		void *value = entry.value;
+		unlock(key_lock);
+
+		return value;
+	}   //pthread_get_specific
+
+	//######################### Parallelism #########################
+	void pthread_delete_kernel_threads_() __THROW {	// see uMain::~uMain
+		Pthread_kernel_threads& p;
+		for ( StackIter(Pthread_kernel_threads) iter = {cfa_pthreads_kernel_threads}; iter | p; ) {
+			delete(&p);
+		} // for
+	} // pthread_delete_kernel_threads_
+
+	int pthread_getconcurrency( void ) __THROW {	// XOPEN extension
+		return cfa_pthreads_kernel_threads_zero ? 0 : cfa_pthreads_no_kernel_threads;
+	} // pthread_getconcurrency
+
+	int pthread_setconcurrency( int new_level ) libcfa_public __THROW { // XOPEN extension
+		if ( new_level < 0 ) return EINVAL;
+		if ( new_level == 0 ) {
+			cfa_pthreads_kernel_threads_zero = true;	// remember set to zero, but ignore
+			return 0;					// do not do kernel thread management
+		} // exit
+		cfa_pthreads_kernel_threads_zero = false;
+		lock( concurrency_lock );
+		for ( ; new_level > cfa_pthreads_no_kernel_threads; cfa_pthreads_no_kernel_threads += 1 ) { // grow: push one new Pthread_kernel_threads per missing level
+			push(cfa_pthreads_kernel_threads, *new() );
+		} // for
+		for ( ; new_level < cfa_pthreads_no_kernel_threads; cfa_pthreads_no_kernel_threads -= 1 ) { // shrink: pop and delete one per excess level
+			delete(&pop(cfa_pthreads_kernel_threads));
+		} // for
+		unlock( concurrency_lock );
+		return 0;
+	} // pthread_setconcurrency
+
+	//######################### Signal #########################
+
+
+	 int pthread_sigmask( int /* how */, const sigset_t * /* set */, sigset_t * /* oset */ ) libcfa_public __THROW {
+		abort( "pthread_sigmask : not implemented" );
+		return 0;
+	 } // pthread_sigmask
+
+	int pthread_kill( pthread_t _thread __attribute__(( unused )), int sig ) libcfa_public __THROW {
+		if ( sig == 0 ) {
+			return 0;
+		} else {
+			abort( "pthread_kill : not implemented" );
+		} // if
+		return 0;
+	} // pthread_kill
+
+	int pthread_sigqueue(pthread_t , int sig, const union sigval) libcfa_public __THROW {
+		abort( "pthread_sigqueue : not implemented" );
+		return 0;
+	} // pthread_sigqueue
+
+	//######################### Scheduling #########################
+	int pthread_detach( pthread_t threadID ) libcfa_public __THROW { // exported like the other definitions in this section
+		abort( "pthread_detach : not implemented" );	// unsupported: abort instead of silently succeeding
+		return 0;
+	} // pthread_detach
+
+	int pthread_setschedparam( pthread_t /* thread */, int /* policy */, const struct sched_param * /* param */ ) libcfa_public __THROW {
+		abort( "pthread_setschedparam : not implemented" );
+		return 0;
+	} // pthread_setschedparam
+
+	int pthread_getschedparam( pthread_t /* thread */, int */* policy */, struct sched_param * /* param */ ) libcfa_public __THROW {
+		abort( "pthread_getschedparam : not implemented" );
+		return 0;
+	} // pthread_getschedparam
+
+	 //######################### Mutex Attr #########################
+
+	int pthread_mutexattr_init( pthread_mutexattr_t * /* attr */ ) libcfa_public __THROW {
+		return 0;
+	} // pthread_mutexattr_init
+
+	int pthread_mutexattr_destroy( pthread_mutexattr_t * /* attr */ ) libcfa_public __THROW {
+		return 0;
+	} // pthread_mutexattr_destroy
+
+	int pthread_mutexattr_setpshared( pthread_mutexattr_t * /* attr */, int /* pshared */ ) libcfa_public __THROW {
+		return 0;
+	} // pthread_mutexattr_setpshared
+
+	int pthread_mutexattr_getpshared( const pthread_mutexattr_t * /* attr */, int * /* pshared */ ) libcfa_public __THROW {
+		return 0;
+	} // pthread_mutexattr_getpshared
+
+	int pthread_mutexattr_setprotocol( pthread_mutexattr_t * /* attr */, int /* protocol */ ) libcfa_public __THROW {
+		return 0;
+	} // pthread_mutexattr_setprotocol
+
+	int pthread_mutexattr_getprotocol( const pthread_mutexattr_t * /* attr */, int * /* protocol */ ) libcfa_public __THROW {
+		return 0;
+	} // pthread_mutexattr_getprotocol
+
+	int pthread_mutexattr_setprioceiling( pthread_mutexattr_t * /* attr */, int /* prioceiling */ ) libcfa_public __THROW {
+		return 0;
+	} // pthread_mutexattr_setprioceiling
+
+	int pthread_mutexattr_getprioceiling( const pthread_mutexattr_t * /* attr */, int * /* ceiling */ ) libcfa_public __THROW {
+		return 0;
+	} // pthread_mutexattr_getprioceiling
+
+	int pthread_mutex_setprioceiling( pthread_mutex_t * /* mutex */, int /* prioceiling */, int * /* old_ceiling */ ) libcfa_public __THROW {
+		return 0;
+	} // pthread_mutex_setprioceiling
+
+	int pthread_mutex_getprioceiling( const pthread_mutex_t * /* mutex */, int * /* ceiling */ ) libcfa_public __THROW {
+		return 0;
+	} // pthread_mutex_getprioceiling
+
+	int pthread_mutexattr_gettype( __const pthread_mutexattr_t * __restrict /* __attr */, int * __restrict /* __kind */ ) libcfa_public __THROW {
+		return 0;
+	} // pthread_mutexattr_gettype
+
+	int pthread_mutexattr_settype( pthread_mutexattr_t * /* __attr */, int /* __kind */ ) libcfa_public __THROW {
+		return 0;
+	} // pthread_mutexattr_settype
+
+	//######################### Mutex #########################
+
+	int pthread_mutex_timedlock( pthread_mutex_t *__restrict /* __mutex */, __const struct timespec *__restrict /* __abstime */ ) libcfa_public __THROW {
+		abort( "pthread_mutex_timedlock" );
+	} // pthread_mutex_timedlock
+
+	//######################### Condition #########################
+
+	int pthread_condattr_getclock( __const pthread_condattr_t * __restrict /* __attr */, __clockid_t *__restrict /* __clock_id */ ) libcfa_public __THROW {
+		abort( "pthread_condattr_getclock" );
+	} // pthread_condattr_getclock
+
+	int pthread_condattr_setclock( pthread_condattr_t * /* __attr */, __clockid_t /* __clock_id */ ) libcfa_public __THROW {
+		abort( "pthread_condattr_setclock" );
+	} // pthread_condattr_setclock
+
+	//######################### Spinlock #########################
+
+	int pthread_spin_init( pthread_spinlock_t * /* __lock */, int /*__pshared */ ) libcfa_public __THROW {
+		abort( "pthread_spin_init" );
+	} // pthread_spin_init
+
+	int pthread_spin_destroy( pthread_spinlock_t * /* __lock */ ) libcfa_public __THROW {
+		abort( "pthread_spin_destroy" );
+	} // pthread_spin_destroy
+
+	int pthread_spin_lock( pthread_spinlock_t * /* __lock */ ) libcfa_public __THROW {
+		abort( "pthread_spin_lock" );
+	} // pthread_spin_lock
+
+	int pthread_spin_trylock( pthread_spinlock_t * /* __lock */ ) libcfa_public __THROW {
+		abort( "pthread_spin_trylock" );
+	} // pthread_spin_trylock
+
+	int pthread_spin_unlock( pthread_spinlock_t * /* __lock */ ) libcfa_public __THROW {
+		abort( "pthread_spin_unlock" );
+	} // pthread_spin_unlock
+
+	//######################### Barrier #########################
+
+	int pthread_barrier_init( pthread_barrier_t *__restrict /* __barrier */, __const pthread_barrierattr_t *__restrict /* __attr */, unsigned int /* __count */ ) libcfa_public __THROW {
+		abort( "pthread_barrier_init" );
+	} // pthread_barrier_init
+
+	int pthread_barrier_destroy( pthread_barrier_t * /* __barrier */ ) libcfa_public  __THROW {
+		abort( "pthread_barrier_destroy" );
+	} // pthread_barrier_destroy
+
+	int pthread_barrier_wait( pthread_barrier_t * /* __barrier */ ) libcfa_public __THROW {
+		abort( "pthread_barrier_wait" );
+	} // pthread_barrier_wait
+
+	int pthread_barrierattr_init( pthread_barrierattr_t * /* __attr */ ) libcfa_public __THROW {
+		abort( "pthread_barrierattr_init" );
+	} // pthread_barrierattr_init
+
+	int pthread_barrierattr_destroy( pthread_barrierattr_t * /* __attr */ ) libcfa_public __THROW {
+		abort( "pthread_barrierattr_destroy" );
+	} // pthread_barrierattr_destroy
+
+	int pthread_barrierattr_getpshared( __const pthread_barrierattr_t * __restrict /* __attr */, int *__restrict /* __pshared */ ) libcfa_public __THROW {
+		abort( "pthread_barrierattr_getpshared" );
+	} // pthread_barrierattr_getpshared
+
+	int pthread_barrierattr_setpshared( pthread_barrierattr_t * /* __attr */, int /* __pshared */ ) libcfa_public __THROW {
+		abort( "pthread_barrierattr_setpshared" );
+	} // pthread_barrierattr_setpshared
+
+	//######################### Clock #########################
+
+	int pthread_getcpuclockid( pthread_t /* __thread_id */, __clockid_t * /* __clock_id */ ) libcfa_public __THROW {
+		abort( "pthread_getcpuclockid" );
+	} // pthread_getcpuclockid
+
+	// pthread_atfork()
+
+// UNIX98
+
+	//######################### Read/Write #########################
+
+	int pthread_rwlock_init( pthread_rwlock_t *__restrict /* __rwlock */, __const pthread_rwlockattr_t *__restrict /* __attr */ ) libcfa_public __THROW {
+		abort( "pthread_rwlock_init" );
+	} // pthread_rwlock_init
+
+	int pthread_rwlock_destroy( pthread_rwlock_t * /* __rwlock */ ) libcfa_public __THROW {
+		abort( "pthread_rwlock_destroy" );
+	} // pthread_rwlock_destroy
+
+	int pthread_rwlock_rdlock( pthread_rwlock_t * /* __rwlock */ ) libcfa_public __THROW {
+		abort( "pthread_rwlock_rdlock" );
+	} // pthread_rwlock_rdlock
+
+	int pthread_rwlock_tryrdlock( pthread_rwlock_t * /* __rwlock */ ) libcfa_public __THROW {
+		abort( "pthread_rwlock_tryrdlock" );
+	} // pthread_rwlock_tryrdlock
+
+	int pthread_rwlock_wrlock( pthread_rwlock_t * /* __rwlock */ ) libcfa_public __THROW {
+		abort( "pthread_rwlock_wrlock" );
+	} // pthread_rwlock_wrlock
+
+	int pthread_rwlock_trywrlock( pthread_rwlock_t * /* __rwlock */ ) libcfa_public __THROW {
+		abort( "pthread_rwlock_trywrlock" );
+	} // pthread_rwlock_trywrlock
+
+	int pthread_rwlock_unlock( pthread_rwlock_t * /* __rwlock */ ) libcfa_public __THROW {
+		abort( "pthread_rwlock_unlock" );
+	} // pthread_rwlock_unlock
+
+	int pthread_rwlockattr_init( pthread_rwlockattr_t * /* __attr */ ) libcfa_public __THROW {
+		abort( "pthread_rwlockattr_init" );
+	} // pthread_rwlockattr_init
+
+	int pthread_rwlockattr_destroy( pthread_rwlockattr_t * /*__attr */ ) libcfa_public __THROW {
+		abort( "pthread_rwlockattr_destroy" );
+	} // pthread_rwlockattr_destroy
+
+	int pthread_rwlockattr_getpshared( __const pthread_rwlockattr_t * __restrict /* __attr */, int *__restrict /* __pshared */ ) libcfa_public __THROW {
+		abort( "pthread_rwlockattr_getpshared" );
+	} // pthread_rwlockattr_getpshared
+
+	int pthread_rwlockattr_setpshared( pthread_rwlockattr_t * /* __attr */, int /* __pshared */ ) libcfa_public __THROW {
+		abort( "pthread_rwlockattr_setpshared" );
+	} // pthread_rwlockattr_setpshared
+
+	int pthread_rwlockattr_getkind_np( __const pthread_rwlockattr_t * /* __attr */, int * /* __pref */ ) libcfa_public __THROW {
+		abort( "pthread_rwlockattr_getkind_np" );
+	} // pthread_rwlockattr_getkind_np
+
+	int pthread_rwlockattr_setkind_np( pthread_rwlockattr_t * /* __attr */, int /* __pref */ ) libcfa_public __THROW {
+		abort( "pthread_rwlockattr_setkind_np" );
+	} // pthread_rwlockattr_setkind_np
+
+// UNIX98 + XOPEN
+
+	int pthread_rwlock_timedrdlock( pthread_rwlock_t *__restrict  /* __rwlock */, __const struct timespec *__restrict /* __abstime */ ) libcfa_public __THROW {
+		abort( "pthread_rwlock_timedrdlock" );
+	} // pthread_rwlock_timedrdlock
+
+	int pthread_rwlock_timedwrlock( pthread_rwlock_t *__restrict  /* __rwlock */, __const struct timespec *__restrict /* __abstime */ ) libcfa_public __THROW {
+		abort( "pthread_rwlock_timedwrlock" );
+	} // pthread_rwlock_timedwrlock
+
+// GNU
+
+	//######################### Parallelism #########################
+
+	int pthread_setaffinity_np( pthread_t /* __th */, size_t /* __cpusetsize */, __const cpu_set_t * /* __cpuset */ ) libcfa_public __THROW {
+		abort( "pthread_setaffinity_np" );
+	} // pthread_setaffinity_np
+
+	int pthread_getaffinity_np( pthread_t /* __th */, size_t /* __cpusetsize */, cpu_set_t * /* __cpuset */ ) libcfa_public __THROW {
+		abort( "pthread_getaffinity_np" );
+	} // pthread_getaffinity_np
+
+	int pthread_attr_setaffinity_np( pthread_attr_t * /* __attr */, size_t /* __cpusetsize */, __const cpu_set_t * /* __cpuset */ ) libcfa_public __THROW {
+		abort( "pthread_attr_setaffinity_np" );
+	} // pthread_attr_setaffinity_np
+
+	int pthread_attr_getaffinity_np( __const pthread_attr_t * /* __attr */, size_t /* __cpusetsize */, cpu_set_t * /* __cpuset */ ) libcfa_public __THROW {
+		abort( "pthread_attr_getaffinity_np" );
+	} // pthread_attr_getaffinity_np
+
+	//######################### Cancellation #########################
+
+	void _pthread_cleanup_push_defer( struct _pthread_cleanup_buffer * /* __buffer */, void( * /* __routine */ )( void * ), void * /* __arg */ ) libcfa_public __THROW {
+		abort( "_pthread_cleanup_push_defer" );
+	} // _pthread_cleanup_push_defer
+
+	void _pthread_cleanup_pop_restore( struct _pthread_cleanup_buffer * /* __buffer */, int /* __execute */ ) libcfa_public __THROW {
+		abort( "_pthread_cleanup_pop_restore" );
+	} // _pthread_cleanup_pop_restore
+
+	int pthread_cancel( pthread_t threadID ) libcfa_public __THROW {
+		abort("pthread cancel not implemented");
+		return 0;
+	} // pthread_cancel
+
+	int pthread_setcancelstate( int state, int *oldstate ) libcfa_public __THROW {
+		abort("pthread_setcancelstate not implemented");
+		return 0;
+	} // pthread_setcancelstate
+
+	int pthread_setcanceltype( int type, int *oldtype ) libcfa_public __THROW {
+		abort("pthread_setcanceltype not implemented");
+		return 0;
+	} // pthread_setcanceltype
+} // extern "C"
+
+#pragma GCC diagnostic pop
+
Index: libcfa/src/concurrency/ready_subqueue.hfa
===================================================================
--- libcfa/src/concurrency/ready_subqueue.hfa	(revision b77f0e1fb94f6a4a2617000cc28b1371637e1fb8)
+++ libcfa/src/concurrency/ready_subqueue.hfa	(revision 63be33872128e89a0dc1fe9bdb8448d00934a453)
@@ -25,5 +25,5 @@
 static inline thread$ * mock_head(const __intrusive_lane_t & this) {
 	thread$ * rhead = (thread$ *)(
-		(uintptr_t)( &this.l.anchor ) - __builtin_offsetof( thread$, link )
+		(uintptr_t)( &this.l.anchor ) - __builtin_offsetof( thread$, rdy_link )
 	);
 	return rhead;
@@ -34,8 +34,8 @@
 static inline void push( __intrusive_lane_t & this, thread$ * node ) {
 	/* paranoid */ verify( this.l.lock );
-	/* paranoid */ verify( node->link.next == 0p );
-	/* paranoid */ verify( __atomic_load_n(&node->link.ts, __ATOMIC_RELAXED) == MAX  );
-	/* paranoid */ verify( this.l.prev->link.next == 0p );
-	/* paranoid */ verify( __atomic_load_n(&this.l.prev->link.ts, __ATOMIC_RELAXED)   == MAX  );
+	/* paranoid */ verify( node->rdy_link.next == 0p );
+	/* paranoid */ verify( __atomic_load_n(&node->rdy_link.ts, __ATOMIC_RELAXED) == MAX  );
+	/* paranoid */ verify( this.l.prev->rdy_link.next == 0p );
+	/* paranoid */ verify( __atomic_load_n(&this.l.prev->rdy_link.ts, __ATOMIC_RELAXED)   == MAX  );
 	if( this.l.anchor.next == 0p ) {
 		/* paranoid */ verify( this.l.anchor.next == 0p );
@@ -51,6 +51,6 @@
 
 	// Get the relevant nodes locally
-	this.l.prev->link.next = node;
-	__atomic_store_n(&this.l.prev->link.ts, rdtscl(), __ATOMIC_RELAXED);
+	this.l.prev->rdy_link.next = node;
+	__atomic_store_n(&this.l.prev->rdy_link.ts, rdtscl(), __ATOMIC_RELAXED);
 	this.l.prev = node;
 	#if !defined(__CFA_NO_STATISTICS__)
@@ -70,9 +70,9 @@
 	// Get the relevant nodes locally
 	thread$ * node = this.l.anchor.next;
-	this.l.anchor.next = node->link.next;
-	__atomic_store_n(&this.l.anchor.ts, __atomic_load_n(&node->link.ts, __ATOMIC_RELAXED), __ATOMIC_RELAXED);
+	this.l.anchor.next = node->rdy_link.next;
+	__atomic_store_n(&this.l.anchor.ts, __atomic_load_n(&node->rdy_link.ts, __ATOMIC_RELAXED), __ATOMIC_RELAXED);
 	bool is_empty = this.l.anchor.next == 0p;
-	node->link.next = 0p;
-	__atomic_store_n(&node->link.ts, ULLONG_MAX, __ATOMIC_RELAXED);
+	node->rdy_link.next = 0p;
+	__atomic_store_n(&node->rdy_link.ts, ULLONG_MAX, __ATOMIC_RELAXED);
 	#if !defined(__CFA_NO_STATISTICS__)
 		this.l.cnt--;
@@ -83,7 +83,7 @@
 
 	unsigned long long ats = __atomic_load_n(&this.l.anchor.ts, __ATOMIC_RELAXED);
-	/* paranoid */ verify( node->link.next == 0p );
-	/* paranoid */ verify( __atomic_load_n(&node->link.ts , __ATOMIC_RELAXED) == MAX );
-	/* paranoid */ verify( __atomic_load_n(&node->link.ts , __ATOMIC_RELAXED) != 0   );
+	/* paranoid */ verify( node->rdy_link.next == 0p );
+	/* paranoid */ verify( __atomic_load_n(&node->rdy_link.ts , __ATOMIC_RELAXED) == MAX );
+	/* paranoid */ verify( __atomic_load_n(&node->rdy_link.ts , __ATOMIC_RELAXED) != 0   );
 	/* paranoid */ verify( ats != 0 );
 	/* paranoid */ verify( (ats == MAX) == is_empty );
Index: libcfa/src/concurrency/thread.cfa
===================================================================
--- libcfa/src/concurrency/thread.cfa	(revision b77f0e1fb94f6a4a2617000cc28b1371637e1fb8)
+++ libcfa/src/concurrency/thread.cfa	(revision 63be33872128e89a0dc1fe9bdb8448d00934a453)
@@ -44,15 +44,13 @@
 	self_mon_p = &self_mon;
 	curr_cluster = &cl;
-	link.next = 0p;
-	link.ts   = MAX;
+	rdy_link.next = 0p;
+	rdy_link.ts   = MAX;
 	preferred = ready_queue_new_preferred();
 	last_proc = 0p;
 	random_state = __global_random_mask ? __global_random_prime : __global_random_prime ^ rdtscl();
 	#if defined( __CFA_WITH_VERIFY__ )
+		executing = 0p;
 		canary = 0x0D15EA5E0D15EA5Ep;
 	#endif
-
-	node.next = 0p;
-	node.prev = 0p;
 
 	clh_node = malloc( );
@@ -177,4 +175,50 @@
 
 //-----------------------------------------------------------------------------
+bool migrate( thread$ * thrd, struct cluster & cl ) {
+	// Move thrd to cluster cl: returns false if it is already on cl, true after it has been moved.
+	monitor$ * tmon = get_monitor(thrd);
+	monitor$ * __monitors[] = { tmon };
+	monitor_guard_t __guard = { __monitors, 1 };	// NOTE(review): presumably holds thrd's monitor until return -- confirm
+
+
+	{
+		// if nothing needs to be done, return false
+		if( thrd->curr_cluster == &cl ) return false;
+
+		// are we migrating ourself?
+		const bool local = thrd == active_thread();
+
+		/* paranoid */ verify( !local || &cl != active_cluster() );
+		/* paranoid */ verify( !local || thrd->curr_cluster == active_cluster() );
+		/* paranoid */ verify( !local || thrd->curr_cluster == active_processor()->cltr );
+		/* paranoid */ verify( local || tmon->signal_stack.top );	// null check first: the next check dereferences top
+		/* paranoid */ verify( local || tmon->signal_stack.top->owner->waiting_thread == thrd );
+
+		// make sure we aren't interrupted while doing this
+		// not as important if we aren't local
+		disable_interrupts();
+
+		// actually move the thread
+		unregister( thrd->curr_cluster, *thrd );
+		thrd->curr_cluster = &cl;
+		doregister( thrd->curr_cluster, *thrd );
+
+		// restore interrupts
+		enable_interrupts();
+
+		// if this is the local thread, we are still running on the old cluster
+		if(local) yield();
+
+		/* paranoid */ verify( !local || &cl == active_cluster() );
+		/* paranoid */ verify( !local || thrd->curr_cluster == active_cluster() );
+		/* paranoid */ verify( !local || thrd->curr_cluster == active_processor()->cltr );
+		/* paranoid */ verify(  local || tmon->signal_stack.top );
+		/* paranoid */ verify(  local || tmon->signal_stack.top->owner->waiting_thread == thrd );
+
+		return true;
+	}
+}
+
+//-----------------------------------------------------------------------------
 #define GENERATOR LCG
 
Index: libcfa/src/concurrency/thread.hfa
===================================================================
--- libcfa/src/concurrency/thread.hfa	(revision b77f0e1fb94f6a4a2617000cc28b1371637e1fb8)
+++ libcfa/src/concurrency/thread.hfa	(revision 63be33872128e89a0dc1fe9bdb8448d00934a453)
@@ -132,4 +132,12 @@
 
 //----------
+// misc: move a thread to another cluster; the result is true only if the thread actually changed cluster
+bool migrate( thread$ * thrd, struct cluster & cl );
+// Overload for any type satisfying is_thread: takes the thread as a mutex argument and forwards to migrate.
+forall( T & | is_thread(T) )
+static inline bool migrate( T & mutex thrd, struct cluster & cl ) { return migrate( &(thread&)thrd, cl ); }
+
+
+//----------
 // prng
 static inline {
Index: libcfa/src/containers/array.hfa
===================================================================
--- libcfa/src/containers/array.hfa	(revision b77f0e1fb94f6a4a2617000cc28b1371637e1fb8)
+++ libcfa/src/containers/array.hfa	(revision 63be33872128e89a0dc1fe9bdb8448d00934a453)
@@ -1,2 +1,4 @@
+#pragma once
+
 #include <assert.h>
 
@@ -18,7 +20,12 @@
     // About the choice of integral types offered as subscript overloads:
     // Intent is to cover these use cases:
+    //    a[0]                                                // i : zero_t
+    //    a[1]                                                // i : one_t
+    //    a[2]                                                // i : int
     //    float foo( ptrdiff_t i ) { return a[i]; }           // i : ptrdiff_t
+    //    float foo( size_t i ) { return a[i]; }              // i : size_t
     //    forall( [N] ) ... for( i; N ) { total += a[i]; }    // i : typeof( sizeof(42) )
     //    for( i; 5 ) { total += a[i]; }                      // i : int
+    //
     // It gets complicated by:
     // -  CFA does overloading on concrete types, like int and unsigned int, not on typedefed
@@ -28,9 +35,28 @@
     //    should give them type size_t.
     //
-    //                          gcc -m32         cfa -m32 given bug         gcc -m64
+    //                          gcc -m32         cfa -m32 given bug         gcc -m64 (and cfa)
     // ptrdiff_t                int              int                        long int
     // size_t                   unsigned int     unsigned int               unsigned long int
     // typeof( sizeof(42) )     unsigned int     unsigned long int          unsigned long int
     // int                      int              int                        int
+    //
+    // So the solution must support types {zero_t, one_t, int, unsigned int, long int, unsigned long int}
+    //
+    // The solution cannot rely on implicit conversions (e.g. just have one overload for ptrdiff_t)
+    // because assertion satisfaction requires types to match exactly.  Both higher-dimensional
+    // subscripting and operations on slices use asserted subscript operators.  The test case
+    // array-container/array-sbscr-cases covers the combinations.  Mike believes that commenting out
+    // any of the current overloads leads to one of those cases failing, either on 64- or 32-bit.
+    // Mike is open to being shown a smaller set of overloads that still passes the test.
+
+    static inline Timmed & ?[?]( arpk(N, S, Timmed, Tbase) & a, zero_t ) {    // subscript by a zero_t index, e.g. a[0]
+        assert( 0 < N );                    // element 0 exists only when N is at least 1
+        return (Timmed &) a.strides[0];
+    }
+
+    static inline Timmed & ?[?]( arpk(N, S, Timmed, Tbase) & a, one_t ) {    // subscript by a one_t index, e.g. a[1]
+        assert( 1 < N );                    // element 1 exists only when N is at least 2
+        return (Timmed &) a.strides[1];
+    }
 
     static inline Timmed & ?[?]( arpk(N, S, Timmed, Tbase) & a, int i ) {
@@ -77,4 +103,6 @@
         return N;
     }
+
+    static inline void __taglen( tag(arpk(N, S, Timmed, Tbase)), tag(N) ) {}
 
     // workaround #226 (and array relevance thereof demonstrated in mike102/otype-slow-ndims.cfa)
@@ -156,4 +184,11 @@
 #endif
 
+// Available for users to work around Trac #265
+// If `a[...0...]` isn't working, try `a[...ix0...]` instead.
+
+#define ix0 ((ptrdiff_t)0)
+
+
+
 //
 // Rotation
@@ -185,6 +220,24 @@
 //
 
-trait ar(A &, Tv &) {
-    Tv& ?[?]( A&, ptrdiff_t );
-    size_t ?`len( A& );
-};
+// desired:
+// trait ar(A &, Tv &, [N]) {
+//     Tv& ?[?]( A&, zero_t );
+//     Tv& ?[?]( A&, one_t  );
+//     Tv& ?[?]( A&, int    );
+//                   ...
+//     size_t ?`len( A& );
+//     void __taglen( tag(A), tag(N) );
+// };
+
+// working around N's not being accepted as arguments to traits
+
+#define ar(A, Tv, N) {                 \
+    Tv& ?[?]( A&, zero_t );            \
+    Tv& ?[?]( A&, one_t );             \
+    Tv& ?[?]( A&, int );               \
+    Tv& ?[?]( A&, unsigned int );      \
+    Tv& ?[?]( A&, long int );          \
+    Tv& ?[?]( A&, unsigned long int ); \
+    size_t ?`len( A& );                \
+    void __taglen( tag(A), tag(N) );   \
+}
Index: libcfa/src/containers/lockfree.hfa
===================================================================
--- libcfa/src/containers/lockfree.hfa	(revision 63be33872128e89a0dc1fe9bdb8448d00934a453)
+++ libcfa/src/containers/lockfree.hfa	(revision 63be33872128e89a0dc1fe9bdb8448d00934a453)
@@ -0,0 +1,252 @@
+#pragma once
+
+#include <assert.h>
+
+#include <stdint.h>
+#include <bits/defs.hfa>
+
+forall( T &) {
+	//------------------------------------------------------------
+	// Queue based on the MCS lock.
+	// It is a Multi-Producer/Single-Consumer queue; threads pushing
+	// elements must hold on to the elements they push.
+	// Not appropriate for an async message queue, for example.
+	struct mcs_queue {
+		T * volatile tail;							// last element pushed; 0p when the queue is empty
+	};
+
+	static inline void ?{}(mcs_queue(T) & this) { this.tail = 0p; }
+	static inline bool empty(const mcs_queue(T) & this) { return !this.tail; }
+
+ 	static inline forall(| { T * volatile & ?`next ( T * ); })
+	{
+		// Adds an element to the list; returns the previous tail (0p if the queue was empty)
+		// Multi-Thread Safe, Lock-Free
+		T * push(mcs_queue(T) & this, T * elem) __attribute__((artificial));
+		T * push(mcs_queue(T) & this, T * elem) {
+			/* paranoid */ verify(!(elem`next));
+			// Race to add to the tail
+			T * prev = __atomic_exchange_n(&this.tail, elem, __ATOMIC_SEQ_CST);
+			// If we aren't the first, we need to tell the person before us
+			// (when prev is 0p the queue was empty and there is nobody to tell)
+			if (prev) prev`next = elem;
+			return prev;
+		}
+
+		// Advances the head of the list, dropping the element given; returns the new head (0p if elem was the last item).
+		// Passing an element that is not the head is undefined behavior
+		// NOT Multi-Thread Safe, concurrent pushes are safe
+		T * advance(mcs_queue(T) & this, T * elem) __attribute__((artificial));
+		T * advance(mcs_queue(T) & this, T * elem) {
+			T * expected = elem;
+			// Check if this is already the last item: if tail is still elem, clear it and report an empty queue
+			if (__atomic_compare_exchange_n(&this.tail, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) return 0p;
+
+			// Otherwise a push has already replaced the tail: wait for it to fill in elem's next link
+			while (!(elem`next)) Pause();
+
+			// the link is now set: it is the element that follows elem
+			T * ret = elem`next;
+
+			// invalidate link to reset to initial state
+			elem`next = 0p;
+			return ret;
+		}
+	}
+
+	//------------------------------------------------------------
+	// Queue based on the MCS lock
+	// Extension of the above lock which supports 'blind' pops.
+	// i.e., popping a value from the head without knowing what the head is
+	// has no extra guarantees beyond the mcs_queue
+	struct mpsc_queue {
+		inline mcs_queue(T);						// embedded tail-only queue used by push and advance
+		T * volatile head;						// element the next pop starts from; 0p when none is recorded
+	};
+
+	static inline void ?{}(mpsc_queue(T) & this) {
+		((mcs_queue(T)&)this){};
+		this.head = 0p;
+	}
+
+	static inline forall(| { T * volatile & ?`next ( T * ); })
+	{
+		// Adds a new element to the queue; returns the previous tail (0p if the queue was empty)
+		// Multi-Thread Safe, Lock-Free
+		T * push(mpsc_queue(T) & this, T * elem) __attribute__((artificial));
+		T * push(mpsc_queue(T) & this, T * elem) {
+			T * prev = push((mcs_queue(T)&)this, elem);
+			if (!prev) this.head = elem;				// no predecessor: elem is also the head
+			return prev;
+		}
+
+		// Pop an element from the queue
+		// return the element that was removed, or 0p if head is empty (next is then left untouched)
+		// next is set to the new head of the queue (0p if the removed element was the last one)
+		// NOT Multi-Thread Safe
+		T * pop(mpsc_queue(T) & this, T *& next) __attribute__((artificial));
+		T * pop(mpsc_queue(T) & this, T *& next) {
+			T * elem = this.head;
+			// If head is empty just return
+			if (!elem) return 0p;
+
+			// If there is already someone in the list, then it's easy
+			if (elem`next) {
+				this.head = next = elem`next;
+				// force memory sync
+				__atomic_thread_fence(__ATOMIC_SEQ_CST);
+
+				// invalidate link to reset to initial state
+				elem`next = 0p;
+			}
+			// Otherwise the list only looks like it ends here: a push may be in the middle of enqueuing
+			else {
+				// null out head here, before calling advance: we linearize with push
+				// at the CAS in advance, so a write to head made after that point
+				// could overwrite the write made in push
+				this.head = 0p;
+				next = advance((mcs_queue(T)&)this, elem);
+
+				// Only write to the head if there is a next element
+				// it is the only way we can guarantee we are not overwriting
+				// a write made in push
+				if (next) this.head = next;
+			}
+
+			// return removed element
+			return elem;
+		}
+
+		// Pop without reporting the new head: forwards to the two-argument pop and discards its out-parameter
+		T * pop(mpsc_queue(T) & this) {
+			T * ignored_next = 0p;
+			return pop(this, ignored_next);
+		}
+	}
+
+	//------------------------------------------------------------
+	// Queue based on the MCS lock with poisoning.
+	// It is a Multi-Producer/Single-Consumer queue; threads pushing
+	// elements must hold on to the elements they push.
+	// Not appropriate for an async message queue, for example.
+	// Poisoning the queue prevents any new elements from being pushed.
+	// enum(void*) poison_state {
+	// 	EMPTY = 0p,
+	// 	POISON = 1p,
+	// 	IN_PROGRESS = 1p
+	// };
+
+	struct poison_list {
+		T * volatile head;
+	};
+
+	static inline void ?{}(poison_list(T) & this) { this.head = 0p; }
+	static inline bool is_poisoned( const poison_list(T) & this ) { return 1p == this.head; }
+
+ 	static inline forall(| { T * volatile & ?`next ( T * ); })
+	{
+		// Adds an element to the list; returns true if it was added, false if the list is poisoned
+		// Multi-Thread Safe, Lock-Free
+		bool push(poison_list(T) & this, T * elem) __attribute__((artificial));
+		bool push(poison_list(T) & this, T * elem) {
+			/* paranoid */ verify(0p == (elem`next));
+			__atomic_store_n( &elem`next, (T*)1p, __ATOMIC_RELAXED );	// 1p marks the link as not yet filled in
+
+			// read the head up-front
+			T * expected = this.head;
+			for() {
+				// check if it's poisoned
+				if(expected == 1p) return false;
+
+				// try to CAS the elem in
+				if(__atomic_compare_exchange_n(&this.head, &expected, elem, true, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED)) {
+					// We managed to exchange in, we are done
+
+					// The CAS must never succeed on a poisoned list, and elem's next must still be the 1p marker.
+					/* paranoid */ verify( expected  != 1p );
+					/* paranoid */ verify( elem`next == 1p );
+
+					// Replace the 1p marker with the previous head (possibly 0p),
+					// which releases anyone spinning on this link in advance
+					elem`next = expected;
+					return true;
+				}
+			}
+		}
+
+		// Returns the element linked after elem (possibly 0p).
+		// Spins while elem's next link still holds the 1p marker written at the start of push.
+		// NOT Multi-Thread Safe, concurrent pushes are safe
+		T * advance(T * elem) __attribute__((artificial));
+		T * advance(T * elem) {
+			T * ret;
+
+			// Wait for next item to show-up, filled by push
+			while (1p == (ret = __atomic_load_n(&elem`next, __ATOMIC_RELAXED))) Pause();
+
+			return ret;
+		}
+
+		// Poison the queue, preventing new pushes and returning the previous head
+		T * poison(poison_list(T) & this) __attribute__((artificial));
+		T * poison(poison_list(T) & this) {
+			T * ret = __atomic_exchange_n( &this.head, (T*)1p, __ATOMIC_SEQ_CST );	// 1p is the value push treats as poisoned
+			/* paranoid */ verifyf( ret != (T*)1p, "Poison list %p poisoned more than once!", &this );
+			return ret;
+		}
+	}
+}
+
+forall( T & )
+union Link {
+	struct {											// 32/64-bit x 2
+		T * volatile top;								// pointer to stack top
+		uintptr_t count;								// count each push
+	};
+	#if __SIZEOF_INT128__ == 16
+	__int128											// gcc, 128-bit integer
+	#else
+	uint64_t											// 64-bit integer
+	#endif // __SIZEOF_INT128__ == 16
+	atom;												// shares storage with top and count, viewed as one integer
+}; // Link
+
+forall( T | sized(T) | { Link(T) * ?`next( T * ); } ) {
+	struct StackLF {
+		Link(T) stack;
+	}; // StackLF
+
+	static inline {
+		void ?{}( StackLF(T) & this ) with(this) { stack.atom = 0; }
+
+		T * top( StackLF(T) & this ) with(this) { return stack.top; }
+
+		void push( StackLF(T) & this, T & n ) with(this) {
+			*( &n )`next = stack;						// atomic assignment unnecessary, or use CAA
+			for () {									// busy wait
+			  if ( __atomic_compare_exchange_n( &stack.atom, &( &n )`next->atom, (Link(T))@{ {&n, ( &n )`next->count + 1} }.atom, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST ) ) break; // attempt to update top node
+			} // for
+		} // push
+
+		T * pop( StackLF(T) & this ) with(this) {
+			Link(T) t @= stack;							// atomic assignment unnecessary, or use CAA
+			for () {									// busy wait
+			  if ( t.top == 0p ) return 0p;				// empty stack ?
+			  if ( __atomic_compare_exchange_n( &stack.atom, &t.atom, (Link(T))@{ {( t.top )`next->top, t.count} }.atom, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST ) ) return t.top; // attempt to update top node
+			} // for
+		} // pop
+
+		bool unsafe_remove( StackLF(T) & this, T * node ) with(this) {	// unlink node; true if found, false if the end is reached first
+			Link(T) * link = &stack;				// link currently being examined, starting at the stack top
+			for() {
+				T * next = link->top;
+				if( next == node ) {
+					link->top = ( node )`next->top;	// bypass node with a plain store, no compare-and-swap
+					return true;
+				}
+				if( next == 0p ) return false;		// reached the end without finding node
+				link = ( next )`next;
+			}
+		}
+	} // distribution
+} // distribution
Index: libcfa/src/containers/queueLockFree.hfa
===================================================================
--- libcfa/src/containers/queueLockFree.hfa	(revision b77f0e1fb94f6a4a2617000cc28b1371637e1fb8)
+++ 	(revision )
@@ -1,125 +1,0 @@
-#pragma once
-
-#include <assert.h>
-
-#include <bits/defs.hfa>
-
-forall( T &) {
-	//------------------------------------------------------------
-	// Queue based on the MCS lock
-	// It is a Multi-Producer/Single-Consumer queue threads pushing
-	// elements must hold on to the elements they push
-	// Not appropriate for an async message queue for example,
-	struct mcs_queue {
-		T * volatile tail;
-	};
-
-	static inline void ?{}(mcs_queue(T) & this) { this.tail = 0p; }
-	static inline bool empty(const mcs_queue(T) & this) { return !this.tail; }
-
- 	static inline forall(| { T * volatile & ?`next ( T * ); })
-	{
-		// Adds an element to the list
-		// Multi-Thread Safe, Lock-Free
-		T * push(mcs_queue(T) & this, T * elem) __attribute__((artificial));
-		T * push(mcs_queue(T) & this, T * elem) {
-			/* paranoid */ verify(!(elem`next));
-			// Race to add to the tail
-			T * prev = __atomic_exchange_n(&this.tail, elem, __ATOMIC_SEQ_CST);
-			// If we aren't the first, we need to tell the person before us
-			// No need to
-			if (prev) prev`next = elem;
-			return prev;
-		}
-
-		// Advances the head of the list, dropping the element given.
-		// Passing an element that is not the head is undefined behavior
-		// NOT Multi-Thread Safe, concurrent pushes are safe
-		T * advance(mcs_queue(T) & this, T * elem) __attribute__((artificial));
-		T * advance(mcs_queue(T) & this, T * elem) {
-			T * expected = elem;
-			// Check if this is already the last item
-			if (__atomic_compare_exchange_n(&this.tail, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) return 0p;
-
-			// If not wait for next item to show-up, filled by push
-			while (!(elem`next)) Pause();
-
-			// we need to return if the next link was empty
-			T * ret = elem`next;
-
-			// invalidate link to reset to initial state
-			elem`next = 0p;
-			return ret;
-		}
-	}
-
-	//------------------------------------------------------------
-	// Queue based on the MCS lock
-	// Extension of the above lock which supports 'blind' pops.
-	// i.e., popping a value from the head without knowing what the head is
-	// has no extra guarantees beyond the mcs_queue
-	struct mpsc_queue {
-		inline mcs_queue(T);
-		T * volatile head;
-	};
-
-	static inline void ?{}(mpsc_queue(T) & this) {
-		((mcs_queue(T)&)this){};
-		this.head = 0p;
-	}
-
-	static inline forall(| { T * volatile & ?`next ( T * ); })
-	{
-		// Added a new element to the queue
-		// Multi-Thread Safe, Lock-Free
-		T * push(mpsc_queue(T) & this, T * elem) __attribute__((artificial));
-		T * push(mpsc_queue(T) & this, T * elem) {
-			T * prev = push((mcs_queue(T)&)this, elem);
-			if (!prev) this.head = elem;
-			return prev;
-		}
-
-		// Pop an element from the queue
-		// return the element that was removed
-		// next is set to the new head of the queue
-		// NOT Multi-Thread Safe
-		T * pop(mpsc_queue(T) & this, T *& next) __attribute__((artificial));
-		T * pop(mpsc_queue(T) & this, T *& next) {
-			T * elem = this.head;
-			// If head is empty just return
-			if (!elem) return 0p;
-
-			// If there is already someone in the list, then it's easy
-			if (elem`next) {
-				this.head = next = elem`next;
-				// force memory sync
-				__atomic_thread_fence(__ATOMIC_SEQ_CST);
-
-				// invalidate link to reset to initial state
-				elem`next = 0p;
-			}
-			// Otherwise, there might be a race where it only looks but someone is enqueuing
-			else {
-				// null out head here, because we linearize with push
-				// at the CAS in advance and therefore can write to head
-				// after that point, it could overwrite the write in push
-				this.head = 0p;
-				next = advance((mcs_queue(T)&)this, elem);
-
-				// Only write to the head if there is a next element
-				// it is the only way we can guarantee we are not overwriting
-				// a write made in push
-				if (next) this.head = next;
-			}
-
-			// return removed element
-			return elem;
-		}
-
-		// Same as previous function
-		T * pop(mpsc_queue(T) & this) {
-			T * _ = 0p;
-			return pop(this, _);
-		}
-	}
-}
Index: libcfa/src/containers/stackLockFree.hfa
===================================================================
--- libcfa/src/containers/stackLockFree.hfa	(revision b77f0e1fb94f6a4a2617000cc28b1371637e1fb8)
+++ 	(revision )
@@ -1,76 +1,0 @@
-//
-// Cforall Version 1.0.0 Copyright (C) 2017 University of Waterloo
-// The contents of this file are covered under the licence agreement in the
-// file "LICENCE" distributed with Cforall.
-//
-// stackLockFree.hfa --
-//
-// Author           : Peter A. Buhr
-// Created On       : Wed May 13 20:58:58 2020
-// Last Modified By : Peter A. Buhr
-// Last Modified On : Wed Jan 20 20:40:03 2021
-// Update Count     : 67
-//
-
-#pragma once
-
-#include <stdint.h>
-
-forall( T & )
-union Link {
-	struct {											// 32/64-bit x 2
-		T * volatile top;								// pointer to stack top
-		uintptr_t count;								// count each push
-	};
-	#if __SIZEOF_INT128__ == 16
-	__int128											// gcc, 128-bit integer
-	#else
-	uint64_t											// 64-bit integer
-	#endif // __SIZEOF_INT128__ == 16
-	atom;
-}; // Link
-
-forall( T | sized(T) | { Link(T) * ?`next( T * ); } ) {
-	struct StackLF {
-		Link(T) stack;
-	}; // StackLF
-
-	static inline {
-		void ?{}( StackLF(T) & this ) with(this) { stack.atom = 0; }
-
-		T * top( StackLF(T) & this ) with(this) { return stack.top; }
-
-		void push( StackLF(T) & this, T & n ) with(this) {
-			*( &n )`next = stack;						// atomic assignment unnecessary, or use CAA
-			for () {									// busy wait
-			  if ( __atomic_compare_exchange_n( &stack.atom, &( &n )`next->atom, (Link(T))@{ {&n, ( &n )`next->count + 1} }.atom, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST ) ) break; // attempt to update top node
-			} // for
-		} // push
-
-		T * pop( StackLF(T) & this ) with(this) {
-			Link(T) t @= stack;							// atomic assignment unnecessary, or use CAA
-			for () {									// busy wait
-			  if ( t.top == 0p ) return 0p;				// empty stack ?
-			  if ( __atomic_compare_exchange_n( &stack.atom, &t.atom, (Link(T))@{ {( t.top )`next->top, t.count} }.atom, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST ) ) return t.top; // attempt to update top node
-			} // for
-		} // pop
-
-		bool unsafe_remove( StackLF(T) & this, T * node ) with(this) {
-			Link(T) * link = &stack;
-			for() {
-				T * next = link->top;
-				if( next == node ) {
-					link->top = ( node )`next->top;
-					return true;
-				}
-				if( next == 0p ) return false;
-				link = ( next )`next;
-			}
-		}
-	} // distribution
-} // distribution
-
-
-// Local Variables: //
-// tab-width: 4 //
-// End: //
Index: libcfa/src/heap.cfa
===================================================================
--- libcfa/src/heap.cfa	(revision b77f0e1fb94f6a4a2617000cc28b1371637e1fb8)
+++ libcfa/src/heap.cfa	(revision 63be33872128e89a0dc1fe9bdb8448d00934a453)
@@ -10,6 +10,6 @@
 // Created On       : Tue Dec 19 21:58:35 2017
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Thu Oct 13 22:21:52 2022
-// Update Count     : 1557
+// Last Modified On : Sun Oct 30 20:56:20 2022
+// Update Count     : 1584
 //
 
@@ -43,6 +43,9 @@
 
 #define FASTLOOKUP										// use O(1) table lookup from allocation size to bucket size
-#define RETURNSPIN										// toggle spinlock / lockfree stack
 #define OWNERSHIP										// return freed memory to owner thread
+#define RETURNSPIN										// toggle spinlock / lockfree queue
+#if ! defined( OWNERSHIP ) && defined( RETURNSPIN )
+#warning "RETURNSPIN is ignored without OWNERSHIP; suggest commenting out RETURNSPIN"
+#endif // ! OWNERSHIP && RETURNSPIN
 
 #define CACHE_ALIGN 64
@@ -109,5 +112,21 @@
 
 
-//######################### Spin Lock #########################
+//######################### Helpers #########################
+
+
+// The generic Bsearchl does not inline, so this hand-coded binary search stands in for it. Given vals[0..dim)
+// sorted in ascending order, the result is the index of the first element that is not less than key, or dim
+// when every element is less than key.
+inline __attribute__((always_inline))
+static size_t Bsearchl( unsigned int key, const unsigned int vals[], size_t dim ) {
+	size_t lo = 0;										// lowest index still a candidate
+	size_t hi = dim;									// highest candidate; dim means no element qualifies
+	while ( lo < hi ) {
+		size_t mid = (lo + hi) / 2;
+		if ( (unsigned int &)(vals[mid]) < key ) lo = mid + 1; // cast away const
+		else hi = mid;
+	} // while
+	return lo;
+} // Bsearchl
 
 
@@ -206,14 +225,4 @@
 
 
-#define SPINLOCK 0
-#define LOCKFREE 1
-#define BUCKETLOCK SPINLOCK
-#if BUCKETLOCK == SPINLOCK
-#elif BUCKETLOCK == LOCKFREE
-#include <stackLockFree.hfa>
-#else
-	#error undefined lock type for bucket lock
-#endif // LOCKFREE
-
 // Recursive definitions: HeapManager needs size of bucket array and bucket area needs sizeof HeapManager storage.
 // Break recursion by hardcoding number of buckets and statically checking number is correct after bucket array defined.
@@ -232,13 +241,8 @@
 								void * home;			// allocated block points back to home locations (must overlay alignment)
 								size_t blockSize;		// size for munmap (must overlay alignment)
-								#if BUCKETLOCK == SPINLOCK
 								Storage * next;			// freed block points to next freed block of same size
-								#endif // SPINLOCK
 							};
 							size_t size;				// allocation size in bytes
 						};
-						#if BUCKETLOCK == LOCKFREE
-						Link(Storage) next;				// freed block points next freed block of same size (double-wide)
-						#endif // LOCKFREE
 					};
 				} real; // RealHeader
@@ -259,5 +263,4 @@
 	struct __attribute__(( aligned (8) )) FreeHeader {
 		size_t blockSize __attribute__(( aligned(8) )); // size of allocations on this list
-		#if BUCKETLOCK == SPINLOCK
 		#ifdef OWNERSHIP
 		#ifdef RETURNSPIN
@@ -266,8 +269,6 @@
 		Storage * returnList;							// other thread return list
 		#endif // OWNERSHIP
+
 		Storage * freeList;								// thread free list
-		#else
-		StackLF(Storage) freeList;
-		#endif // BUCKETLOCK
 		Heap * homeManager;								// heap owner (free storage to bucket, from bucket to heap)
 	}; // FreeHeader
@@ -290,13 +291,4 @@
 	#endif // __STATISTICS__
 }; // Heap
-
-#if BUCKETLOCK == LOCKFREE
-inline __attribute__((always_inline))
-static {
-	Link(Heap.Storage) * ?`next( Heap.Storage * this ) { return &this->header.kind.real.next; }
-	void ?{}( Heap.FreeHeader & ) {}
-	void ^?{}( Heap.FreeHeader & ) {}
-} // distribution
-#endif // LOCKFREE
 
 
@@ -385,20 +377,4 @@
 
 
-// generic Bsearchl does not inline, so substitute with hand-coded binary-search.
-inline __attribute__((always_inline))
-static size_t Bsearchl( unsigned int key, const unsigned int vals[], size_t dim ) {
-	size_t l = 0, m, h = dim;
-	while ( l < h ) {
-		m = (l + h) / 2;
-		if ( (unsigned int &)(vals[m]) < key ) {		// cast away const
-			l = m + 1;
-		} else {
-			h = m;
-		} // if
-	} // while
-	return l;
-} // Bsearchl
-
-
 void heapMasterCtor() with( heapMaster ) {
 	// Singleton pattern to initialize heap master
@@ -409,6 +385,6 @@
 	__map_prot = PROT_READ | PROT_WRITE | PROT_EXEC;
 
-	?{}( extLock );
-	?{}( mgrLock );
+	extLock = 0;
+	mgrLock = 0;
 
 	char * end = (char *)sbrk( 0 );
@@ -497,13 +473,14 @@
 				#ifdef OWNERSHIP
 				#ifdef RETURNSPIN
-				?{}( freeLists[j].returnLock );
+				freeLists[j].returnLock = 0;
+				freeLists[j].returnList = 0p;
 				#endif // RETURNSPIN
-				freeLists[j].returnList = 0p;
 				#endif // OWNERSHIP
+
 				freeLists[j].freeList = 0p;
 				freeLists[j].homeManager = heap;
 				freeLists[j].blockSize = bucketSizes[j];
 			} // for
-	
+
 			heapBuffer = 0p;
 			heapReserve = 0;
@@ -522,5 +499,5 @@
 	if ( unlikely( ! heapMasterBootFlag ) ) heapMasterCtor();
 
-	lock( heapMaster.mgrLock );		// protect heapMaster counters
+	lock( heapMaster.mgrLock );							// protect heapMaster counters
 
 	// get storage for heap manager
@@ -710,4 +687,5 @@
 	// find the closest bucket size less than or equal to the mmapStart size
 	maxBucketsUsed = Bsearchl( mmapStart, bucketSizes, NoBucketSizes ); // binary search
+
 	verify( maxBucketsUsed < NoBucketSizes );			// subscript failure ?
 	verify( mmapStart <= bucketSizes[maxBucketsUsed] ); // search failure ?
@@ -832,8 +810,7 @@
 
 		size_t increase = ceiling2( size > heapExpand ? size : heapExpand, libAlign() );
-		// Do not call abort or strerror( errno ) as they may call malloc.
 		if ( unlikely( sbrk( increase ) == (void *)-1 ) ) {	// failed, no memory ?
 			unlock( extLock );
-			abort( NO_MEMORY_MSG, size );				// no memory
+			abort( NO_MEMORY_MSG, size );				// give up
 		} // if
 
@@ -971,26 +948,28 @@
 		#endif // __STATISTICS__
 
-		// Spin until the lock is acquired for this particular size of block.
-
-		#if BUCKETLOCK == SPINLOCK
 		block = freeHead->freeList;						// remove node from stack
-		#else
-		block = pop( freeHead->freeList );
-		#endif // BUCKETLOCK
 		if ( unlikely( block == 0p ) ) {				// no free block ?
+			// Freelist for this size is empty, so check return list (OWNERSHIP), carve it out of the heap, if there
+			// is enough left, or get some more heap storage and carve it off.
 			#ifdef OWNERSHIP
-			// Freelist for that size is empty, so carve it out of the heap, if there is enough left, or get some more
-			// and then carve it off.
-			#ifdef RETURNSPIN
-			#if BUCKETLOCK == SPINLOCK
-			lock( freeHead->returnLock );
-			block = freeHead->returnList;
-			freeHead->returnList = 0p;
-			unlock( freeHead->returnLock );
-			#else
-			block = __atomic_exchange_n( &freeHead->returnList, nullptr, __ATOMIC_SEQ_CST );
-			#endif // RETURNSPIN
-
-			if ( likely( block == 0p ) ) {			// return list also empty?
+			if ( unlikely( freeHead->returnList ) ) {	// race, get next time if lose race
+				#ifdef RETURNSPIN
+				lock( freeHead->returnLock );
+				block = freeHead->returnList;
+				freeHead->returnList = 0p;
+				unlock( freeHead->returnLock );
+				#else
+				block = __atomic_exchange_n( &freeHead->returnList, 0p, __ATOMIC_SEQ_CST );
+				#endif // RETURNSPIN
+
+				verify( block );
+				#ifdef __STATISTICS__
+				stats.return_pulls += 1;
+				#endif // __STATISTICS__
+
+				// OK TO BE PREEMPTED HERE AS heapManager IS NO LONGER ACCESSED.
+
+				freeHead->freeList = block->header.kind.real.next; // merge returnList into freeHead
+			} else {
 			#endif // OWNERSHIP
 				// Do not leave kernel thread as manager_extend accesses heapManager.
@@ -1002,17 +981,8 @@
 
 				#ifdef __CFA_DEBUG__
-				// Scrub new memory so subsequent uninitialized usages might fail. Only scrub the first 1024 bytes.
+				// Scrub new memory so subsequent uninitialized usages might fail. Only scrub the first SCRUB_SIZE bytes.
 				memset( block->data, SCRUB, min( SCRUB_SIZE, tsize - sizeof(Heap.Storage) ) );
 				#endif // __CFA_DEBUG__
-			#endif // BUCKETLOCK
 			#ifdef OWNERSHIP
-			} else {									// merge returnList into freeHead
-				#ifdef __STATISTICS__
-				stats.return_pulls += 1;
-				#endif // __STATISTICS__
-
-				// OK TO BE PREEMPTED HERE AS heapManager IS NO LONGER ACCESSED.
-
-				freeHead->freeList = block->header.kind.real.next;
 			} // if
 			#endif // OWNERSHIP
@@ -1026,4 +996,5 @@
   if ( unlikely( size > ULONG_MAX - __page_size ) ) return 0p;
 		tsize = ceiling2( tsize, __page_size );			// must be multiple of page size
+
 		#ifdef __STATISTICS__
 		stats.counters[STAT_NAME].alloc += tsize;
@@ -1042,11 +1013,12 @@
 			if ( errno == ENOMEM ) abort( NO_MEMORY_MSG, tsize ); // no memory
 			// Do not call strerror( errno ) as it may call malloc.
-			abort( "**** Error **** attempt to allocate large object (> %zu) of size %zu bytes and mmap failed with errno %d.", size, heapMaster.mmapStart, errno );
+			abort( "**** Error **** attempt to allocate large object (> %zu) of size %zu bytes and mmap failed with errno %d.",
+				   size, heapMaster.mmapStart, errno );
 		} // if
 		block->header.kind.real.blockSize = MarkMmappedBit( tsize ); // storage size for munmap
 
 		#ifdef __CFA_DEBUG__
-		// Scrub new memory so subsequent uninitialized usages might fail. Only scrub the first 1024 bytes.  The rest of
-		// the storage set to 0 by mmap.
+		// Scrub new memory so subsequent uninitialized usages might fail. Only scrub the first SCRUB_SIZE bytes. The
+		// rest of the storage set to 0 by mmap.
 		memset( block->data, SCRUB, min( SCRUB_SIZE, tsize - sizeof(Heap.Storage) ) );
 		#endif // __CFA_DEBUG__
@@ -1126,4 +1098,5 @@
 		#endif // __CFA_DEBUG__
 
+		#ifdef OWNERSHIP
 		if ( likely( heapManager == freeHead->homeManager ) ) { // belongs to this thread
 			header->kind.real.next = freeHead->freeList; // push on stack
@@ -1132,5 +1105,4 @@
 			verify( heapManager );
 
-			#ifdef OWNERSHIP
 			#ifdef RETURNSPIN
 			lock( freeHead->returnLock );
@@ -1141,23 +1113,24 @@
 			header->kind.real.next = freeHead->returnList; // link new node to top node
 			// CAS resets header->kind.real.next = freeHead->returnList on failure
-			while ( ! __atomic_compare_exchange_n( &freeHead->returnList, &header->kind.real.next, header,
+			while ( ! __atomic_compare_exchange_n( &freeHead->returnList, &header->kind.real.next, (Heap.Storage *)header,
 												   false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST ) );
 			#endif // RETURNSPIN
-
-			#else										// no OWNERSHIP
-
-			freeHead = &heap->freeLists[ClearStickyBits( header->kind.real.home ) - &freeHead->homeManager->freeLists[0]];
-			header->kind.real.next = freeHead->freeList; // push on stack
-			freeHead->freeList = (Heap.Storage *)header;
-			#endif // ! OWNERSHIP
-
-			#ifdef __U_STATISTICS__
-			stats.return_pushes += 1;
-			stats.return_storage_request += rsize;
-			stats.return_storage_alloc += size;
-			#endif // __U_STATISTICS__
-
-			// OK TO BE PREEMPTED HERE AS heapManager IS NO LONGER ACCESSED.
-		} // if
+		} // if
+
+		#else											// no OWNERSHIP
+
+		// kind.real.home is address in owner thread's freeLists, so compute the equivalent position in this thread's freeList.
+		freeHead = &freeLists[ClearStickyBits( (Heap.FreeHeader *)(header->kind.real.home) ) - &freeHead->homeManager->freeLists[0]];
+		header->kind.real.next = freeHead->freeList;	// push on stack
+		freeHead->freeList = (Heap.Storage *)header;
+		#endif // ! OWNERSHIP
+
+		#ifdef __U_STATISTICS__
+		stats.return_pushes += 1;
+		stats.return_storage_request += rsize;
+		stats.return_storage_alloc += size;
+		#endif // __U_STATISTICS__
+
+		// OK TO BE PREEMPTED HERE AS heapManager IS NO LONGER ACCESSED.
 	} // if
 
@@ -1186,14 +1159,5 @@
 		#endif // __STATISTICS__
 
-		#if BUCKETLOCK == SPINLOCK
 		for ( Heap.Storage * p = freeLists[i].freeList; p != 0p; p = p->header.kind.real.next ) {
-		#else
-			for(;;) {
-//		for ( Heap.Storage * p = top( freeLists[i].freeList ); p != 0p; p = (p)`next->top ) {
-//		for ( Heap.Storage * p = top( freeLists[i].freeList ); p != 0p; /* p = getNext( p )->top */) {
-//			Heap.Storage * temp = p->header.kind.real.next.top; // FIX ME: direct assignent fails, initialization works`
-//			typeof(p) temp = (( p )`next)->top;			// FIX ME: direct assignent fails, initialization works`
-//			p = temp;
-		#endif // BUCKETLOCK
 			total += size;
 			#ifdef __STATISTICS__
Index: libcfa/src/interpose.cfa
===================================================================
--- libcfa/src/interpose.cfa	(revision b77f0e1fb94f6a4a2617000cc28b1371637e1fb8)
+++ libcfa/src/interpose.cfa	(revision 63be33872128e89a0dc1fe9bdb8448d00934a453)
@@ -42,8 +42,30 @@
 
 typedef void (* generic_fptr_t)(void);
+static generic_fptr_t do_interpose_symbol( void * library, const char symbol[], const char version[] ) {
+	// Look up symbol in library (a specific version when version is non-null and _GNU_SOURCE is defined) and
+	// return it as a generic function pointer. Aborts if the dynamic loader reports an error for this lookup.
+	union { generic_fptr_t fptr; void * ptr; } originalFunc; // converts the object pointer from dlsym to a function pointer
+	dlerror();											// discard any stale error so the check below reflects this lookup only
+	#if defined( _GNU_SOURCE )
+		if ( version ) {
+			originalFunc.ptr = dlvsym( library, symbol, version );
+		} else {
+			originalFunc.ptr = dlsym( library, symbol );
+		}
+	#else
+		originalFunc.ptr = dlsym( library, symbol );
+	#endif // _GNU_SOURCE
+
+	const char * error = dlerror();						// non-null only if the lookup above failed
+	if ( error ) abort( "interpose_symbol : internal error, %s\n", error );
+
+	return originalFunc.fptr;
+}
+
 static generic_fptr_t interpose_symbol( const char symbol[], const char version[] ) {
 	const char * error;
 
 	static void * library;
+	static void * pthread_library;
 	if ( ! library ) {
 		#if defined( RTLD_NEXT )
@@ -58,21 +80,18 @@
 		#endif
 	} // if
-
-	union { generic_fptr_t fptr; void * ptr; } originalFunc;
-
-	#if defined( _GNU_SOURCE )
-		if ( version ) {
-			originalFunc.ptr = dlvsym( library, symbol, version );
-		} else {
-			originalFunc.ptr = dlsym( library, symbol );
-		}
-	#else
-		originalFunc.ptr = dlsym( library, symbol );
-	#endif // _GNU_SOURCE
-
-	error = dlerror();
-	if ( error ) abort( "interpose_symbol : internal error, %s\n", error );
-
-	return originalFunc.fptr;
+	if ( ! pthread_library ) {
+		#if defined( RTLD_NEXT )
+			pthread_library = RTLD_NEXT;
+		#else
+			// missing RTLD_NEXT => must hard-code library name, assuming libpthread
+			pthread_library = dlopen( "libpthread.so", RTLD_LAZY );
+			error = dlerror();
+			if ( error ) {
+				abort( "interpose_symbol : failed to open libpthread, %s\n", error );
+			}
+		#endif
+	} // if
+
+	return do_interpose_symbol(library, symbol, version);
 }
 
@@ -97,4 +116,5 @@
 
 extern "C" {
+	void __cfathreadabi_interpose_startup( generic_fptr_t (*do_interpose_symbol)( void * library, const char symbol[], const char version[] ) ) __attribute__((weak));
 	void __cfaabi_interpose_startup( void ) {
 		const char *version = 0p;
@@ -108,4 +128,6 @@
 		INTERPOSE_LIBC( exit , version );
 #pragma GCC diagnostic pop
+
+		if(__cfathreadabi_interpose_startup) __cfathreadabi_interpose_startup( do_interpose_symbol );
 
 		// As a precaution (and necessity), errors that result in termination are delivered on a separate stack because
Index: libcfa/src/interpose_thread.cfa
===================================================================
--- libcfa/src/interpose_thread.cfa	(revision 63be33872128e89a0dc1fe9bdb8448d00934a453)
+++ libcfa/src/interpose_thread.cfa	(revision 63be33872128e89a0dc1fe9bdb8448d00934a453)
@@ -0,0 +1,137 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2022 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// interpose_thread.cfa --
+//
+// Author           : Thierry Delisle
+// Created On       : Wed Sep 21 11:55:16 2022
+// Last Modified By :
+// Last Modified On :
+// Update Count     :
+//
+
+#include <stdarg.h>										// va_start, va_end
+#include <stdio.h>
+#include <string.h>										// strlen
+#include <signal.h>
+#include <pthread.h>
+extern "C" {
+#include <dlfcn.h>										// dlopen, dlsym
+#include <execinfo.h>									// backtrace, messages
+}
+
+#include "bits/debug.hfa"
+#include "bits/defs.hfa"
+#include <assert.h>
+
+//=============================================================================================
+// Interposing helpers
+//=============================================================================================
+
+typedef void (* generic_fptr_t)(void);
+
+generic_fptr_t interpose_symbol(						// resolve symbol via RTLD_NEXT, or via libpthread.so when RTLD_NEXT is unavailable
+	generic_fptr_t (*do_interpose_symbol)( void * library, const char symbol[], const char version[] ), // lookup routine supplied by the caller
+	const char symbol[],								// name of the symbol to resolve
+	const char version[]								// symbol version, handed to do_interpose_symbol unchanged
+) libcfa_public {
+	const char * error;									// only used on the dlopen path below
+
+	static void * library;								// library handle, set on the first call and reused afterwards
+	if ( ! library ) {
+		#if defined( RTLD_NEXT )
+			library = RTLD_NEXT;
+		#else
+			// missing RTLD_NEXT => must hard-code library name, assuming libpthread
+			library = dlopen( "libpthread.so", RTLD_LAZY );
+			error = dlerror();
+			if ( error ) {
+				abort( "interpose_symbol : failed to open libpthread, %s\n", error );
+			}
+		#endif
+	} // if
+
+	return do_interpose_symbol(library, symbol, version); // the actual lookup is delegated to the caller's routine
+}
+
+#define INTERPOSE( x, ver ) __cabi_libpthread.x = (typeof(__cabi_libpthread.x))interpose_symbol( do_interpose_symbol, #x, ver ) // resolve x into its __cabi_libpthread slot; needs a do_interpose_symbol name in scope where expanded
+
+//=============================================================================================
+// Interposition Startup logic
+//=============================================================================================
+
+static struct {											// one function pointer per interposed pthread routine
+	int (*pthread_create)(pthread_t *_thread, const pthread_attr_t *attr, void *(*start_routine) (void *), void *arg);
+	int (*pthread_join)(pthread_t _thread, void **retval);
+	pthread_t (*pthread_self)(void);
+	int (*pthread_attr_init)(pthread_attr_t *attr);
+	int (*pthread_attr_destroy)(pthread_attr_t *attr);
+	int (*pthread_attr_setstack)( pthread_attr_t *attr, void *stackaddr, size_t stacksize );
+	int (*pthread_attr_getstacksize)( const pthread_attr_t *attr, size_t *stacksize );
+	int (*pthread_sigmask)(int how, const sigset_t *set, sigset_t *oldset);
+	int (*pthread_sigqueue)(pthread_t _thread, int sig, const union sigval value);
+	int (*pthread_once)(pthread_once_t *once_control, void (*init_routine)(void));
+} __cabi_libpthread;									// members are assigned by the INTERPOSE calls in __cfathreadabi_interpose_startup
+
+extern "C" {
+	void __cfathreadabi_interpose_startup( generic_fptr_t (*do_interpose_symbol)( void * library, const char symbol[], const char version[] ) ) libcfa_public { // fill every member of __cabi_libpthread using the supplied lookup routine
+		const char *version = 0p;						// null version is passed to every lookup below
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdiscarded-qualifiers"
+		INTERPOSE( pthread_create , version );
+		INTERPOSE( pthread_join , version );
+		INTERPOSE( pthread_self , version );
+		INTERPOSE( pthread_attr_init , version );
+		INTERPOSE( pthread_attr_destroy , version );
+		INTERPOSE( pthread_attr_setstack , version );
+		INTERPOSE( pthread_attr_getstacksize , version );
+		INTERPOSE( pthread_sigmask , version );
+		INTERPOSE( pthread_sigqueue , version );
+		INTERPOSE( pthread_once , version );
+#pragma GCC diagnostic pop
+	} // __cfathreadabi_interpose_startup
+
+	int __cfaabi_pthread_create(pthread_t *_thread, const pthread_attr_t *attr, void *(*start_routine) (void *), void *arg){ // this and each wrapper below forwards its arguments to the matching __cabi_libpthread pointer
+		return __cabi_libpthread.pthread_create(_thread, attr, start_routine, arg);
+	}
+
+	int __cfaabi_pthread_join(pthread_t _thread, void **retval){
+		return __cabi_libpthread.pthread_join(_thread, retval);
+	}
+
+	pthread_t __cfaabi_pthread_self(void){
+		return __cabi_libpthread.pthread_self();
+	}
+
+	int __cfaabi_pthread_attr_init(pthread_attr_t *attr){
+		return __cabi_libpthread.pthread_attr_init(attr);
+	}
+
+	int __cfaabi_pthread_attr_destroy(pthread_attr_t *attr){
+		return __cabi_libpthread.pthread_attr_destroy(attr);
+	}
+
+	int __cfaabi_pthread_attr_setstack( pthread_attr_t *attr, void *stackaddr, size_t stacksize ){
+		return __cabi_libpthread.pthread_attr_setstack(attr, stackaddr, stacksize);
+	}
+
+	int read_pthread_attr_getstacksize( const pthread_attr_t *attr, size_t *stacksize ){ // NOTE(review): name lacks the __cfaabi_ prefix used by the sibling wrappers - check callers before renaming
+		return __cabi_libpthread.pthread_attr_getstacksize(attr, stacksize);
+	}
+
+	int __cfaabi_pthread_sigmask(int how, const sigset_t *set, sigset_t *oldset){
+		return __cabi_libpthread.pthread_sigmask(how, set, oldset);
+	}
+
+	int __cfaabi_pthread_sigqueue(pthread_t _thread, int sig, const union sigval value) {
+		return __cabi_libpthread.pthread_sigqueue(_thread, sig, value);
+	}
+
+	int __cfaabi_pthread_once(pthread_once_t *once_control, void (*init_routine)(void)) {
+		return __cabi_libpthread.pthread_once(once_control, init_routine);
+	}
+} // extern "C"