Index: libcfa/src/concurrency/clib/cfathread.cfa
===================================================================
--- libcfa/src/concurrency/clib/cfathread.cfa	(revision 7e7a07617b5af8b6517de35559e5d9fc1550df1a)
+++ libcfa/src/concurrency/clib/cfathread.cfa	(revision 056cbdbf14fab5f28e8ac202d39bdcead640bd83)
@@ -14,4 +14,6 @@
 //
 
+// #define EPOLL_FOR_SOCKETS
+
 #include "fstream.hfa"
 #include "locks.hfa"
@@ -23,13 +25,202 @@
 #include "cfathread.h"
 
+extern "C" {
+		#include <string.h>
+		#include <errno.h>
+}
+
 extern void ?{}(processor &, const char[], cluster &, thread$ *);
 extern "C" {
       extern void __cfactx_invoke_thread(void (*main)(void *), void * this);
+	extern int accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags);
 }
 
 extern Time __kernel_get_time();
+extern unsigned register_proc_id( void );
 
 //================================================================================
-// Thread run y the C Interface
+// Epoll support for sockets
+
+#if defined(EPOLL_FOR_SOCKETS)
+	extern "C" {
+		#include <sys/epoll.h>
+		#include <sys/resource.h>
+	}
+
+	static pthread_t master_poller;
+	static int master_epollfd = 0;
+	static size_t poller_cnt = 0;
+	static int * poller_fds = 0p;
+	static struct leaf_poller * pollers = 0p;
+
+	struct __attribute__((aligned)) fd_info_t {
+		int pollid;
+		size_t rearms;
+	};
+	rlim_t fd_limit = 0;
+	static fd_info_t * volatile * fd_map = 0p;
+
+	void * master_epoll( __attribute__((unused)) void * args ) {
+		unsigned id = register_proc_id();
+
+		enum { MAX_EVENTS = 5 };
+		struct epoll_event events[MAX_EVENTS];
+		for() {
+			int ret = epoll_wait(master_epollfd, events, MAX_EVENTS, -1);
+			if ( ret < 0 ) {
+				abort | "Master epoll error: " | strerror(errno);
+			}
+
+			for(i; ret) {
+				thread$ * thrd = (thread$ *)events[i].data.u64;
+				unpark( thrd );
+			}
+		}
+
+		return 0p;
+	}
+
+	static inline int epoll_rearm(int epollfd, int fd, uint32_t event) {
+		struct epoll_event eevent;
+		eevent.events = event | EPOLLET | EPOLLONESHOT;
+		eevent.data.u64 = (uint64_t)active_thread();
+
+		if(0 != epoll_ctl(epollfd, EPOLL_CTL_MOD, fd, &eevent))
+		{
+			if(errno == ENOENT) return -1;
+			abort | acquire | "epoll" | epollfd | "ctl rearm" | fd | "error: " | errno | strerror(errno);
+		}
+
+		park();
+		return 0;
+	}
+
+	thread leaf_poller {
+		int epollfd;
+	};
+
+	void ?{}(leaf_poller & this, int fd) { this.epollfd = fd; }
+
+	void main(leaf_poller & this) {
+		enum { MAX_EVENTS = 1024 };
+		struct epoll_event events[MAX_EVENTS];
+		const int max_retries = 5;
+		int retries = max_retries;
+
+		struct epoll_event event;
+		event.events = EPOLLIN | EPOLLET | EPOLLONESHOT;
+		event.data.u64 = (uint64_t)&(thread&)this;
+
+		if(0 != epoll_ctl(master_epollfd, EPOLL_CTL_ADD, this.epollfd, &event))
+		{
+			abort | "master epoll ctl add leaf: " | errno | strerror(errno);
+		}
+
+		park();
+
+		for() {
+			yield();
+			int ret = epoll_wait(this.epollfd, events, MAX_EVENTS, 0);
+			if ( ret < 0 ) {
+				abort | "Leaf epoll error: " | errno | strerror(errno);
+			}
+
+			if(ret) {
+				for(i; ret) {
+					thread$ * thrd = (thread$ *)events[i].data.u64;
+					unpark( thrd, UNPARK_REMOTE );
+				}
+			}
+			else if(0 >= --retries) {
+				epoll_rearm(master_epollfd, this.epollfd, EPOLLIN);
+			}
+		}
+	}
+
+	void setup_epoll( void ) __attribute__(( constructor ));
+	void setup_epoll( void ) {
+		if(master_epollfd) abort | "Master epoll already setup";
+
+		master_epollfd = epoll_create1(0);
+		if(master_epollfd == -1) {
+			abort | "failed to create master epoll: " | errno | strerror(errno);
+		}
+
+		struct rlimit rlim;
+		if(int ret = getrlimit(RLIMIT_NOFILE, &rlim); 0 != ret) {
+			abort | "failed to get nofile limit: " | errno | strerror(errno);
+		}
+
+		fd_limit = rlim.rlim_cur;
+		fd_map = alloc(fd_limit);
+		for(i;fd_limit) {
+			fd_map[i] = 0p;
+		}
+
+		poller_cnt = 2;
+		poller_fds = alloc(poller_cnt);
+		pollers    = alloc(poller_cnt);
+		for(i; poller_cnt) {
+			poller_fds[i] = epoll_create1(0);
+			if(poller_fds[i] == -1) {
+				abort | "failed to create leaf epoll [" | i | "]: " | errno | strerror(errno);
+			}
+
+			(pollers[i]){ poller_fds[i] };
+		}
+
+		pthread_attr_t attr;
+		if (int ret = pthread_attr_init(&attr); 0 != ret) {
+			abort | "failed to create master epoll thread attr: " | ret | strerror(ret);
+		}
+
+		if (int ret = pthread_create(&master_poller, &attr, master_epoll, 0p); 0 != ret) {
+			abort | "failed to create master epoll thread: " | ret | strerror(ret);
+		}
+	}
+
+	static inline int epoll_wait(int fd, uint32_t event) {
+		if(fd_map[fd] >= 1p) {
+			fd_map[fd]->rearms++;
+			epoll_rearm(poller_fds[fd_map[fd]->pollid], fd, event);
+			return 0;
+		}
+
+		for() {
+			fd_info_t * expected = 0p;
+			fd_info_t * sentinel = 1p;
+			if(__atomic_compare_exchange_n( &(fd_map[fd]), &expected, sentinel, true, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED)) {
+				struct epoll_event eevent;
+				eevent.events = event | EPOLLET | EPOLLONESHOT;
+				eevent.data.u64 = (uint64_t)active_thread();
+
+				int id = thread_rand() % poller_cnt;
+				if(0 != epoll_ctl(poller_fds[id], EPOLL_CTL_ADD, fd, &eevent))
+				{
+					abort | "epoll ctl add" | poller_fds[id] | fd | fd_map[fd] | expected | "error: " | errno | strerror(errno);
+				}
+
+				fd_info_t * ninfo = alloc();
+				ninfo->pollid = id;
+				ninfo->rearms = 0;
+				__atomic_store_n( &fd_map[fd], ninfo, __ATOMIC_SEQ_CST);
+
+				park();
+				return 0;
+			}
+
+			if(expected > 1p) {
+				fd_map[fd]->rearms++;
+				epoll_rearm(poller_fds[fd_map[fd]->pollid], fd, event);
+				return 0;
+			}
+
+			Pause();
+		}
+	}
+#endif
+
+//================================================================================
+// Thread run by the C Interface
 
 struct cfathread_object {
@@ -245,5 +436,5 @@
 	// Mutex
 	struct cfathread_mutex {
-		fast_lock impl;
+		linear_backoff_then_block_lock impl;
 	};
 	int cfathread_mutex_init(cfathread_mutex_t *restrict mut, const cfathread_mutexattr_t *restrict) __attribute__((nonnull (1))) { *mut = new(); return 0; }
@@ -260,5 +451,5 @@
 	// Condition
 	struct cfathread_condition {
-		condition_variable(fast_lock) impl;
+		condition_variable(linear_backoff_then_block_lock) impl;
 	};
 	int cfathread_cond_init(cfathread_cond_t *restrict cond, const cfathread_condattr_t *restrict) __attribute__((nonnull (1))) { *cond = new(); return 0; }
@@ -288,5 +479,9 @@
 	// IO operations
 	int cfathread_socket(int domain, int type, int protocol) {
-		return socket(domain, type, protocol);
+		return socket(domain, type
+		#if defined(EPOLL_FOR_SOCKETS)
+			| SOCK_NONBLOCK
+		#endif
+		, protocol);
 	}
 	int cfathread_bind(int socket, const struct sockaddr *address, socklen_t address_len) {
@@ -299,9 +494,34 @@
 
 	int cfathread_accept(int socket, struct sockaddr *restrict address, socklen_t *restrict address_len) {
-		return cfa_accept4(socket, address, address_len, 0, CFA_IO_LAZY);
+		#if defined(EPOLL_FOR_SOCKETS)
+			int ret;
+			for() {
+				yield();
+				ret = accept4(socket, address, address_len, SOCK_NONBLOCK);
+				if(ret >= 0) break;
+				if(errno != EAGAIN && errno != EWOULDBLOCK) break;
+
+				epoll_wait(socket, EPOLLIN);
+			}
+			return ret;
+		#else
+			return cfa_accept4(socket, address, address_len, 0, CFA_IO_LAZY);
+		#endif
 	}
 
 	int cfathread_connect(int socket, const struct sockaddr *address, socklen_t address_len) {
-		return cfa_connect(socket, address, address_len, CFA_IO_LAZY);
+		#if defined(EPOLL_FOR_SOCKETS)
+			int ret;
+			for() {
+				ret = connect(socket, address, address_len);
+				if(ret >= 0) break;
+				if(errno != EAGAIN && errno != EWOULDBLOCK) break;
+
+				epoll_wait(socket, EPOLLIN);
+			}
+			return ret;
+		#else
+			return cfa_connect(socket, address, address_len, CFA_IO_LAZY);
+		#endif
 	}
 
@@ -315,10 +535,38 @@
 
 	ssize_t cfathread_sendmsg(int socket, const struct msghdr *message, int flags) {
-		return cfa_sendmsg(socket, message, flags, CFA_IO_LAZY);
+		#if defined(EPOLL_FOR_SOCKETS)
+			ssize_t ret;
+			__STATS__( false, io.ops.sockwrite++; )
+			for() {
+				ret = sendmsg(socket, message, flags);
+				if(ret >= 0) break;
+				if(errno != EAGAIN && errno != EWOULDBLOCK) break;
+
+				__STATS__( false, io.ops.epllwrite++; )
+				epoll_wait(socket, EPOLLOUT);
+			}
+		#else
+			ssize_t ret = cfa_sendmsg(socket, message, flags, CFA_IO_LAZY);
+		#endif
+		return ret;
 	}
 
 	ssize_t cfathread_write(int fildes, const void *buf, size_t nbyte) {
 		// Use send rather then write for socket since it's faster
-		return cfa_send(fildes, buf, nbyte, 0, CFA_IO_LAZY);
+		#if defined(EPOLL_FOR_SOCKETS)
+			ssize_t ret;
+			// __STATS__( false, io.ops.sockwrite++; )
+			for() {
+				ret = send(fildes, buf, nbyte, 0);
+				if(ret >= 0) break;
+				if(errno != EAGAIN && errno != EWOULDBLOCK) break;
+
+				// __STATS__( false, io.ops.epllwrite++; )
+				epoll_wait(fildes, EPOLLOUT);
+			}
+		#else
+			ssize_t ret = cfa_send(fildes, buf, nbyte, 0, CFA_IO_LAZY);
+		#endif
+		return ret;
 	}
 
@@ -336,5 +584,17 @@
 		msg.msg_controllen = 0;
 
-		ssize_t ret = cfa_recvmsg(socket, &msg, flags, CFA_IO_LAZY);
+		#if defined(EPOLL_FOR_SOCKETS)
+			ssize_t ret;
+			yield();
+			for() {
+				ret = recvmsg(socket, &msg, flags);
+				if(ret >= 0) break;
+				if(errno != EAGAIN && errno != EWOULDBLOCK) break;
+
+				epoll_wait(socket, EPOLLIN);
+			}
+		#else
+			ssize_t ret = cfa_recvmsg(socket, &msg, flags, CFA_IO_LAZY);
+		#endif
 
 		if(address_len) *address_len = msg.msg_namelen;
@@ -344,6 +604,21 @@
 	ssize_t cfathread_read(int fildes, void *buf, size_t nbyte) {
 		// Use recv rather then read for socket since it's faster
-		return cfa_recv(fildes, buf, nbyte, 0, CFA_IO_LAZY);
-	}
-
-}
+		#if defined(EPOLL_FOR_SOCKETS)
+			ssize_t ret;
+			__STATS__( false, io.ops.sockread++; )
+			yield();
+			for() {
+				ret = recv(fildes, buf, nbyte, 0);
+				if(ret >= 0) break;
+				if(errno != EAGAIN && errno != EWOULDBLOCK) break;
+
+				__STATS__( false, io.ops.epllread++; )
+				epoll_wait(fildes, EPOLLIN);
+			}
+		#else
+			ssize_t ret = cfa_recv(fildes, buf, nbyte, 0, CFA_IO_LAZY);
+		#endif
+		return ret;
+	}
+
+}
Index: libcfa/src/concurrency/invoke.h
===================================================================
--- libcfa/src/concurrency/invoke.h	(revision 7e7a07617b5af8b6517de35559e5d9fc1550df1a)
+++ libcfa/src/concurrency/invoke.h	(revision 056cbdbf14fab5f28e8ac202d39bdcead640bd83)
@@ -170,6 +170,4 @@
 		bool corctx_flag;
 
-		int last_cpu;
-
 		//SKULLDUGGERY errno is not save in the thread data structure because returnToKernel appears to be the only function to require saving and restoring it
 
@@ -177,5 +175,5 @@
 		struct cluster * curr_cluster;
 
-		// preferred ready-queue
+		// preferred ready-queue or CPU
 		unsigned preferred;
 
Index: libcfa/src/concurrency/io.cfa
===================================================================
--- libcfa/src/concurrency/io.cfa	(revision 7e7a07617b5af8b6517de35559e5d9fc1550df1a)
+++ libcfa/src/concurrency/io.cfa	(revision 056cbdbf14fab5f28e8ac202d39bdcead640bd83)
@@ -90,5 +90,5 @@
 	static inline unsigned __flush( struct $io_context & );
 	static inline __u32 __release_sqes( struct $io_context & );
-	extern void __kernel_unpark( thread$ * thrd );
+	extern void __kernel_unpark( thread$ * thrd, unpark_hint );
 
 	bool __cfa_io_drain( processor * proc ) {
@@ -118,5 +118,5 @@
 			__cfadbg_print_safe( io, "Kernel I/O : Syscall completed : cqe %p, result %d for %p\n", &cqe, cqe.res, future );
 
-			__kernel_unpark( fulfil( *future, cqe.res, false ) );
+			__kernel_unpark( fulfil( *future, cqe.res, false ), UNPARK_LOCAL );
 		}
 
Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision 7e7a07617b5af8b6517de35559e5d9fc1550df1a)
+++ libcfa/src/concurrency/kernel.cfa	(revision 056cbdbf14fab5f28e8ac202d39bdcead640bd83)
@@ -341,11 +341,24 @@
 				}
 
-					__STATS( if(this->print_halts) __cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 0\n", this->unique_id, rdtscl()); )
+				__STATS( if(this->print_halts) __cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 0\n", this->unique_id, rdtscl()); )
 				__cfadbg_print_safe(runtime_core, "Kernel : core %p waiting on eventfd %d\n", this, this->idle);
 
-				// __disable_interrupts_hard();
-				eventfd_t val;
-				eventfd_read( this->idle, &val );
-				// __enable_interrupts_hard();
+				{
+					eventfd_t val;
+					ssize_t ret = read( this->idle, &val, sizeof(val) );
+					if(ret < 0) {
+						switch((int)errno) {
+						case EAGAIN:
+						#if EAGAIN != EWOULDBLOCK
+							case EWOULDBLOCK:
+						#endif
+						case EINTR:
+							// No need to do anything special here, just assume it's a legitimate wake-up
+							break;
+						default:
+							abort( "KERNEL : internal error, read failure on idle eventfd, error(%d) %s.", (int)errno, strerror( (int)errno ) );
+						}
+					}
+				}
 
 					__STATS( if(this->print_halts) __cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 1\n", this->unique_id, rdtscl()); )
@@ -409,14 +422,4 @@
 	/* paranoid */ verifyf( thrd_dst->link.next == 0p, "Expected null got %p", thrd_dst->link.next );
 	__builtin_prefetch( thrd_dst->context.SP );
-
-	int curr = __kernel_getcpu();
-	if(thrd_dst->last_cpu != curr) {
-		int64_t l = thrd_dst->last_cpu;
-		int64_t c = curr;
-		int64_t v = (l << 32) | c;
-		__push_stat( __tls_stats(), v, false, "Processor", this );
-	}
-
-	thrd_dst->last_cpu = curr;
 
 	__cfadbg_print_safe(runtime_core, "Kernel : core %p running thread %p (%s)\n", this, thrd_dst, thrd_dst->self_cor.name);
@@ -473,5 +476,5 @@
 		if(unlikely(thrd_dst->preempted != __NO_PREEMPTION)) {
 			// The thread was preempted, reschedule it and reset the flag
-			schedule_thread$( thrd_dst );
+			schedule_thread$( thrd_dst, UNPARK_LOCAL );
 			break RUNNING;
 		}
@@ -557,5 +560,5 @@
 // Scheduler routines
 // KERNEL ONLY
-static void __schedule_thread( thread$ * thrd ) {
+static void __schedule_thread( thread$ * thrd, unpark_hint hint ) {
 	/* paranoid */ verify( ! __preemption_enabled() );
 	/* paranoid */ verify( ready_schedule_islocked());
@@ -577,8 +580,8 @@
 	// Dereference the thread now because once we push it, there is not guaranteed it's still valid.
 	struct cluster * cl = thrd->curr_cluster;
-	__STATS(bool outside = thrd->last_proc && thrd->last_proc != kernelTLS().this_processor; )
+	__STATS(bool outside = hint == UNPARK_LOCAL && thrd->last_proc && thrd->last_proc != kernelTLS().this_processor; )
 
 	// push the thread to the cluster ready-queue
-	push( cl, thrd, local );
+	push( cl, thrd, hint );
 
 	// variable thrd is no longer safe to use
@@ -605,7 +608,7 @@
 }
 
-void schedule_thread$( thread$ * thrd ) {
+void schedule_thread$( thread$ * thrd, unpark_hint hint ) {
 	ready_schedule_lock();
-		__schedule_thread( thrd );
+		__schedule_thread( thrd, hint );
 	ready_schedule_unlock();
 }
@@ -658,5 +661,5 @@
 }
 
-void __kernel_unpark( thread$ * thrd ) {
+void __kernel_unpark( thread$ * thrd, unpark_hint hint ) {
 	/* paranoid */ verify( ! __preemption_enabled() );
 	/* paranoid */ verify( ready_schedule_islocked());
@@ -666,5 +669,5 @@
 	if(__must_unpark(thrd)) {
 		// Wake lost the race,
-		__schedule_thread( thrd );
+		__schedule_thread( thrd, hint );
 	}
 
@@ -673,5 +676,5 @@
 }
 
-void unpark( thread$ * thrd ) {
+void unpark( thread$ * thrd, unpark_hint hint ) {
 	if( !thrd ) return;
 
@@ -679,5 +682,5 @@
 		disable_interrupts();
 			// Wake lost the race,
-			schedule_thread$( thrd );
+			schedule_thread$( thrd, hint );
 		enable_interrupts(false);
 	}
Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision 7e7a07617b5af8b6517de35559e5d9fc1550df1a)
+++ libcfa/src/concurrency/kernel.hfa	(revision 056cbdbf14fab5f28e8ac202d39bdcead640bd83)
@@ -151,7 +151,15 @@
 struct __attribute__((aligned(128))) __timestamp_t {
 	volatile unsigned long long tv;
-};
-
-static inline void  ?{}(__timestamp_t & this) { this.tv = 0; }
+	volatile unsigned long long ma;
+};
+
+// Aligned helper counters used by the relaxed ready queue
+struct __attribute__((aligned(128))) __help_cnts_t {
+	volatile unsigned long long src;
+	volatile unsigned long long dst;
+	volatile unsigned long long tri;
+};
+
+static inline void  ?{}(__timestamp_t & this) { this.tv = 0; this.ma = 0; }
 static inline void ^?{}(__timestamp_t & this) {}
 
@@ -169,4 +177,7 @@
 		// Array of times
 		__timestamp_t * volatile tscs;
+
+		// Array of stats
+		__help_cnts_t * volatile help;
 
 		// Number of lanes (empty or not)
Index: libcfa/src/concurrency/kernel/fwd.hfa
===================================================================
--- libcfa/src/concurrency/kernel/fwd.hfa	(revision 7e7a07617b5af8b6517de35559e5d9fc1550df1a)
+++ libcfa/src/concurrency/kernel/fwd.hfa	(revision 056cbdbf14fab5f28e8ac202d39bdcead640bd83)
@@ -119,6 +119,9 @@
 
 	extern "Cforall" {
+		enum unpark_hint { UNPARK_LOCAL, UNPARK_REMOTE };
+
 		extern void park( void );
-		extern void unpark( struct thread$ * this );
+		extern void unpark( struct thread$ *, unpark_hint );
+		static inline void unpark( struct thread$ * thrd ) { unpark(thrd, UNPARK_LOCAL); }
 		static inline struct thread$ * active_thread () {
 			struct thread$ * t = publicTLS_get( this_thread );
Index: libcfa/src/concurrency/kernel/startup.cfa
===================================================================
--- libcfa/src/concurrency/kernel/startup.cfa	(revision 7e7a07617b5af8b6517de35559e5d9fc1550df1a)
+++ libcfa/src/concurrency/kernel/startup.cfa	(revision 056cbdbf14fab5f28e8ac202d39bdcead640bd83)
@@ -200,4 +200,28 @@
 	__cfadbg_print_safe(runtime_core, "Kernel : Main cluster ready\n");
 
+	// Construct the processor context of the main processor
+	void ?{}(processorCtx_t & this, processor * proc) {
+		(this.__cor){ "Processor" };
+		this.__cor.starter = 0p;
+		this.proc = proc;
+	}
+
+	void ?{}(processor & this) with( this ) {
+		( this.terminated ){};
+		( this.runner ){};
+		init( this, "Main Processor", *mainCluster, 0p );
+		kernel_thread = pthread_self();
+
+		runner{ &this };
+		__cfadbg_print_safe(runtime_core, "Kernel : constructed main processor context %p\n", &runner);
+	}
+
+	// Initialize the main processor and the main processor ctx
+	// (the coroutine that contains the processing control flow)
+	mainProcessor = (processor *)&storage_mainProcessor;
+	(*mainProcessor){};
+
+	register_tls( mainProcessor );
+
 	// Start by initializing the main thread
 	// SKULLDUGGERY: the mainThread steals the process main thread
@@ -210,31 +234,4 @@
 	__cfadbg_print_safe(runtime_core, "Kernel : Main thread ready\n");
 
-
-
-	// Construct the processor context of the main processor
-	void ?{}(processorCtx_t & this, processor * proc) {
-		(this.__cor){ "Processor" };
-		this.__cor.starter = 0p;
-		this.proc = proc;
-	}
-
-	void ?{}(processor & this) with( this ) {
-		( this.terminated ){};
-		( this.runner ){};
-		init( this, "Main Processor", *mainCluster, 0p );
-		kernel_thread = pthread_self();
-
-		runner{ &this };
-		__cfadbg_print_safe(runtime_core, "Kernel : constructed main processor context %p\n", &runner);
-	}
-
-	// Initialize the main processor and the main processor ctx
-	// (the coroutine that contains the processing control flow)
-	mainProcessor = (processor *)&storage_mainProcessor;
-	(*mainProcessor){};
-
-	register_tls( mainProcessor );
-	mainThread->last_cpu = __kernel_getcpu();
-
 	//initialize the global state variables
 	__cfaabi_tls.this_processor = mainProcessor;
@@ -252,5 +249,5 @@
 	// Add the main thread to the ready queue
 	// once resume is called on mainProcessor->runner the mainThread needs to be scheduled like any normal thread
-	schedule_thread$(mainThread);
+	schedule_thread$(mainThread, UNPARK_LOCAL);
 
 	// SKULLDUGGERY: Force a context switch to the main processor to set the main thread's context to the current UNIX
@@ -486,5 +483,5 @@
 	link.next = 0p;
 	link.ts   = -1llu;
-	preferred = -1u;
+	preferred = ready_queue_new_preferred();
 	last_proc = 0p;
 	#if defined( __CFA_WITH_VERIFY__ )
Index: libcfa/src/concurrency/kernel_private.hfa
===================================================================
--- libcfa/src/concurrency/kernel_private.hfa	(revision 7e7a07617b5af8b6517de35559e5d9fc1550df1a)
+++ libcfa/src/concurrency/kernel_private.hfa	(revision 056cbdbf14fab5f28e8ac202d39bdcead640bd83)
@@ -46,5 +46,5 @@
 }
 
-void schedule_thread$( thread$ * ) __attribute__((nonnull (1)));
+void schedule_thread$( thread$ *, unpark_hint hint ) __attribute__((nonnull (1)));
 
 extern bool __preemption_enabled();
@@ -300,5 +300,5 @@
 // push thread onto a ready queue for a cluster
 // returns true if the list was previously empty, false otherwise
-__attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, bool local);
+__attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, unpark_hint hint);
 
 //-----------------------------------------------------------------------
@@ -321,4 +321,8 @@
 
 //-----------------------------------------------------------------------
+// get preferred ready for new thread
+unsigned ready_queue_new_preferred();
+
+//-----------------------------------------------------------------------
 // Increase the width of the ready queue (number of lanes) by 4
 void ready_queue_grow  (struct cluster * cltr);
Index: libcfa/src/concurrency/ready_queue.cfa
===================================================================
--- libcfa/src/concurrency/ready_queue.cfa	(revision 7e7a07617b5af8b6517de35559e5d9fc1550df1a)
+++ libcfa/src/concurrency/ready_queue.cfa	(revision 056cbdbf14fab5f28e8ac202d39bdcead640bd83)
@@ -100,6 +100,6 @@
 	#define __kernel_rseq_unregister rseq_unregister_current_thread
 #elif defined(CFA_HAVE_LINUX_RSEQ_H)
-	void __kernel_raw_rseq_register  (void);
-	void __kernel_raw_rseq_unregister(void);
+	static void __kernel_raw_rseq_register  (void);
+	static void __kernel_raw_rseq_unregister(void);
 
 	#define __kernel_rseq_register __kernel_raw_rseq_register
@@ -246,4 +246,11 @@
 // Cforall Ready Queue used for scheduling
 //=======================================================================
+unsigned long long moving_average(unsigned long long nval, unsigned long long oval) {
+	const unsigned long long tw = 16;
+	const unsigned long long nw = 4;
+	const unsigned long long ow = tw - nw;
+	return ((nw * nval) + (ow * oval)) / tw;
+}
+
 void ?{}(__ready_queue_t & this) with (this) {
 	#if defined(USE_CPU_WORK_STEALING)
@@ -251,12 +258,20 @@
 		lanes.data = alloc( lanes.count );
 		lanes.tscs = alloc( lanes.count );
+		lanes.help = alloc( cpu_info.hthrd_count );
 
 		for( idx; (size_t)lanes.count ) {
 			(lanes.data[idx]){};
 			lanes.tscs[idx].tv = rdtscl();
+			lanes.tscs[idx].ma = rdtscl();
+		}
+		for( idx; (size_t)cpu_info.hthrd_count ) {
+			lanes.help[idx].src = 0;
+			lanes.help[idx].dst = 0;
+			lanes.help[idx].tri = 0;
 		}
 	#else
 		lanes.data  = 0p;
 		lanes.tscs  = 0p;
+		lanes.help  = 0p;
 		lanes.count = 0;
 	#endif
@@ -270,14 +285,16 @@
 	free(lanes.data);
 	free(lanes.tscs);
+	free(lanes.help);
 }
 
 //-----------------------------------------------------------------------
 #if defined(USE_CPU_WORK_STEALING)
-	__attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, bool push_local) with (cltr->ready_queue) {
+	__attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, unpark_hint hint) with (cltr->ready_queue) {
 		__cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr);
 
 		processor * const proc = kernelTLS().this_processor;
-		const bool external = !push_local || (!proc) || (cltr != proc->cltr);
-
+		const bool external = (!proc) || (cltr != proc->cltr);
+
+		// Figure out the current cpu and make sure it is valid
 		const int cpu = __kernel_getcpu();
 		/* paranoid */ verify(cpu >= 0);
@@ -285,5 +302,15 @@
 		/* paranoid */ verify(cpu * READYQ_SHARD_FACTOR < lanes.count);
 
-		const cpu_map_entry_t & map = cpu_info.llc_map[cpu];
+		// Figure out where the thread ran last time and make sure that mapping is still valid
+		/* paranoid */ verify(thrd->preferred >= 0);
+		/* paranoid */ verify(thrd->preferred < cpu_info.hthrd_count);
+		/* paranoid */ verify(thrd->preferred * READYQ_SHARD_FACTOR < lanes.count);
+		const int prf = thrd->preferred * READYQ_SHARD_FACTOR;
+
+		const cpu_map_entry_t & map;
+		choose(hint) {
+			case UNPARK_LOCAL : &map = &cpu_info.llc_map[cpu];
+			case UNPARK_REMOTE: &map = &cpu_info.llc_map[prf];
+		}
 		/* paranoid */ verify(map.start * READYQ_SHARD_FACTOR < lanes.count);
 		/* paranoid */ verify(map.self * READYQ_SHARD_FACTOR < lanes.count);
@@ -296,5 +323,8 @@
 			if(unlikely(external)) { r = __tls_rand(); }
 			else { r = proc->rdq.its++; }
-			i = start + (r % READYQ_SHARD_FACTOR);
+			choose(hint) {
+				case UNPARK_LOCAL : i = start + (r % READYQ_SHARD_FACTOR);
+				case UNPARK_REMOTE: i = prf   + (r % READYQ_SHARD_FACTOR);
+			}
 			// If we can't lock it retry
 		} while( !__atomic_try_acquire( &lanes.data[i].lock ) );
@@ -332,19 +362,18 @@
 		processor * const proc = kernelTLS().this_processor;
 		const int start = map.self * READYQ_SHARD_FACTOR;
+		const unsigned long long ctsc = rdtscl();
 
 		// Did we already have a help target
 		if(proc->rdq.target == -1u) {
-			// if We don't have a
-			unsigned long long min = ts(lanes.data[start]);
+			unsigned long long max = 0;
 			for(i; READYQ_SHARD_FACTOR) {
-				unsigned long long tsc = ts(lanes.data[start + i]);
-				if(tsc < min) min = tsc;
-			}
-			proc->rdq.cutoff = min;
-
+				unsigned long long tsc = moving_average(ctsc - ts(lanes.data[start + i]), lanes.tscs[start + i].ma);
+				if(tsc > max) max = tsc;
+			}
+			 proc->rdq.cutoff = (max + 2 * max) / 2;
 			/* paranoid */ verify(lanes.count < 65536); // The following code assumes max 65536 cores.
 			/* paranoid */ verify(map.count < 65536); // The following code assumes max 65536 cores.
 
-			if(0 == (__tls_rand() % 10_000)) {
+			if(0 == (__tls_rand() % 100)) {
 				proc->rdq.target = __tls_rand() % lanes.count;
 			} else {
@@ -358,14 +387,21 @@
 		}
 		else {
-			const unsigned long long bias = 0; //2_500_000_000;
-			const unsigned long long cutoff = proc->rdq.cutoff > bias ? proc->rdq.cutoff - bias : proc->rdq.cutoff;
+			unsigned long long max = 0;
+			for(i; READYQ_SHARD_FACTOR) {
+				unsigned long long tsc = moving_average(ctsc - ts(lanes.data[start + i]), lanes.tscs[start + i].ma);
+				if(tsc > max) max = tsc;
+			}
+			const unsigned long long cutoff = (max + 2 * max) / 2;
 			{
 				unsigned target = proc->rdq.target;
 				proc->rdq.target = -1u;
-				if(lanes.tscs[target].tv < cutoff && ts(lanes.data[target]) < cutoff) {
+				lanes.help[target / READYQ_SHARD_FACTOR].tri++;
+				if(moving_average(ctsc - lanes.tscs[target].tv, lanes.tscs[target].ma) > cutoff) {
 					thread$ * t = try_pop(cltr, target __STATS(, __tls_stats()->ready.pop.help));
 					proc->rdq.last = target;
 					if(t) return t;
+					else proc->rdq.target = -1u;
 				}
+				else proc->rdq.target = -1u;
 			}
 
@@ -428,8 +464,8 @@
 	}
 
-	__attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, bool push_local) with (cltr->ready_queue) {
+	__attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, unpark_hint hint) with (cltr->ready_queue) {
 		__cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr);
 
-		const bool external = !push_local || (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr);
+		const bool external = (hint != UNPARK_LOCAL) || (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr);
 		/* paranoid */ verify(external || kernelTLS().this_processor->rdq.id < lanes.count );
 
@@ -515,14 +551,14 @@
 #endif
 #if defined(USE_WORK_STEALING)
-	__attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, bool push_local) with (cltr->ready_queue) {
+	__attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, unpark_hint hint) with (cltr->ready_queue) {
 		__cfadbg_print_safe(ready_queue, "Kernel : Pushing %p on cluster %p\n", thrd, cltr);
 
 		// #define USE_PREFERRED
 		#if !defined(USE_PREFERRED)
-		const bool external = !push_local || (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr);
+		const bool external = (hint != UNPARK_LOCAL) || (!kernelTLS().this_processor) || (cltr != kernelTLS().this_processor->cltr);
 		/* paranoid */ verify(external || kernelTLS().this_processor->rdq.id < lanes.count );
 		#else
 			unsigned preferred = thrd->preferred;
-			const bool external = push_local || (!kernelTLS().this_processor) || preferred == -1u || thrd->curr_cluster != cltr;
+			const bool external = (hint != UNPARK_LOCAL) || (!kernelTLS().this_processor) || preferred == -1u || thrd->curr_cluster != cltr;
 			/* paranoid */ verifyf(external || preferred < lanes.count, "Invalid preferred queue %u for %u lanes", preferred, lanes.count );
 
@@ -645,4 +681,5 @@
 	// Actually pop the list
 	struct thread$ * thrd;
+	unsigned long long tsc_before = ts(lane);
 	unsigned long long tsv;
 	[thrd, tsv] = pop(lane);
@@ -658,9 +695,15 @@
 	__STATS( stats.success++; )
 
-	#if defined(USE_WORK_STEALING)
+	#if defined(USE_WORK_STEALING) || defined(USE_CPU_WORK_STEALING)
+		unsigned long long now = rdtscl();
 		lanes.tscs[w].tv = tsv;
+		lanes.tscs[w].ma = moving_average(now > tsc_before ? now - tsc_before : 0, lanes.tscs[w].ma);
 	#endif
 
-	thrd->preferred = w;
+	#if defined(USE_CPU_WORK_STEALING)
+		thrd->preferred = w / READYQ_SHARD_FACTOR;
+	#else
+		thrd->preferred = w;
+	#endif
 
 	// return the popped thread
@@ -688,4 +731,25 @@
 
 //-----------------------------------------------------------------------
+// get preferred ready for new thread
+unsigned ready_queue_new_preferred() {
+	unsigned pref = 0;
+	if(struct thread$ * thrd = publicTLS_get( this_thread )) {
+		pref = thrd->preferred;
+	}
+	else {
+		#if defined(USE_CPU_WORK_STEALING)
+			pref = __kernel_getcpu();
+		#endif
+	}
+
+	#if defined(USE_CPU_WORK_STEALING)
+		/* paranoid */ verify(pref >= 0);
+		/* paranoid */ verify(pref < cpu_info.hthrd_count);
+	#endif
+
+	return pref;
+}
+
+//-----------------------------------------------------------------------
 // Check that all the intrusive queues in the data structure are still consistent
 static void check( __ready_queue_t & q ) with (q) {
@@ -915,5 +979,5 @@
 	extern void __enable_interrupts_hard();
 
-	void __kernel_raw_rseq_register  (void) {
+	static void __kernel_raw_rseq_register  (void) {
 		/* paranoid */ verify( __cfaabi_rseq.cpu_id == RSEQ_CPU_ID_UNINITIALIZED );
 
@@ -933,5 +997,5 @@
 	}
 
-	void __kernel_raw_rseq_unregister(void) {
+	static void __kernel_raw_rseq_unregister(void) {
 		/* paranoid */ verify( __cfaabi_rseq.cpu_id >= 0 );
 
Index: libcfa/src/concurrency/ready_subqueue.hfa
===================================================================
--- libcfa/src/concurrency/ready_subqueue.hfa	(revision 7e7a07617b5af8b6517de35559e5d9fc1550df1a)
+++ libcfa/src/concurrency/ready_subqueue.hfa	(revision 056cbdbf14fab5f28e8ac202d39bdcead640bd83)
@@ -98,5 +98,4 @@
 
 	// Get the relevant nodes locally
-	unsigned long long ts = this.anchor.ts;
 	thread$ * node = this.anchor.next;
 	this.anchor.next = node->link.next;
@@ -116,5 +115,5 @@
 	/* paranoid */ verify( node->link.ts   != 0  );
 	/* paranoid */ verify( this.anchor.ts  != 0  );
-	return [node, ts];
+	return [node, this.anchor.ts];
 }
 
Index: libcfa/src/concurrency/stats.cfa
===================================================================
--- libcfa/src/concurrency/stats.cfa	(revision 7e7a07617b5af8b6517de35559e5d9fc1550df1a)
+++ libcfa/src/concurrency/stats.cfa	(revision 056cbdbf14fab5f28e8ac202d39bdcead640bd83)
@@ -48,4 +48,8 @@
 			stats->io.calls.completed   = 0;
 			stats->io.calls.errors.busy = 0;
+			stats->io.ops.sockread      = 0;
+			stats->io.ops.epllread      = 0;
+			stats->io.ops.sockwrite     = 0;
+			stats->io.ops.epllwrite     = 0;
 		#endif
 
@@ -104,4 +108,8 @@
 			tally_one( &cltr->io.calls.completed  , &proc->io.calls.completed   );
 			tally_one( &cltr->io.calls.errors.busy, &proc->io.calls.errors.busy );
+			tally_one( &cltr->io.ops.sockread     , &proc->io.ops.sockread      );
+			tally_one( &cltr->io.ops.epllread     , &proc->io.ops.epllread      );
+			tally_one( &cltr->io.ops.sockwrite    , &proc->io.ops.sockwrite     );
+			tally_one( &cltr->io.ops.epllwrite    , &proc->io.ops.epllwrite     );
 		#endif
 	}
@@ -179,4 +187,7 @@
 				     | " - cmp " | eng3(io.calls.drain) | "/" | eng3(io.calls.completed) | "(" | ws(3, 3, avgcomp) | "/drain)"
 				     | " - " | eng3(io.calls.errors.busy) | " EBUSY";
+				sstr | "- ops blk: "
+				     |   " sk rd: " | eng3(io.ops.sockread)  | "epll: " | eng3(io.ops.epllread)
+				     |   " sk wr: " | eng3(io.ops.sockwrite) | "epll: " | eng3(io.ops.epllwrite);
 				sstr | nl;
 			}
Index: libcfa/src/concurrency/stats.hfa
===================================================================
--- libcfa/src/concurrency/stats.hfa	(revision 7e7a07617b5af8b6517de35559e5d9fc1550df1a)
+++ libcfa/src/concurrency/stats.hfa	(revision 056cbdbf14fab5f28e8ac202d39bdcead640bd83)
@@ -102,4 +102,10 @@
 				volatile uint64_t sleeps;
 			} poller;
+			struct {
+				volatile uint64_t sockread;
+				volatile uint64_t epllread;
+				volatile uint64_t sockwrite;
+				volatile uint64_t epllwrite;
+			} ops;
 		};
 	#endif
Index: libcfa/src/concurrency/thread.cfa
===================================================================
--- libcfa/src/concurrency/thread.cfa	(revision 7e7a07617b5af8b6517de35559e5d9fc1550df1a)
+++ libcfa/src/concurrency/thread.cfa	(revision 056cbdbf14fab5f28e8ac202d39bdcead640bd83)
@@ -25,4 +25,6 @@
 #include "invoke.h"
 
+uint64_t thread_rand();
+
 //-----------------------------------------------------------------------------
 // Thread ctors and dtors
@@ -34,7 +36,4 @@
 	preempted = __NO_PREEMPTION;
 	corctx_flag = false;
-	disable_interrupts();
-	last_cpu = __kernel_getcpu();
-	enable_interrupts();
 	curr_cor = &self_cor;
 	self_mon.owner = &this;
@@ -44,5 +43,5 @@
 	link.next = 0p;
 	link.ts   = -1llu;
-	preferred = -1u;
+	preferred = ready_queue_new_preferred();
 	last_proc = 0p;
 	#if defined( __CFA_WITH_VERIFY__ )
@@ -141,5 +140,5 @@
 	/* paranoid */ verify( this_thrd->context.SP );
 
-	schedule_thread$( this_thrd );
+	schedule_thread$( this_thrd, UNPARK_LOCAL );
 	enable_interrupts();
 }
Index: libcfa/src/containers/string_res.cfa
===================================================================
--- libcfa/src/containers/string_res.cfa	(revision 7e7a07617b5af8b6517de35559e5d9fc1550df1a)
+++ libcfa/src/containers/string_res.cfa	(revision 056cbdbf14fab5f28e8ac202d39bdcead640bd83)
@@ -21,6 +21,19 @@
 
 
-#ifdef VbyteDebug
-extern HandleNode *HeaderPtr;
+
+
+
+
+
+
+// Uncomment to enable byte-string heap debugging output (development only):
+// #define VbyteDebug
+
+
+
+
+
+#ifdef VbyteDebug
+HandleNode *HeaderPtr;
 #endif // VbyteDebug
 
@@ -140,4 +153,15 @@
 
 VbyteHeap HeapArea;
+
+VbyteHeap * DEBUG_string_heap = & HeapArea;
+
+size_t DEBUG_string_bytes_avail_until_gc( VbyteHeap * heap ) {
+    return ((char*)heap->ExtVbyte) - heap->EndVbyte;
+}
+
+const char * DEBUG_string_heap_start( VbyteHeap * heap ) {
+    return heap->StartVbyte;
+}
+
 
 // Returns the size of the string in bytes
@@ -225,21 +249,42 @@
 void assign(string_res &this, const char* buffer, size_t bsize) {
 
+    // traverse the incumbent share-edit set (SES) to recover the range of a base string to which `this` belongs
+    string_res * shareEditSetStartPeer = & this;
+    string_res * shareEditSetEndPeer = & this;
+    for (string_res * editPeer = this.shareEditSet_next; editPeer != &this; editPeer = editPeer->shareEditSet_next) {
+        if ( editPeer->Handle.s < shareEditSetStartPeer->Handle.s ) {
+            shareEditSetStartPeer = editPeer;
+        }
+        if ( shareEditSetEndPeer->Handle.s + shareEditSetEndPeer->Handle.lnth < editPeer->Handle.s + editPeer->Handle.lnth) {
+            shareEditSetEndPeer = editPeer;
+        }
+    }
+
+    // full string is from start of shareEditSetStartPeer thru end of shareEditSetEndPeer
+    // `this` occurs in the middle of it, to be replaced
+    // build up the new text in `pasting`
+
+    string_res pasting = {
+        shareEditSetStartPeer->Handle.s,                   // start of SES
+        this.Handle.s - shareEditSetStartPeer->Handle.s }; // length of SES, before this
+    append( pasting,
+        buffer,                                            // start of replacement for this
+        bsize );                                           // length of replacement for this
+    append( pasting,
+        this.Handle.s + this.Handle.lnth,                  // start of SES after this
+        shareEditSetEndPeer->Handle.s + shareEditSetEndPeer->Handle.lnth -
+        (this.Handle.s + this.Handle.lnth) );              // length of SES, after this
+
+    // The above string building can trigger compaction.
+    // The reference points (that are arguments of the string building) may move during that building.
+    // From this point on, they are stable.
+    // So now, capture their values for use in the overlap cases, below.
+    // Do not factor these definitions with the arguments used above.
+
+    char * beforeBegin = shareEditSetStartPeer->Handle.s;
+    size_t beforeLen = this.Handle.s - beforeBegin;
+
     char * afterBegin = this.Handle.s + this.Handle.lnth;
-
-    char * shareEditSetStart = this.Handle.s;
-    char * shareEditSetEnd = afterBegin;
-    for (string_res * editPeer = this.shareEditSet_next; editPeer != &this; editPeer = editPeer->shareEditSet_next) {
-        shareEditSetStart = min( shareEditSetStart, editPeer->Handle.s );
-        shareEditSetEnd = max( shareEditSetStart, editPeer->Handle.s + editPeer->Handle.lnth);
-    }
-
-    char * beforeBegin = shareEditSetStart;
-    size_t beforeLen = this.Handle.s - shareEditSetStart;
-    size_t afterLen = shareEditSetEnd - afterBegin;
-
-    string_res pasting = { beforeBegin, beforeLen };
-    append(pasting, buffer, bsize);
-    string_res after = { afterBegin, afterLen }; // juxtaposed with in-progress pasting
-    pasting += after;                        // optimized case
+    size_t afterLen = shareEditSetEndPeer->Handle.s + shareEditSetEndPeer->Handle.lnth - afterBegin;
 
     size_t oldLnth = this.Handle.lnth;
@@ -253,12 +298,20 @@
     for (string_res * p = this.shareEditSet_next; p != &this; p = p->shareEditSet_next) {
         assert (p->Handle.s >= beforeBegin);
-        if ( p->Handle.s < beforeBegin + beforeLen ) {
-            // p starts before the edit
-            if ( p->Handle.s + p->Handle.lnth < beforeBegin + beforeLen ) {
+        if ( p->Handle.s >= afterBegin ) {
+            assert ( p->Handle.s <= afterBegin + afterLen );
+            assert ( p->Handle.s + p->Handle.lnth <= afterBegin + afterLen );
+            // p starts after the edit
+            // take start and end as end-anchored
+            size_t startOffsetFromEnd = afterBegin + afterLen - p->Handle.s;
+            p->Handle.s = limit - startOffsetFromEnd;
+            // p->Handle.lnth unaffected
+        } else if ( p->Handle.s <= beforeBegin + beforeLen ) {
+            // p starts before, or at the start of, the edit
+            if ( p->Handle.s + p->Handle.lnth <= beforeBegin + beforeLen ) {
                 // p ends before the edit
                 // take end as start-anchored too
                 // p->Handle.lnth unaffected
             } else if ( p->Handle.s + p->Handle.lnth < afterBegin ) {
-                // p ends during the edit
+                // p ends during the edit; p does not include the last character replaced
                 // clip end of p to end at start of edit
                 p->Handle.lnth = beforeLen - ( p->Handle.s - beforeBegin );
@@ -274,27 +327,20 @@
             size_t startOffsetFromStart = p->Handle.s - beforeBegin;
             p->Handle.s = pasting.Handle.s + startOffsetFromStart;
-        } else if ( p->Handle.s < afterBegin ) {
+        } else {
+            assert ( p->Handle.s < afterBegin );
             // p starts during the edit
             assert( p->Handle.s + p->Handle.lnth >= beforeBegin + beforeLen );
             if ( p->Handle.s + p->Handle.lnth < afterBegin ) {
-                // p ends during the edit
+                // p ends during the edit; p does not include the last character replaced
                 // set p to empty string at start of edit
                 p->Handle.s = this.Handle.s;
                 p->Handle.lnth = 0;
             } else {
-                // p ends after the edit
+                // p includes the end of the edit
                 // clip start of p to start at end of edit
+                int charsToClip = afterBegin - p->Handle.s;
                 p->Handle.s = this.Handle.s + this.Handle.lnth;
-                p->Handle.lnth += this.Handle.lnth;
-                p->Handle.lnth -= oldLnth;
+                p->Handle.lnth -= charsToClip;
             }
-        } else {
-            assert ( p->Handle.s <= afterBegin + afterLen );
-            assert ( p->Handle.s + p->Handle.lnth <= afterBegin + afterLen );
-            // p starts after the edit
-            // take start and end as end-anchored
-            size_t startOffsetFromEnd = afterBegin + afterLen - p->Handle.s;
-            p->Handle.s = limit - startOffsetFromEnd;
-            // p->Handle.lnth unaffected
         }
         MoveThisAfter( p->Handle, pasting.Handle );	// move substring handle to maintain sorted order by string position
@@ -641,5 +687,4 @@
     } // if
 #ifdef VbyteDebug
-    serr | "exit:MoveThisAfter";
     {
 	serr | "HandleList:";
@@ -650,8 +695,9 @@
 		serr | n->s[i];
 	    } // for
-	    serr | "\" flink:" | n->flink | " blink:" | n->blink;
+	    serr | "\" flink:" | n->flink | " blink:" | n->blink | nl;
 	} // for
 	serr | nlOn;
     }
+    serr | "exit:MoveThisAfter";
 #endif // VbyteDebug
 } // MoveThisAfter
@@ -662,8 +708,4 @@
 
 //######################### VbyteHeap #########################
-
-#ifdef VbyteDebug
-HandleNode *HeaderPtr = 0p;
-#endif // VbyteDebug
 
 // Move characters from one location in the byte-string area to another. The routine handles the following situations:
Index: libcfa/src/containers/string_res.hfa
===================================================================
--- libcfa/src/containers/string_res.hfa	(revision 7e7a07617b5af8b6517de35559e5d9fc1550df1a)
+++ libcfa/src/containers/string_res.hfa	(revision 056cbdbf14fab5f28e8ac202d39bdcead640bd83)
@@ -36,4 +36,8 @@
 void ?{}( HandleNode &, VbyteHeap & );		// constructor for nodes in the handle list
 void ^?{}( HandleNode & );			// destructor for handle nodes
+
+extern VbyteHeap * DEBUG_string_heap;
+size_t DEBUG_string_bytes_avail_until_gc( VbyteHeap * heap );
+const char * DEBUG_string_heap_start( VbyteHeap * heap );
 
 
Index: libcfa/src/fstream.cfa
===================================================================
--- libcfa/src/fstream.cfa	(revision 7e7a07617b5af8b6517de35559e5d9fc1550df1a)
+++ libcfa/src/fstream.cfa	(revision 056cbdbf14fab5f28e8ac202d39bdcead640bd83)
@@ -10,6 +10,6 @@
 // Created On       : Wed May 27 17:56:53 2015
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Thu Jul 29 22:34:10 2021
-// Update Count     : 454
+// Last Modified On : Tue Sep 21 21:51:38 2021
+// Update Count     : 460
 //
 
@@ -28,12 +28,12 @@
 #define IO_MSG "I/O error: "
 
-void ?{}( ofstream & os, void * file ) {
-	os.file$ = file;
-	os.sepDefault$ = true;
-	os.sepOnOff$ = false;
-	os.nlOnOff$ = true;
-	os.prt$ = false;
-	os.sawNL$ = false;
-	os.acquired$ = false;
+void ?{}( ofstream & os, void * file ) with(os) {
+	file$ = file;
+	sepDefault$ = true;
+	sepOnOff$ = false;
+	nlOnOff$ = true;
+	prt$ = false;
+	sawNL$ = false;
+	acquired$ = false;
 	sepSetCur$( os, sepGet( os ) );
 	sepSet( os, " " );
@@ -124,11 +124,9 @@
 void open( ofstream & os, const char name[], const char mode[] ) {
 	FILE * file = fopen( name, mode );
-	// #ifdef __CFA_DEBUG__
 	if ( file == 0p ) {
 		throw (Open_Failure){ os };
 		// abort | IO_MSG "open output file \"" | name | "\"" | nl | strerror( errno );
 	} // if
-	// #endif // __CFA_DEBUG__
-	(os){ file };
+	(os){ file };										// initialize 
 } // open
 
@@ -137,13 +135,13 @@
 } // open
 
-void close( ofstream & os ) {
-  if ( (FILE *)(os.file$) == 0p ) return;
-  if ( (FILE *)(os.file$) == (FILE *)stdout || (FILE *)(os.file$) == (FILE *)stderr ) return;
-
-	if ( fclose( (FILE *)(os.file$) ) == EOF ) {
+void close( ofstream & os ) with(os) {
+  if ( (FILE *)(file$) == 0p ) return;
+  if ( (FILE *)(file$) == (FILE *)stdout || (FILE *)(file$) == (FILE *)stderr ) return;
+
+	if ( fclose( (FILE *)(file$) ) == EOF ) {
 		throw (Close_Failure){ os };
 		// abort | IO_MSG "close output" | nl | strerror( errno );
 	} // if
-	os.file$ = 0p;
+	file$ = 0p;
 } // close
 
@@ -177,8 +175,8 @@
 } // fmt
 
-inline void acquire( ofstream & os ) {
-	lock( os.lock$ );
-	if ( ! os.acquired$ ) os.acquired$ = true;
-	else unlock( os.lock$ );
+inline void acquire( ofstream & os ) with(os) {
+	lock( lock$ );										// may increase recursive lock
+	if ( ! acquired$ ) acquired$ = true;				// not locked ?
+	else unlock( lock$ );								// unwind recursive lock at start
 } // acquire
 
@@ -187,8 +185,5 @@
 } // release
 
-inline void lock( ofstream & os ) { acquire( os ); }
-inline void unlock( ofstream & os ) { release( os ); } 
-
-void ?{}( osacquire & acq, ofstream & os ) { &acq.os = &os; lock( os.lock$ ); }
+void ?{}( osacquire & acq, ofstream & os ) { lock( os.lock$ ); &acq.os = &os; }
 void ^?{}( osacquire & acq ) { release( acq.os ); }
 
@@ -222,8 +217,8 @@
 
 // private
-void ?{}( ifstream & is, void * file ) {
-	is.file$ = file;
-	is.nlOnOff$ = false;
-	is.acquired$ = false;
+void ?{}( ifstream & is, void * file ) with(is) {
+	file$ = file;
+	nlOnOff$ = false;
+	acquired$ = false;
 } // ?{}
 
@@ -265,11 +260,9 @@
 void open( ifstream & is, const char name[], const char mode[] ) {
 	FILE * file = fopen( name, mode );
-	// #ifdef __CFA_DEBUG__
 	if ( file == 0p ) {
 		throw (Open_Failure){ is };
 		// abort | IO_MSG "open input file \"" | name | "\"" | nl | strerror( errno );
 	} // if
-	// #endif // __CFA_DEBUG__
-	is.file$ = file;
+	(is){ file };										// initialize 
 } // open
 
@@ -278,13 +271,13 @@
 } // open
 
-void close( ifstream & is ) {
-  if ( (FILE *)(is.file$) == 0p ) return;
-  if ( (FILE *)(is.file$) == (FILE *)stdin ) return;
-
-	if ( fclose( (FILE *)(is.file$) ) == EOF ) {
+void close( ifstream & is ) with(is) {
+  if ( (FILE *)(file$) == 0p ) return;
+  if ( (FILE *)(file$) == (FILE *)stdin ) return;
+
+	if ( fclose( (FILE *)(file$) ) == EOF ) {
 		throw (Close_Failure){ is };
 		// abort | IO_MSG "close input" | nl | strerror( errno );
 	} // if
-	is.file$ = 0p;
+	file$ = 0p;
 } // close
 
@@ -327,8 +320,8 @@
 } // fmt
 
-inline void acquire( ifstream & is ) {
-	lock( is.lock$ );
-	if ( ! is.acquired$ ) is.acquired$ = true;
-	else unlock( is.lock$ );
+inline void acquire( ifstream & is ) with(is) {
+	lock( lock$ );										// may increase recursive lock
+	if ( ! acquired$ ) acquired$ = true;				// not locked ?
+	else unlock( lock$ );								// unwind recursive lock at start
 } // acquire
 
@@ -337,5 +330,5 @@
 } // release
 
-void ?{}( isacquire & acq, ifstream & is ) { &acq.is = &is; lock( is.lock$ ); }
+void ?{}( isacquire & acq, ifstream & is ) { lock( is.lock$ ); &acq.is = &is; }
 void ^?{}( isacquire & acq ) { release( acq.is ); }
 
@@ -350,14 +343,14 @@
 
 // exception I/O constructors
-void ?{}( Open_Failure & this, ofstream & ostream ) {
-	this.virtual_table = &Open_Failure_vt;
-	this.ostream = &ostream;
-	this.tag = 1;
-} // ?{}
-
-void ?{}( Open_Failure & this, ifstream & istream ) {
-	this.virtual_table = &Open_Failure_vt;
-	this.istream = &istream;
-	this.tag = 0;
+void ?{}( Open_Failure & ex, ofstream & ostream ) with(ex) {
+	virtual_table = &Open_Failure_vt;
+	ex.ostream = &ostream;									// qualify: member is shadowed by same-named parameter under with(ex)
+	tag = 1;
+} // ?{}
+
+void ?{}( Open_Failure & ex, ifstream & istream ) with(ex) {
+	virtual_table = &Open_Failure_vt;
+	ex.istream = &istream;									// qualify: member is shadowed by same-named parameter under with(ex)
+	tag = 0;
+} // ?{}
 
@@ -366,14 +359,14 @@
 
 // exception I/O constructors
-void ?{}( Close_Failure & this, ofstream & ostream ) {
-	this.virtual_table = &Close_Failure_vt;
-	this.ostream = &ostream;
-	this.tag = 1;
-} // ?{}
-
-void ?{}( Close_Failure & this, ifstream & istream ) {
-	this.virtual_table = &Close_Failure_vt;
-	this.istream = &istream;
-	this.tag = 0;
+void ?{}( Close_Failure & ex, ofstream & ostream ) with(ex) {
+	virtual_table = &Close_Failure_vt;
+	ex.ostream = &ostream;									// qualify: member is shadowed by same-named parameter under with(ex)
+	tag = 1;
+} // ?{}
+
+void ?{}( Close_Failure & ex, ifstream & istream ) with(ex) {
+	virtual_table = &Close_Failure_vt;
+	ex.istream = &istream;									// qualify: member is shadowed by same-named parameter under with(ex)
+	tag = 0;
+} // ?{}
 
@@ -382,14 +375,14 @@
 
 // exception I/O constructors
-void ?{}( Write_Failure & this, ofstream & ostream ) {
-	this.virtual_table = &Write_Failure_vt;
-	this.ostream = &ostream;
-	this.tag = 1;
-} // ?{}
-
-void ?{}( Write_Failure & this, ifstream & istream ) {
-	this.virtual_table = &Write_Failure_vt;
-	this.istream = &istream;
-	this.tag = 0;
+void ?{}( Write_Failure & ex, ofstream & ostream ) with(ex) {
+	virtual_table = &Write_Failure_vt;
+	ex.ostream = &ostream;									// qualify: member is shadowed by same-named parameter under with(ex)
+	tag = 1;
+} // ?{}
+
+void ?{}( Write_Failure & ex, ifstream & istream ) with(ex) {
+	virtual_table = &Write_Failure_vt;
+	ex.istream = &istream;									// qualify: member is shadowed by same-named parameter under with(ex)
+	tag = 0;
+} // ?{}
 
@@ -398,14 +391,14 @@
 
 // exception I/O constructors
-void ?{}( Read_Failure & this, ofstream & ostream ) {
-	this.virtual_table = &Read_Failure_vt;
-	this.ostream = &ostream;
-	this.tag = 1;
-} // ?{}
-
-void ?{}( Read_Failure & this, ifstream & istream ) {
-	this.virtual_table = &Read_Failure_vt;
-	this.istream = &istream;
-	this.tag = 0;
+void ?{}( Read_Failure & ex, ofstream & ostream ) with(ex) {
+	virtual_table = &Read_Failure_vt;
+	ex.ostream = &ostream;									// qualify: member is shadowed by same-named parameter under with(ex)
+	tag = 1;
+} // ?{}
+
+void ?{}( Read_Failure & ex, ifstream & istream ) with(ex) {
+	virtual_table = &Read_Failure_vt;
+	ex.istream = &istream;									// qualify: member is shadowed by same-named parameter under with(ex)
+	tag = 0;
+} // ?{}
 
