- Timestamp: Nov 14, 2022, 11:52:44 AM (3 years ago)
- Branches: ADT, ast-experimental, master
- Children: 7d9598d8
- Parents: b77f0e1 (diff), 19a8c40 (diff)

Note: this is a merge changeset; the changes displayed below correspond to the merge itself. Use the (diff) links above to see all the changes relative to each parent.

- Location: libcfa/src
- Files: 4 added, 2 deleted, 21 edited
libcfa/src/Makefile.am
rb77f0e1 r63be3387 62 62 containers/array.hfa \ 63 63 containers/list.hfa \ 64 containers/queueLockFree.hfa \ 65 containers/stackLockFree.hfa \ 64 containers/lockfree.hfa \ 66 65 containers/string_sharectx.hfa \ 67 66 containers/vector2.hfa \ … … 112 111 concurrency/invoke.h \ 113 112 concurrency/future.hfa \ 113 concurrency/once.hfa \ 114 114 concurrency/kernel/fwd.hfa \ 115 115 concurrency/mutex_stmt.hfa … … 127 127 128 128 thread_libsrc = ${inst_thread_headers_src} ${inst_thread_headers_src:.hfa=.cfa} \ 129 interpose_thread.cfa \ 129 130 bits/signal.hfa \ 130 131 concurrency/clib/cfathread.cfa \ … … 145 146 concurrency/stats.cfa \ 146 147 concurrency/stats.hfa \ 147 concurrency/stats.hfa 148 concurrency/stats.hfa \ 149 concurrency/pthread.cfa 148 150 149 151 else -
libcfa/src/bits/containers.hfa
rb77f0e1 r63be3387 152 152 153 153 void append( __queue(T) & this, T * val ) with(this) { 154 verify(get_next( *val ) == 0p); 154 155 verify(this.tail != 0p); 155 156 verify(*this.tail == 1p); -
libcfa/src/bits/defs.hfa
rb77f0e1 r63be3387 30 30 #ifdef __cforall 31 31 #define __cfa_anonymous_object(x) inline struct x 32 #define __cfa_dlink(x) inline dlink(x)33 32 #else 34 33 #define __cfa_anonymous_object(x) struct x __cfa_anonymous_object 35 #define __cfa_dlink(x) struct { struct x * next; struct x * back; } __dlink_substitute36 34 #endif 37 35 -
libcfa/src/concurrency/clib/cfathread.cfa
rb77f0e1 r63be3387 172 172 173 173 pthread_attr_t attr; 174 if (int ret = pthread_attr_init(&attr); 0 != ret) {174 if (int ret = __cfaabi_pthread_attr_init(&attr); 0 != ret) { 175 175 abort | "failed to create master epoll thread attr: " | ret | strerror(ret); 176 176 } 177 177 178 if (int ret = pthread_create(&master_poller, &attr, master_epoll, 0p); 0 != ret) {178 if (int ret = __cfaabi_pthread_create(&master_poller, &attr, master_epoll, 0p); 0 != ret) { 179 179 abort | "failed to create master epoll thread: " | ret | strerror(ret); 180 180 } -
libcfa/src/concurrency/invoke.h
rb77f0e1 r63be3387 146 146 147 147 // Link lists fields 148 // instrusive link field for threads 148 // instrusive link field for threads in the ready-queue 149 149 struct __thread_desc_link { 150 150 struct thread$ * next; 151 151 volatile unsigned long long ts; 152 152 }; 153 154 // Link lists fields 155 // instrusive link field for threads in the user_link/cltr_link 156 struct __thread_user_link { 157 #ifdef __cforall 158 inline dlink(thread$); 159 #else 160 struct thread$ * next; struct thread$ * back; 161 #endif 162 }; 163 _Static_assert(sizeof(struct __thread_user_link) == 2 * sizeof(struct thread$ *), "__thread_user_link should be consistent in C and Cforall"); 153 164 154 165 struct thread$ { … … 159 170 // Link lists fields 160 171 // instrusive link field for threads 161 struct __thread_desc_link link;172 struct __thread_desc_link rdy_link; 162 173 163 174 // current execution status for coroutine … … 195 206 struct __monitor_group_t monitors; 196 207 197 // used to put threads on dlistdata structure198 __cfa_dlink(thread$);199 200 struct { 201 struct thread$ * next;202 struct thread$ * prev;203 } node;208 // intrusive link fields, used for locks, monitors and any user defined data structure 209 // default link fields for dlist 210 struct __thread_user_link user_link; 211 212 // secondary intrusive link fields, used for global cluster list 213 // default link fields for dlist 214 struct __thread_user_link cltr_link; 204 215 205 216 // used to store state between clh lock/unlock … … 214 225 215 226 #if defined( __CFA_WITH_VERIFY__ ) 227 struct processor * volatile executing; 216 228 void * canary; 217 229 #endif 218 230 }; 219 #ifdef __cforall 220 P9_EMBEDDED( thread$, dlink(thread$) ) 221 #endif 231 222 232 // Wrapper for gdb 223 233 struct cfathread_thread_t { struct thread$ debug; }; … … 231 241 #ifdef __cforall 232 242 extern "Cforall" { 243 static inline thread$ * volatile & ?`next ( thread$ * this ) { 244 return this->user_link.next; 245 } 233 246 234 247 static inline thread$ *& get_next( thread$ & this ) __attribute__((const)) { 235 return this.link.next; 236 } 237 238 static inline [thread$ *&, thread$ *& ] __get( thread$ & this ) __attribute__((const)) { 239 return this.node.[next, prev]; 240 } 248 return this.user_link.next; 249 } 250 251 static inline tytagref( dlink(thread$), dlink(thread$) ) ?`inner( thread$ & this ) { 252 dlink(thread$) & b = this.user_link; 253 tytagref( dlink(thread$), dlink(thread$) ) result = { b }; 254 return result; 255 } 256 257 static inline tytagref(struct __thread_user_link, dlink(thread$)) ?`inner( struct thread$ & this ) { 258 struct __thread_user_link & ib = this.cltr_link; 259 dlink(thread$) & b = ib`inner; 260 tytagref(struct __thread_user_link, dlink(thread$)) result = { b }; 261 return result; 262 } 263 264 P9_EMBEDDED(struct __thread_user_link, dlink(thread$)) 241 265 242 266 static inline void ?{}(__monitor_group_t & this) { -
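The new __thread_user_link / cltr_link pair replaces the old ad-hoc node.next/node.prev fields: a thread now carries two intrusive link records, one for user-level structures (locks, monitors) and one for the cluster-wide thread list, so it can sit on both at once. Stripped of the Cforall dlink/P9_EMBEDDED machinery, the underlying idea can be sketched in plain C (all names below are illustrative, not the runtime's API):

#include <stdio.h>
#include <stddef.h>

/* Plain-C sketch: one object carries two embedded link records so it can be
   a member of two intrusive lists at the same time. */

struct link { struct task * next; struct task * prev; };

struct task {
    int id;
    struct link user_link;   /* links for locks, monitors, user structures */
    struct link cltr_link;   /* links for the cluster-wide thread list */
};

struct list { struct task * head; size_t offset; };   /* offset selects the link field */

static struct link * link_of( struct task * t, struct list * l ) {
    return (struct link *)((char *)t + l->offset);
}

static void push_front( struct list * l, struct task * t ) {
    struct link * ln = link_of( t, l );
    ln->prev = NULL;
    ln->next = l->head;
    if ( l->head ) link_of( l->head, l )->prev = t;
    l->head = t;
}

int main(void) {
    struct list users = { NULL, offsetof( struct task, user_link ) };
    struct list cltr  = { NULL, offsetof( struct task, cltr_link ) };
    struct task a = { .id = 1 }, b = { .id = 2 };

    push_front( &users, &a );   /* a is on the user list ...           */
    push_front( &cltr,  &a );   /* ... and on the cluster list as well */
    push_front( &cltr,  &b );

    for ( struct task * t = cltr.head; t; t = link_of( t, &cltr )->next )
        printf( "cluster list: task %d\n", t->id );
    return 0;
}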
libcfa/src/concurrency/io.cfa
rb77f0e1 r63be3387 610 610 if( we ) { 611 611 sigval_t value = { PREEMPT_IO }; 612 pthread_sigqueue(ctx->proc->kernel_thread, SIGUSR1, value);612 __cfaabi_pthread_sigqueue(ctx->proc->kernel_thread, SIGUSR1, value); 613 613 } 614 614 … … 639 639 } 640 640 } 641 642 #if defined(CFA_WITH_IO_URING_IDLE)643 bool __kernel_read(struct processor * proc, io_future_t & future, iovec & iov, int fd) {644 io_context$ * ctx = proc->io.ctx;645 /* paranoid */ verify( ! __preemption_enabled() );646 /* paranoid */ verify( proc == __cfaabi_tls.this_processor );647 /* paranoid */ verify( ctx );648 649 __u32 idx;650 struct io_uring_sqe * sqe;651 652 // We can proceed to the fast path653 if( !__alloc(ctx, &idx, 1) ) {654 /* paranoid */ verify( false ); // for now check if this happens, next time just abort the sleep.655 return false;656 }657 658 // Allocation was successful659 __fill( &sqe, 1, &idx, ctx );660 661 sqe->user_data = (uintptr_t)&future;662 sqe->flags = 0;663 sqe->fd = fd;664 sqe->off = 0;665 sqe->ioprio = 0;666 sqe->fsync_flags = 0;667 sqe->__pad2[0] = 0;668 sqe->__pad2[1] = 0;669 sqe->__pad2[2] = 0;670 671 #if defined(CFA_HAVE_IORING_OP_READ)672 sqe->opcode = IORING_OP_READ;673 sqe->addr = (uint64_t)iov.iov_base;674 sqe->len = iov.iov_len;675 #elif defined(CFA_HAVE_READV) && defined(CFA_HAVE_IORING_OP_READV)676 sqe->opcode = IORING_OP_READV;677 sqe->addr = (uintptr_t)&iov;678 sqe->len = 1;679 #else680 #error CFA_WITH_IO_URING_IDLE but none of CFA_HAVE_READV, CFA_HAVE_IORING_OP_READV or CFA_HAVE_IORING_OP_READ defined681 #endif682 683 asm volatile("": : :"memory");684 685 /* paranoid */ verify( sqe->user_data == (uintptr_t)&future );686 __submit_only( ctx, &idx, 1 );687 688 /* paranoid */ verify( proc == __cfaabi_tls.this_processor );689 /* paranoid */ verify( ! __preemption_enabled() );690 691 return true;692 }693 694 void __cfa_io_idle( struct processor * proc ) {695 iovec iov;696 __atomic_acquire( &proc->io.ctx->cq.lock );697 698 __attribute__((used)) volatile bool was_reset = false;699 700 with( proc->idle_wctx) {701 702 // Do we already have a pending read703 if(available(*ftr)) {704 // There is no pending read, we need to add one705 reset(*ftr);706 707 iov.iov_base = rdbuf;708 iov.iov_len = sizeof(eventfd_t);709 __kernel_read(proc, *ftr, iov, evfd );710 ftr->result = 0xDEADDEAD;711 *((eventfd_t *)rdbuf) = 0xDEADDEADDEADDEAD;712 was_reset = true;713 }714 }715 716 if( !__atomic_load_n( &proc->do_terminate, __ATOMIC_SEQ_CST ) ) {717 __ioarbiter_flush( *proc->io.ctx );718 proc->idle_wctx.sleep_time = rdtscl();719 ioring_syscsll( *proc->io.ctx, 1, IORING_ENTER_GETEVENTS);720 }721 722 ready_schedule_lock();723 __cfa_do_drain( proc->io.ctx, proc->cltr );724 ready_schedule_unlock();725 726 asm volatile ("" :: "m" (was_reset));727 }728 #endif729 641 #endif -
libcfa/src/concurrency/io/setup.cfa
rb77f0e1 r63be3387 34 34 bool __cfa_io_flush( processor * proc ) { return false; } 35 35 bool __cfa_io_drain( processor * proc ) __attribute__((nonnull (1))) { return false; } 36 void __cfa_io_idle ( processor * ) __attribute__((nonnull (1))) {}37 36 void __cfa_io_stop ( processor * proc ) {} 38 37 … … 317 316 } 318 317 319 //=============================================================================================320 // I/O Context Sleep321 //=============================================================================================322 // static inline void __epoll_ctl(io_context$ & ctx, int op, const char * error) {323 // struct epoll_event ev;324 // ev.events = EPOLLIN | EPOLLONESHOT;325 // ev.data.u64 = (__u64)&ctx;326 // int ret = epoll_ctl(iopoll.epollfd, op, ctx.efd, &ev);327 // if (ret < 0) {328 // abort( "KERNEL ERROR: EPOLL %s - (%d) %s\n", error, (int)errno, strerror(errno) );329 // }330 // }331 332 // static void __epoll_register(io_context$ & ctx) {333 // __epoll_ctl(ctx, EPOLL_CTL_ADD, "ADD");334 // }335 336 // static void __epoll_unregister(io_context$ & ctx) {337 // // Read the current epoch so we know when to stop338 // size_t curr = __atomic_load_n(&iopoll.epoch, __ATOMIC_SEQ_CST);339 340 // // Remove the fd from the iopoller341 // __epoll_ctl(ctx, EPOLL_CTL_DEL, "REMOVE");342 343 // // Notify the io poller thread of the shutdown344 // iopoll.run = false;345 // sigval val = { 1 };346 // pthread_sigqueue( iopoll.thrd, SIGUSR1, val );347 348 // // Make sure all this is done349 // __atomic_thread_fence(__ATOMIC_SEQ_CST);350 351 // // Wait for the next epoch352 // while(curr == iopoll.epoch && !iopoll.stopped) Pause();353 // }354 355 // void __ioctx_prepare_block(io_context$ & ctx) {356 // __cfadbg_print_safe(io_core, "Kernel I/O - epoll : Re-arming io poller %d (%p)\n", ctx.fd, &ctx);357 // __epoll_ctl(ctx, EPOLL_CTL_MOD, "REARM");358 // }359 360 318 361 319 //============================================================================================= -
libcfa/src/concurrency/kernel.cfa
rb77f0e1 r63be3387 138 138 extern bool __cfa_io_drain( processor * proc ) __attribute__((nonnull (1))); 139 139 extern bool __cfa_io_flush( processor * ) __attribute__((nonnull (1))); 140 extern void __cfa_io_idle( processor * ) __attribute__((nonnull (1))); 141 142 #if defined(CFA_WITH_IO_URING_IDLE) 143 extern bool __kernel_read(processor * proc, io_future_t & future, iovec &, int fd); 144 #endif 140 145 141 146 142 extern void __disable_interrupts_hard(); … … 162 158 verify(this); 163 159 164 /* paranoid */ verify( this->idle_wctx.ftr != 0p );165 /* paranoid */ verify( this->idle_wctx.rdbuf != 0p );166 167 // used for idle sleep when io_uring is present168 // mark it as already fulfilled so we know if there is a pending request or not169 this->idle_wctx.ftr->self.ptr = 1p;170 171 160 __cfadbg_print_safe(runtime_core, "Kernel : core %p starting\n", this); 172 161 #if !defined(__CFA_NO_STATISTICS__) … … 291 280 /* paranoid */ verify( ! __preemption_enabled() ); 292 281 /* paranoid */ verifyf( thrd_dst->state == Ready || thrd_dst->preempted != __NO_PREEMPTION, "state : %d, preempted %d\n", thrd_dst->state, thrd_dst->preempted); 293 /* paranoid */ verifyf( thrd_dst-> link.next == 0p, "Expected null got %p", thrd_dst->link.next );282 /* paranoid */ verifyf( thrd_dst->rdy_link.next == 0p, "Expected null got %p", thrd_dst->rdy_link.next ); 294 283 __builtin_prefetch( thrd_dst->context.SP ); 295 284 … … 321 310 /* paranoid */ verifyf( ((uintptr_t)thrd_dst->context.SP) < ((uintptr_t)__get_stack(thrd_dst->curr_cor)->base ) || thrd_dst->curr_cor == proc_cor || thrd_dst->corctx_flag, "ERROR : Destination thread$ %p has been corrupted.\n StackPointer too small.\n", thrd_dst ); // add escape condition if we are setting up the processor 322 311 /* paranoid */ verifyf( ((uintptr_t)thrd_dst->context.SP) > ((uintptr_t)__get_stack(thrd_dst->curr_cor)->limit) || thrd_dst->curr_cor == proc_cor || thrd_dst->corctx_flag, "ERROR : Destination thread$ %p has been corrupted.\n StackPointer too large.\n", thrd_dst ); // add escape condition if we are setting up the processor 312 /* paranoid */ verify( __atomic_exchange_n( &thrd_dst->executing, this, __ATOMIC_SEQ_CST) == 0p ); 323 313 /* paranoid */ verify( 0x0D15EA5E0D15EA5Ep == thrd_dst->canary ); 324 314 … … 332 322 333 323 /* paranoid */ verify( 0x0D15EA5E0D15EA5Ep == thrd_dst->canary ); 324 /* paranoid */ verify( __atomic_exchange_n( &thrd_dst->executing, 0p, __ATOMIC_SEQ_CST) == this ); 334 325 /* paranoid */ verifyf( ((uintptr_t)thrd_dst->context.SP) > ((uintptr_t)__get_stack(thrd_dst->curr_cor)->limit) || thrd_dst->corctx_flag, "ERROR : Destination thread$ %p has been corrupted.\n StackPointer too large.\n", thrd_dst ); 335 326 /* paranoid */ verifyf( ((uintptr_t)thrd_dst->context.SP) < ((uintptr_t)__get_stack(thrd_dst->curr_cor)->base ) || thrd_dst->corctx_flag, "ERROR : Destination thread$ %p has been corrupted.\n StackPointer too small.\n", thrd_dst ); 327 /* paranoid */ verify( thrd_dst->state != Halted ); 336 328 /* paranoid */ verify( thrd_dst->context.SP ); 337 /* paranoid */ verify( thrd_dst->curr_cluster == this->cltr );338 329 /* paranoid */ verify( kernelTLS().this_thread == thrd_dst ); 339 330 /* paranoid */ verify( ! 
__preemption_enabled() ); … … 452 443 "Error preempted thread marked as not currently running, state %d, preemption %d\n", thrd->state, thrd->preempted ); 453 444 /* paranoid */ #endif 454 /* paranoid */ verifyf( thrd-> link.next == 0p, "Expected null got %p", thrd->link.next );445 /* paranoid */ verifyf( thrd->rdy_link.next == 0p, "Expected null got %p", thrd->rdy_link.next ); 455 446 /* paranoid */ verify( 0x0D15EA5E0D15EA5Ep == thrd->canary ); 456 447 … … 600 591 /* paranoid */ verifyf( ((uintptr_t)thrd->context.SP) < ((uintptr_t)__get_stack(thrd->curr_cor)->base ), "ERROR : thread$ %p has been corrupted.\n StackPointer too small.\n", thrd ); 601 592 602 thrd->state = Halting;603 593 if( TICKET_RUNNING != thrd->ticket ) { abort( "Thread terminated with pending unpark" ); } 604 594 if( thrd != this->owner ) { abort( "Thread internal monitor has incorrect owner" ); } 605 595 if( this->recursion != 1) { abort( "Thread internal monitor has unbalanced recursion" ); } 596 597 thrd->state = Halting; 598 thrd->ticket = TICKET_DEAD; 606 599 607 600 // Leave the thread … … 624 617 // If that is the case, abandon the preemption. 625 618 bool preempted = false; 626 if(thrd-> link.next == 0p) {619 if(thrd->rdy_link.next == 0p) { 627 620 preempted = true; 628 621 thrd->preempted = reason; … … 726 719 727 720 728 #if !defined(CFA_WITH_IO_URING_IDLE) 729 #if !defined(__CFA_NO_STATISTICS__) 730 if(this->print_halts) { 731 __cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 0\n", this->unique_id, rdtscl()); 721 #if !defined(__CFA_NO_STATISTICS__) 722 if(this->print_halts) { 723 __cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 0\n", this->unique_id, rdtscl()); 724 } 725 #endif 726 727 __cfadbg_print_safe(runtime_core, "Kernel : core %p waiting on eventfd %d\n", this, this->idle_fd); 728 729 { 730 eventfd_t val; 731 ssize_t ret = read( this->idle_wctx.evfd, &val, sizeof(val) ); 732 if(ret < 0) { 733 switch((int)errno) { 734 case EAGAIN: 735 #if EAGAIN != EWOULDBLOCK 736 case EWOULDBLOCK: 737 #endif 738 case EINTR: 739 // No need to do anything special here, just assume it's a legitimate wake-up 740 break; 741 default: 742 abort( "KERNEL : internal error, read failure on idle eventfd, error(%d) %s.", (int)errno, strerror( (int)errno ) ); 732 743 } 733 #endif 734 735 __cfadbg_print_safe(runtime_core, "Kernel : core %p waiting on eventfd %d\n", this, this->idle_fd); 736 737 { 738 eventfd_t val; 739 ssize_t ret = read( this->idle_wctx.evfd, &val, sizeof(val) ); 740 if(ret < 0) { 741 switch((int)errno) { 742 case EAGAIN: 743 #if EAGAIN != EWOULDBLOCK 744 case EWOULDBLOCK: 745 #endif 746 case EINTR: 747 // No need to do anything special here, just assume it's a legitimate wake-up 748 break; 749 default: 750 abort( "KERNEL : internal error, read failure on idle eventfd, error(%d) %s.", (int)errno, strerror( (int)errno ) ); 751 } 752 } 753 } 754 755 #if !defined(__CFA_NO_STATISTICS__) 756 if(this->print_halts) { 757 __cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 1\n", this->unique_id, rdtscl()); 758 } 759 #endif 760 #else 761 __cfa_io_idle( this ); 744 } 745 } 746 747 #if !defined(__CFA_NO_STATISTICS__) 748 if(this->print_halts) { 749 __cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 1\n", this->unique_id, rdtscl()); 750 } 762 751 #endif 763 752 } … … 775 764 insert_first(this.idles, proc); 776 765 766 // update the pointer to the head wait context, which should now point to this proc. 
777 767 __atomic_store_n(&this.fdw, &proc.idle_wctx, __ATOMIC_SEQ_CST); 778 768 unlock( this ); … … 791 781 792 782 { 783 // update the pointer to the head wait context 793 784 struct __fd_waitctx * wctx = 0; 794 785 if(!this.idles`isEmpty) wctx = &this.idles`first.idle_wctx; -
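With the io_uring idle path removed, a processor now always parks by blocking on a read of its idle eventfd and is woken by a write to that descriptor (or by one of the spurious-wakeup errno cases handled above). A minimal C sketch of that sleep/wake handshake, with illustrative names only (compile with -pthread):

#include <stdio.h>
#include <pthread.h>
#include <unistd.h>
#include <sys/eventfd.h>

static int evfd;

static void * idler( void * arg ) {
    (void)arg;
    eventfd_t val = 0;
    printf( "idler: parking on eventfd\n" );
    if ( read( evfd, &val, sizeof(val) ) < 0 )   /* blocks until someone writes */
        perror( "read" );
    printf( "idler: woken, counter value %llu\n", (unsigned long long)val );
    return NULL;
}

int main(void) {
    evfd = eventfd( 0, 0 );                      /* counting eventfd, blocking reads */
    pthread_t t;
    pthread_create( &t, NULL, idler, NULL );
    sleep( 1 );                                  /* give the idler time to block */
    eventfd_write( evfd, 1 );                    /* the wake-up: write unblocks the read */
    pthread_join( t, NULL );
    close( evfd );
    return 0;
}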
libcfa/src/concurrency/kernel.hfa
rb77f0e1 r63be3387 64 64 // 1 - means the proc should wake-up immediately 65 65 // FD - means the proc is going asleep and should be woken by writing to the FD. 66 // The FD value should always be the evfd field just below. 66 67 volatile int sem; 67 68 … … 69 70 int evfd; 70 71 71 // buffer into which the proc will read from evfd 72 // unused if not using io_uring for idle sleep 73 void * rdbuf; 74 75 // future use to track the read of the eventfd 76 // unused if not using io_uring for idle sleep 77 io_future_t * ftr; 78 72 // Used for debugging, should be removed eventually. 79 73 volatile unsigned long long wake__time; 80 74 volatile unsigned long long sleep_time; … … 160 154 // P9_EMBEDDED( processor, dlink(processor) ) 161 155 static inline tytagref( dlink(processor), dlink(processor) ) ?`inner( processor & this ) { 162 163 164 156 dlink(processor) & b = this.link; 157 tytagref( dlink(processor), dlink(processor) ) result = { b }; 158 return result; 165 159 } 166 160 … … 256 250 // List of threads 257 251 __spinlock_t thread_list_lock; 258 __dllist_t(struct thread$) threads;252 dlist(struct thread$, struct __thread_user_link) threads; 259 253 unsigned int nthreads; 260 254 … … 269 263 io_context_params params; 270 264 } io; 265 266 struct { 267 struct processor ** procs; 268 unsigned cnt; 269 } managed; 271 270 272 271 #if !defined(__CFA_NO_STATISTICS__) … … 298 297 static inline struct cluster * active_cluster () { return publicTLS_get( this_processor )->cltr; } 299 298 299 // set the number of internal processors 300 // these processors are in addition to any explicitly declared processors 301 unsigned set_concurrency( cluster & this, unsigned new_count ); 302 300 303 #if !defined(__CFA_NO_STATISTICS__) 301 304 void print_stats_now( cluster & this, int flags ); -
libcfa/src/concurrency/kernel/cluster.cfa
rb77f0e1 r63be3387 483 483 484 484 // We add a boat-load of assertions here because the anchor code is very fragile 485 /* paranoid */ _Static_assert( offsetof( thread$, link ) == nested_offsetof(__intrusive_lane_t, l.anchor) );486 /* paranoid */ verify( offsetof( thread$, link ) == nested_offsetof(__intrusive_lane_t, l.anchor) );487 /* paranoid */ verify( ((uintptr_t)( mock_head(this) ) + offsetof( thread$, link )) == (uintptr_t)(&this.l.anchor) );488 /* paranoid */ verify( &mock_head(this)-> link.next == &this.l.anchor.next );489 /* paranoid */ verify( &mock_head(this)-> link.ts == &this.l.anchor.ts );490 /* paranoid */ verify( mock_head(this)-> link.next == 0p );491 /* paranoid */ verify( mock_head(this)-> link.ts == MAX );485 /* paranoid */ _Static_assert( offsetof( thread$, rdy_link ) == nested_offsetof(__intrusive_lane_t, l.anchor) ); 486 /* paranoid */ verify( offsetof( thread$, rdy_link ) == nested_offsetof(__intrusive_lane_t, l.anchor) ); 487 /* paranoid */ verify( ((uintptr_t)( mock_head(this) ) + offsetof( thread$, rdy_link )) == (uintptr_t)(&this.l.anchor) ); 488 /* paranoid */ verify( &mock_head(this)->rdy_link.next == &this.l.anchor.next ); 489 /* paranoid */ verify( &mock_head(this)->rdy_link.ts == &this.l.anchor.ts ); 490 /* paranoid */ verify( mock_head(this)->rdy_link.next == 0p ); 491 /* paranoid */ verify( mock_head(this)->rdy_link.ts == MAX ); 492 492 /* paranoid */ verify( mock_head(this) == this.l.prev ); 493 493 /* paranoid */ verify( __alignof__(__intrusive_lane_t) == 64 ); -
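The battery of assertions above protects the "mock head" trick: a ready-queue lane stores only an anchor link, and a fake thread pointer is synthesized by subtracting offsetof(thread$, rdy_link) so the anchor can be manipulated as if it were a node's rdy_link field. A simplified C illustration of the same pointer arithmetic, with hypothetical types (the real code pins the layout down with the _Static_asserts shown above):

#include <stdio.h>
#include <stddef.h>

struct node { int value; struct { struct node * next; } rdy_link; };
struct lane { struct { struct node * next; } anchor; };

/* synthesize a fake node whose rdy_link field aliases the lane's anchor */
static struct node * mock_head( struct lane * l ) {
    return (struct node *)((char *)&l->anchor - offsetof( struct node, rdy_link ));
}

int main(void) {
    struct lane l = { { NULL } };
    struct node n = { 42, { NULL } };

    mock_head( &l )->rdy_link.next = &n;          /* actually writes l.anchor.next */
    printf( "anchor.next -> value %d\n", l.anchor.next->value );
    return 0;
}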
libcfa/src/concurrency/kernel/private.hfa
rb77f0e1 r63be3387 20 20 #endif 21 21 22 #include <signal.h> 23 22 24 #include "kernel.hfa" 23 25 #include "thread.hfa" … … 39 41 } 40 42 41 // Defines whether or not we *want* to use io_uring_enter as the idle_sleep blocking call42 // #define CFA_WANT_IO_URING_IDLE43 44 // Defines whether or not we *can* use io_uring_enter as the idle_sleep blocking call45 #if defined(CFA_WANT_IO_URING_IDLE) && defined(CFA_HAVE_LINUX_IO_URING_H)46 #if defined(CFA_HAVE_IORING_OP_READ) || (defined(CFA_HAVE_READV) && defined(CFA_HAVE_IORING_OP_READV))47 #define CFA_WITH_IO_URING_IDLE48 #endif49 #endif50 51 43 // #define READYQ_USE_LINEAR_AVG 52 44 #define READYQ_USE_LOGDBL_AVG … … 63 55 #endif 64 56 57 extern "C" { 58 __attribute__((visibility("protected"))) int __cfaabi_pthread_create(pthread_t *_thread, const pthread_attr_t *attr, void *(*start_routine) (void *), void *arg); 59 __attribute__((visibility("protected"))) int __cfaabi_pthread_join(pthread_t _thread, void **retval); 60 __attribute__((visibility("protected"))) pthread_t __cfaabi_pthread_self(void); 61 __attribute__((visibility("protected"))) int __cfaabi_pthread_attr_init(pthread_attr_t *attr); 62 __attribute__((visibility("protected"))) int __cfaabi_pthread_attr_destroy(pthread_attr_t *attr); 63 __attribute__((visibility("protected"))) int __cfaabi_pthread_attr_setstack( pthread_attr_t *attr, void *stackaddr, size_t stacksize ); 64 __attribute__((visibility("protected"))) int __cfaabi_pthread_attr_getstacksize( const pthread_attr_t *attr, size_t *stacksize ); 65 __attribute__((visibility("protected"))) int __cfaabi_pthread_sigqueue(pthread_t _thread, int sig, const union sigval value); 66 __attribute__((visibility("protected"))) int __cfaabi_pthread_sigmask( int how, const sigset_t *set, sigset_t *oset); 67 } 68 65 69 //----------------------------------------------------------------------------- 66 70 // Scheduler … … 153 157 #define TICKET_RUNNING ( 0) // thread is running 154 158 #define TICKET_UNBLOCK ( 1) // thread should ignore next block 159 #define TICKET_DEAD (0xDEAD) // thread should never be unparked 155 160 156 161 //----------------------------------------------------------------------------- -
libcfa/src/concurrency/kernel/startup.cfa
rb77f0e1 r63be3387 16 16 #define __cforall_thread__ 17 17 #define _GNU_SOURCE 18 19 // #define __CFA_DEBUG_PRINT_RUNTIME_CORE__ 18 20 19 21 // C Includes … … 113 115 KERNEL_STORAGE(thread$, mainThread); 114 116 KERNEL_STORAGE(__stack_t, mainThreadCtx); 115 // KERNEL_STORAGE(__scheduler_RWLock_t, __scheduler_lock);116 KERNEL_STORAGE(eventfd_t, mainIdleEventFd);117 KERNEL_STORAGE(io_future_t, mainIdleFuture);118 117 #if !defined(__CFA_NO_STATISTICS__) 119 118 KERNEL_STORAGE(__stats_t, mainProcStats); … … 222 221 ( this.runner ){}; 223 222 init( this, "Main Processor", *mainCluster, 0p ); 224 kernel_thread = pthread_self();223 kernel_thread = __cfaabi_pthread_self(); 225 224 226 225 runner{ &this }; … … 232 231 mainProcessor = (processor *)&storage_mainProcessor; 233 232 (*mainProcessor){}; 234 235 mainProcessor->idle_wctx.rdbuf = &storage_mainIdleEventFd;236 mainProcessor->idle_wctx.ftr = (io_future_t*)&storage_mainIdleFuture;237 /* paranoid */ verify( sizeof(storage_mainIdleEventFd) == sizeof(eventfd_t) );238 233 239 234 __cfa_io_start( mainProcessor ); … … 283 278 } 284 279 280 extern "C"{ 281 void pthread_delete_kernel_threads_(); 282 } 283 284 285 285 static void __kernel_shutdown(void) { 286 286 if(!cfa_main_returned) return; 287 288 //delete kernel threads for pthread_concurrency 289 pthread_delete_kernel_threads_(); 290 287 291 /* paranoid */ verify( __preemption_enabled() ); 288 292 disable_interrupts(); … … 327 331 328 332 /* paranoid */ verify( this.do_terminate == true ); 329 __cfa abi_dbg_print_safe("Kernel : destroyed main processor context %p\n", &runner);333 __cfadbg_print_safe(runtime_core, "Kernel : destroyed main processor context %p\n", &runner); 330 334 } 331 335 … … 373 377 register_tls( proc ); 374 378 375 // used for idle sleep when io_uring is present376 io_future_t future;377 eventfd_t idle_buf;378 proc->idle_wctx.ftr = &future;379 proc->idle_wctx.rdbuf = &idle_buf;380 381 382 379 // SKULLDUGGERY: We want to create a context for the processor coroutine 383 380 // which is needed for the 2-step context switch. However, there is no reason … … 388 385 (proc->runner){ proc, &info }; 389 386 390 __cfa abi_dbg_print_safe("Coroutine : created stack %p\n", get_coroutine(proc->runner)->stack.storage);387 __cfadbg_print_safe(runtime_core, "Coroutine : created stack %p\n", get_coroutine(proc->runner)->stack.storage); 391 388 392 389 //Set global state … … 514 511 self_mon.recursion = 1; 515 512 self_mon_p = &self_mon; 516 link.next = 0p;517 link.ts = MAX;513 rdy_link.next = 0p; 514 rdy_link.ts = MAX; 518 515 preferred = ready_queue_new_preferred(); 519 516 last_proc = 0p; 520 517 random_state = __global_random_mask ? 
__global_random_prime : __global_random_prime ^ rdtscl(); 521 518 #if defined( __CFA_WITH_VERIFY__ ) 519 executing = 0p; 522 520 canary = 0x0D15EA5E0D15EA5Ep; 523 521 #endif 524 522 525 node.next = 0p;526 node.prev = 0p;527 523 doregister(curr_cluster, this); 528 524 … … 647 643 #endif 648 644 649 threads{ __get};645 threads{}; 650 646 651 647 io.arbiter = create(); 652 648 io.params = io_params; 649 650 managed.procs = 0p; 651 managed.cnt = 0; 653 652 654 653 doregister(this); … … 667 666 668 667 void ^?{}(cluster & this) libcfa_public { 668 set_concurrency( this, 0 ); 669 669 670 destroy(this.io.arbiter); 670 671 … … 722 723 lock (cltr->thread_list_lock __cfaabi_dbg_ctx2); 723 724 cltr->nthreads += 1; 724 push_front(cltr->threads, thrd);725 insert_first(cltr->threads, thrd); 725 726 unlock (cltr->thread_list_lock); 726 727 } … … 728 729 void unregister( cluster * cltr, thread$ & thrd ) { 729 730 lock (cltr->thread_list_lock __cfaabi_dbg_ctx2); 730 remove(cltr->threads, thrd ); 731 cltr->nthreads -= 1; 731 { 732 tytagref( dlink(thread$), dlink(thread$) ) ?`inner( thread$ & this ) = void; 733 with( DLINK_VIA( thread$, struct __thread_user_link ) ) 734 remove( thrd ); 735 cltr->nthreads -= 1; 736 } 732 737 unlock(cltr->thread_list_lock); 733 738 } … … 777 782 pthread_attr_t attr; 778 783 779 check( pthread_attr_init( &attr ), "pthread_attr_init" ); // initialize attribute784 check( __cfaabi_pthread_attr_init( &attr ), "pthread_attr_init" ); // initialize attribute 780 785 781 786 size_t stacksize = max( PTHREAD_STACK_MIN, DEFAULT_STACK_SIZE ); … … 804 809 #endif 805 810 806 check( pthread_attr_setstack( &attr, stack, stacksize ), "pthread_attr_setstack" );807 check( pthread_create( pthread, &attr, start, arg ), "pthread_create" );811 check( __cfaabi_pthread_attr_setstack( &attr, stack, stacksize ), "pthread_attr_setstack" ); 812 check( __cfaabi_pthread_create( pthread, &attr, start, arg ), "pthread_create" ); 808 813 return stack; 809 814 } 810 815 811 816 void __destroy_pthread( pthread_t pthread, void * stack, void ** retval ) { 812 int err = pthread_join( pthread, retval );817 int err = __cfaabi_pthread_join( pthread, retval ); 813 818 if( err != 0 ) abort("KERNEL ERROR: joining pthread %p caused error %s\n", (void*)pthread, strerror(err)); 814 819 … … 816 821 pthread_attr_t attr; 817 822 818 check( pthread_attr_init( &attr ), "pthread_attr_init" ); // initialize attribute823 check( __cfaabi_pthread_attr_init( &attr ), "pthread_attr_init" ); // initialize attribute 819 824 820 825 size_t stacksize; 821 826 // default stack size, normally defined by shell limit 822 check( pthread_attr_getstacksize( &attr, &stacksize ), "pthread_attr_getstacksize" );827 check( __cfaabi_pthread_attr_getstacksize( &attr, &stacksize ), "pthread_attr_getstacksize" ); 823 828 assert( stacksize >= PTHREAD_STACK_MIN ); 824 829 stacksize += __page_size; … … 838 843 } 839 844 845 unsigned set_concurrency( cluster & this, unsigned new ) libcfa_public { 846 unsigned old = this.managed.cnt; 847 848 __cfadbg_print_safe(runtime_core, "Kernel : resizing cluster from %u to %u\n", old, (unsigned)new); 849 850 // Delete all the old unneeded procs 851 if(old > new) for(i; (unsigned)new ~ old) { 852 __cfadbg_print_safe(runtime_core, "Kernel : destroying %u\n", i); 853 delete( this.managed.procs[i] ); 854 } 855 856 // Allocate new array (uses realloc and memcpies the data) 857 this.managed.procs = alloc( new, this.managed.procs`realloc ); 858 this.managed.cnt = new; 859 860 // Create the desired new procs 861 if(old < new) 
for(i; old ~ new) { 862 __cfadbg_print_safe(runtime_core, "Kernel : constructing %u\n", i); 863 (*(this.managed.procs[i] = alloc())){ this }; 864 } 865 866 // return the old count 867 return old; 868 } 869 840 870 #if defined(__CFA_WITH_VERIFY__) 841 871 static bool verify_fwd_bck_rng(void) { -
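The new set_concurrency routine resizes a cluster's array of internally managed processors: surplus ones are destroyed, the pointer array is reallocated, newly requested ones are constructed, and the old count is returned (the cluster destructor calls it with 0). The shape of that resize logic in plain C, with a stand-in "worker" type, hypothetical names, and error handling elided:

#include <stdio.h>
#include <stdlib.h>

struct worker { unsigned id; };

static struct worker ** procs;   /* stands in for managed.procs */
static unsigned cnt;             /* stands in for managed.cnt   */

static unsigned set_concurrency( unsigned new_count ) {
    unsigned old = cnt;

    for ( unsigned i = new_count; i < old; i += 1 ) {       /* shrinking: delete extras */
        printf( "destroying worker %u\n", procs[i]->id );
        free( procs[i] );
    }

    procs = realloc( procs, new_count * sizeof(*procs) );   /* resize handle array */
    cnt = new_count;

    for ( unsigned i = old; i < new_count; i += 1 ) {        /* growing: create new ones */
        procs[i] = malloc( sizeof(**procs) );
        procs[i]->id = i;
        printf( "constructing worker %u\n", i );
    }
    return old;
}

int main(void) {
    set_concurrency( 3 );
    set_concurrency( 1 );                                    /* shrink back to one worker */
    for ( unsigned i = 0; i < cnt; i += 1 ) free( procs[i] );
    free( procs );
    return 0;
}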
libcfa/src/concurrency/locks.hfa
rb77f0e1 r63be3387 21 21 22 22 #include "bits/weakso_locks.hfa" 23 #include "containers/ queueLockFree.hfa"23 #include "containers/lockfree.hfa" 24 24 #include "containers/list.hfa" 25 25 … … 498 498 } 499 499 500 static inline size_t on_wait(simple_owner_lock & this) with(this) { 500 static inline size_t on_wait(simple_owner_lock & this) with(this) { 501 501 lock( lock __cfaabi_dbg_ctx2 ); 502 502 /* paranoid */ verifyf( owner != 0p, "Attempt to release lock %p that isn't held", &this ); -
libcfa/src/concurrency/monitor.cfa
rb77f0e1 r63be3387 122 122 123 123 // Some one else has the monitor, wait in line for it 124 /* paranoid */ verify( thrd-> link.next == 0p );124 /* paranoid */ verify( thrd->user_link.next == 0p ); 125 125 append( this->entry_queue, thrd ); 126 /* paranoid */ verify( thrd-> link.next == 1p );126 /* paranoid */ verify( thrd->user_link.next == 1p ); 127 127 128 128 unlock( this->lock ); … … 233 233 234 234 // Some one else has the monitor, wait in line for it 235 /* paranoid */ verify( thrd-> link.next == 0p );235 /* paranoid */ verify( thrd->user_link.next == 0p ); 236 236 append( this->entry_queue, thrd ); 237 /* paranoid */ verify( thrd-> link.next == 1p );237 /* paranoid */ verify( thrd->user_link.next == 1p ); 238 238 unlock( this->lock ); 239 239 … … 791 791 thread$ * new_owner = pop_head( this->entry_queue ); 792 792 /* paranoid */ verifyf( !this->owner || active_thread() == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this ); 793 /* paranoid */ verify( !new_owner || new_owner-> link.next == 0p );793 /* paranoid */ verify( !new_owner || new_owner->user_link.next == 0p ); 794 794 __set_owner( this, new_owner ); 795 795 … … 935 935 __queue_t(thread$) & entry_queue = monitors[0]->entry_queue; 936 936 937 #if defined( __CFA_WITH_VERIFY__ ) 938 thread$ * last = 0p; 939 #endif 937 940 // For each thread in the entry-queue 938 941 for( thread$ ** thrd_it = &entry_queue.head; 939 942 (*thrd_it) != 1p; 940 thrd_it = & (*thrd_it)->link.next943 thrd_it = &get_next(**thrd_it) 941 944 ) { 945 thread$ * curr = *thrd_it; 946 947 /* paranoid */ verifyf( !last || last->user_link.next == curr, "search not making progress, from %p (%p) to %p", last, last->user_link.next, curr ); 948 /* paranoid */ verifyf( curr != last, "search not making progress, from %p to %p", last, curr ); 949 942 950 // For each acceptable check if it matches 943 951 int i = 0; … … 946 954 for( __acceptable_t * it = begin; it != end; it++, i++ ) { 947 955 // Check if we have a match 948 if( *it == (*thrd_it)->monitors ) {956 if( *it == curr->monitors ) { 949 957 950 958 // If we have a match return it … … 953 961 } 954 962 } 963 964 #if defined( __CFA_WITH_VERIFY__ ) 965 last = curr; 966 #endif 955 967 } 956 968 … … 1025 1037 1026 1038 // Some one else has the monitor, wait in line for it 1027 /* paranoid */ verify( thrd-> link.next == 0p );1039 /* paranoid */ verify( thrd->user_link.next == 0p ); 1028 1040 append( this->entry_queue, thrd ); 1029 /* paranoid */ verify( thrd-> link.next == 1p );1041 /* paranoid */ verify( thrd->user_link.next == 1p ); 1030 1042 1031 1043 unlock( this->lock ); -
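The entry-queue search above now advances with thrd_it = &get_next(**thrd_it), i.e. it walks the queue through a pointer-to-pointer, which lets the caller later splice the matching thread out without a back pointer; the added paranoid checks verify the walk keeps making progress. The idiom in plain C (simplified: the real queue uses a 1p sentinel rather than NULL):

#include <stdio.h>

struct thrd { int id; struct thrd * next; };

int main(void) {
    struct thrd c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
    struct thrd * head = &a;

    int wanted = 2;
    for ( struct thrd ** it = &head; *it != NULL; it = &(*it)->next ) {
        if ( (*it)->id == wanted ) {
            struct thrd * found = *it;
            *it = found->next;                  /* unlink in O(1), no prev pointer needed */
            printf( "removed thread %d\n", found->id );
            break;
        }
    }
    for ( struct thrd * p = head; p; p = p->next ) printf( "remaining %d\n", p->id );
    return 0;
}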
libcfa/src/concurrency/preemption.cfa
rb77f0e1 r63be3387 352 352 sigset_t oldset; 353 353 int ret; 354 ret = pthread_sigmask(0, ( const sigset_t * ) 0p, &oldset); // workaround trac#208: cast should be unnecessary354 ret = __cfaabi_pthread_sigmask(0, ( const sigset_t * ) 0p, &oldset); // workaround trac#208: cast should be unnecessary 355 355 if(ret != 0) { abort("ERROR sigprocmask returned %d", ret); } 356 356 … … 385 385 sigaddset( &mask, sig ); 386 386 387 if ( pthread_sigmask( SIG_UNBLOCK, &mask, 0p ) == -1 ) {387 if ( __cfaabi_pthread_sigmask( SIG_UNBLOCK, &mask, 0p ) == -1 ) { 388 388 abort( "internal error, pthread_sigmask" ); 389 389 } … … 396 396 sigaddset( &mask, sig ); 397 397 398 if ( pthread_sigmask( SIG_BLOCK, &mask, 0p ) == -1 ) {398 if ( __cfaabi_pthread_sigmask( SIG_BLOCK, &mask, 0p ) == -1 ) { 399 399 abort( "internal error, pthread_sigmask" ); 400 400 } … … 404 404 static void preempt( processor * this ) { 405 405 sigval_t value = { PREEMPT_NORMAL }; 406 pthread_sigqueue( this->kernel_thread, SIGUSR1, value );406 __cfaabi_pthread_sigqueue( this->kernel_thread, SIGUSR1, value ); 407 407 } 408 408 … … 415 415 sigset_t oldset; 416 416 int ret; 417 ret = pthread_sigmask(0, ( const sigset_t * ) 0p, &oldset); // workaround trac#208: cast should be unnecessary417 ret = __cfaabi_pthread_sigmask(0, ( const sigset_t * ) 0p, &oldset); // workaround trac#208: cast should be unnecessary 418 418 if(ret != 0) { abort("ERROR sigprocmask returned %d", ret); } 419 419 … … 434 434 sigset_t oldset; 435 435 int ret; 436 ret = pthread_sigmask(0, ( const sigset_t * ) 0p, &oldset); // workaround trac#208: cast should be unnecessary436 ret = __cfaabi_pthread_sigmask(0, ( const sigset_t * ) 0p, &oldset); // workaround trac#208: cast should be unnecessary 437 437 if(ret != 0) { abort("ERROR sigprocmask returned %d", ret); } 438 438 … … 505 505 sigval val; 506 506 val.sival_int = 0; 507 pthread_sigqueue( alarm_thread, SIGALRM, val );507 __cfaabi_pthread_sigqueue( alarm_thread, SIGALRM, val ); 508 508 509 509 // Wait for the preemption thread to finish … … 579 579 static_assert( sizeof( sigset_t ) == sizeof( cxt->uc_sigmask ), "Expected cxt->uc_sigmask to be of sigset_t" ); 580 580 #endif 581 if ( pthread_sigmask( SIG_SETMASK, (sigset_t *)&(cxt->uc_sigmask), 0p ) == -1 ) {581 if ( __cfaabi_pthread_sigmask( SIG_SETMASK, (sigset_t *)&(cxt->uc_sigmask), 0p ) == -1 ) { 582 582 abort( "internal error, sigprocmask" ); 583 583 } … … 607 607 sigset_t mask; 608 608 sigfillset(&mask); 609 if ( pthread_sigmask( SIG_BLOCK, &mask, 0p ) == -1 ) {609 if ( __cfaabi_pthread_sigmask( SIG_BLOCK, &mask, 0p ) == -1 ) { 610 610 abort( "internal error, pthread_sigmask" ); 611 611 } -
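Preemption is still delivered by queueing SIGUSR1 at a specific kernel thread with a sigval payload (now through the __cfaabi_pthread_sigqueue wrapper), while pthread_sigmask blocks and unblocks the signal around critical sections. A small C sketch of that signalling pattern, illustrative only (compile with -pthread; the sleep avoids the startup race a real runtime handles properly):

#define _GNU_SOURCE
#include <stdio.h>
#include <signal.h>
#include <pthread.h>
#include <unistd.h>

#define PREEMPT_NORMAL 0

static void handler( int sig, siginfo_t * info, void * ctx ) {
    (void)sig; (void)ctx; (void)info;              /* info->si_value carries PREEMPT_NORMAL */
    const char msg[] = "worker: preempted\n";      /* async-signal-safe output */
    write( STDOUT_FILENO, msg, sizeof(msg) - 1 );
}

static void * worker( void * arg ) {
    (void)arg;
    pause();                                       /* stand-in for running user code */
    return NULL;
}

int main(void) {
    struct sigaction sa = { 0 };
    sa.sa_sigaction = handler;
    sa.sa_flags = SA_SIGINFO;
    sigemptyset( &sa.sa_mask );
    sigaction( SIGUSR1, &sa, NULL );

    pthread_t kt;
    pthread_create( &kt, NULL, worker, NULL );
    sleep( 1 );                                    /* let the worker block in pause() */

    union sigval value = { .sival_int = PREEMPT_NORMAL };
    pthread_sigqueue( kt, SIGUSR1, value );        /* the runtime's preempt() step */

    pthread_join( kt, NULL );
    return 0;
}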
libcfa/src/concurrency/ready_subqueue.hfa
rb77f0e1 r63be3387 25 25 static inline thread$ * mock_head(const __intrusive_lane_t & this) { 26 26 thread$ * rhead = (thread$ *)( 27 (uintptr_t)( &this.l.anchor ) - __builtin_offsetof( thread$, link )27 (uintptr_t)( &this.l.anchor ) - __builtin_offsetof( thread$, rdy_link ) 28 28 ); 29 29 return rhead; … … 34 34 static inline void push( __intrusive_lane_t & this, thread$ * node ) { 35 35 /* paranoid */ verify( this.l.lock ); 36 /* paranoid */ verify( node-> link.next == 0p );37 /* paranoid */ verify( __atomic_load_n(&node-> link.ts, __ATOMIC_RELAXED) == MAX );38 /* paranoid */ verify( this.l.prev-> link.next == 0p );39 /* paranoid */ verify( __atomic_load_n(&this.l.prev-> link.ts, __ATOMIC_RELAXED) == MAX );36 /* paranoid */ verify( node->rdy_link.next == 0p ); 37 /* paranoid */ verify( __atomic_load_n(&node->rdy_link.ts, __ATOMIC_RELAXED) == MAX ); 38 /* paranoid */ verify( this.l.prev->rdy_link.next == 0p ); 39 /* paranoid */ verify( __atomic_load_n(&this.l.prev->rdy_link.ts, __ATOMIC_RELAXED) == MAX ); 40 40 if( this.l.anchor.next == 0p ) { 41 41 /* paranoid */ verify( this.l.anchor.next == 0p ); … … 51 51 52 52 // Get the relevant nodes locally 53 this.l.prev-> link.next = node;54 __atomic_store_n(&this.l.prev-> link.ts, rdtscl(), __ATOMIC_RELAXED);53 this.l.prev->rdy_link.next = node; 54 __atomic_store_n(&this.l.prev->rdy_link.ts, rdtscl(), __ATOMIC_RELAXED); 55 55 this.l.prev = node; 56 56 #if !defined(__CFA_NO_STATISTICS__) … … 70 70 // Get the relevant nodes locally 71 71 thread$ * node = this.l.anchor.next; 72 this.l.anchor.next = node-> link.next;73 __atomic_store_n(&this.l.anchor.ts, __atomic_load_n(&node-> link.ts, __ATOMIC_RELAXED), __ATOMIC_RELAXED);72 this.l.anchor.next = node->rdy_link.next; 73 __atomic_store_n(&this.l.anchor.ts, __atomic_load_n(&node->rdy_link.ts, __ATOMIC_RELAXED), __ATOMIC_RELAXED); 74 74 bool is_empty = this.l.anchor.next == 0p; 75 node-> link.next = 0p;76 __atomic_store_n(&node-> link.ts, ULLONG_MAX, __ATOMIC_RELAXED);75 node->rdy_link.next = 0p; 76 __atomic_store_n(&node->rdy_link.ts, ULLONG_MAX, __ATOMIC_RELAXED); 77 77 #if !defined(__CFA_NO_STATISTICS__) 78 78 this.l.cnt--; … … 83 83 84 84 unsigned long long ats = __atomic_load_n(&this.l.anchor.ts, __ATOMIC_RELAXED); 85 /* paranoid */ verify( node-> link.next == 0p );86 /* paranoid */ verify( __atomic_load_n(&node-> link.ts , __ATOMIC_RELAXED) == MAX );87 /* paranoid */ verify( __atomic_load_n(&node-> link.ts , __ATOMIC_RELAXED) != 0 );85 /* paranoid */ verify( node->rdy_link.next == 0p ); 86 /* paranoid */ verify( __atomic_load_n(&node->rdy_link.ts , __ATOMIC_RELAXED) == MAX ); 87 /* paranoid */ verify( __atomic_load_n(&node->rdy_link.ts , __ATOMIC_RELAXED) != 0 ); 88 88 /* paranoid */ verify( ats != 0 ); 89 89 /* paranoid */ verify( (ats == MAX) == is_empty ); -
libcfa/src/concurrency/thread.cfa
rb77f0e1 r63be3387 44 44 self_mon_p = &self_mon; 45 45 curr_cluster = &cl; 46 link.next = 0p;47 link.ts = MAX;46 rdy_link.next = 0p; 47 rdy_link.ts = MAX; 48 48 preferred = ready_queue_new_preferred(); 49 49 last_proc = 0p; 50 50 random_state = __global_random_mask ? __global_random_prime : __global_random_prime ^ rdtscl(); 51 51 #if defined( __CFA_WITH_VERIFY__ ) 52 executing = 0p; 52 53 canary = 0x0D15EA5E0D15EA5Ep; 53 54 #endif 54 55 node.next = 0p;56 node.prev = 0p;57 55 58 56 clh_node = malloc( ); … … 177 175 178 176 //----------------------------------------------------------------------------- 177 bool migrate( thread$ * thrd, struct cluster & cl ) { 178 179 monitor$ * tmon = get_monitor(thrd); 180 monitor$ * __monitors[] = { tmon }; 181 monitor_guard_t __guard = { __monitors, 1 }; 182 183 184 { 185 // if nothing needs to be done, return false 186 if( thrd->curr_cluster == &cl ) return false; 187 188 // are we migrating ourself? 189 const bool local = thrd == active_thread(); 190 191 /* paranoid */ verify( !local || &cl != active_cluster() ); 192 /* paranoid */ verify( !local || thrd->curr_cluster == active_cluster() ); 193 /* paranoid */ verify( !local || thrd->curr_cluster == active_processor()->cltr ); 194 /* paranoid */ verify( local || tmon->signal_stack.top->owner->waiting_thread == thrd ); 195 /* paranoid */ verify( local || tmon->signal_stack.top ); 196 197 // make sure we aren't interrupted while doing this 198 // not as important if we aren't local 199 disable_interrupts(); 200 201 // actually move the thread 202 unregister( thrd->curr_cluster, *thrd ); 203 thrd->curr_cluster = &cl; 204 doregister( thrd->curr_cluster, *thrd ); 205 206 // restore interrupts 207 enable_interrupts(); 208 209 // if this is the local thread, we are still running on the old cluster 210 if(local) yield(); 211 212 /* paranoid */ verify( !local || &cl == active_cluster() ); 213 /* paranoid */ verify( !local || thrd->curr_cluster == active_cluster() ); 214 /* paranoid */ verify( !local || thrd->curr_cluster == active_processor()->cltr ); 215 /* paranoid */ verify( local || tmon->signal_stack.top ); 216 /* paranoid */ verify( local || tmon->signal_stack.top->owner->waiting_thread == thrd ); 217 218 return true; 219 } 220 } 221 222 //----------------------------------------------------------------------------- 179 223 #define GENERATOR LCG 180 224 -
libcfa/src/concurrency/thread.hfa
rb77f0e1 r63be3387 132 132 133 133 //---------- 134 // misc 135 bool migrate( thread$ * thrd, struct cluster & cl ); 136 137 forall( T & | is_thread(T) ) 138 static inline bool migrate( T & mutex thrd, struct cluster & cl ) { return migrate( &(thread&)thrd, cl ); } 139 140 141 //---------- 134 142 // prng 135 143 static inline { -
libcfa/src/containers/array.hfa
rb77f0e1 r63be3387 1 #pragma once 2 1 3 #include <assert.h> 2 4 … … 18 20 // About the choice of integral types offered as subscript overloads: 19 21 // Intent is to cover these use cases: 22 // a[0] // i : zero_t 23 // a[1] // i : one_t 24 // a[2] // i : int 20 25 // float foo( ptrdiff_t i ) { return a[i]; } // i : ptrdiff_t 26 // float foo( size_t i ) { return a[i]; } // i : size_t 21 27 // forall( [N] ) ... for( i; N ) { total += a[i]; } // i : typeof( sizeof(42) ) 22 28 // for( i; 5 ) { total += a[i]; } // i : int 29 // 23 30 // It gets complicated by: 24 31 // - CFA does overloading on concrete types, like int and unsigned int, not on typedefed … … 28 35 // should give them type size_t. 29 36 // 30 // gcc -m32 cfa -m32 given bug gcc -m64 37 // gcc -m32 cfa -m32 given bug gcc -m64 (and cfa) 31 38 // ptrdiff_t int int long int 32 39 // size_t unsigned int unsigned int unsigned long int 33 40 // typeof( sizeof(42) ) unsigned int unsigned long int unsigned long int 34 41 // int int int int 42 // 43 // So the solution must support types {zero_t, one_t, int, unsigned int, long int, unsigned long int} 44 // 45 // The solution cannot rely on implicit conversions (e.g. just have one overload for ptrdiff_t) 46 // because assertion satisfaction requires types to match exacly. Both higher-dimensional 47 // subscripting and operations on slices use asserted subscript operators. The test case 48 // array-container/array-sbscr-cases covers the combinations. Mike beleives that commenting out 49 // any of the current overloads leads to one of those cases failing, either on 64- or 32-bit. 50 // Mike is open to being shown a smaller set of overloads that still passes the test. 51 52 static inline Timmed & ?[?]( arpk(N, S, Timmed, Tbase) & a, zero_t ) { 53 assert( 0 < N ); 54 return (Timmed &) a.strides[0]; 55 } 56 57 static inline Timmed & ?[?]( arpk(N, S, Timmed, Tbase) & a, one_t ) { 58 assert( 1 < N ); 59 return (Timmed &) a.strides[1]; 60 } 35 61 36 62 static inline Timmed & ?[?]( arpk(N, S, Timmed, Tbase) & a, int i ) { … … 77 103 return N; 78 104 } 105 106 static inline void __taglen( tag(arpk(N, S, Timmed, Tbase)), tag(N) ) {} 79 107 80 108 // workaround #226 (and array relevance thereof demonstrated in mike102/otype-slow-ndims.cfa) … … 156 184 #endif 157 185 186 // Available for users to work around Trac #265 187 // If `a[...0...]` isn't working, try `a[...ix0...]` instead. 188 189 #define ix0 ((ptrdiff_t)0) 190 191 192 158 193 // 159 194 // Rotation … … 185 220 // 186 221 187 trait ar(A &, Tv &) { 188 Tv& ?[?]( A&, ptrdiff_t ); 189 size_t ?`len( A& ); 190 }; 222 // desired: 223 // trait ar(A &, Tv &, [N]) { 224 // Tv& ?[?]( A&, zero_t ); 225 // Tv& ?[?]( A&, one_t ); 226 // Tv& ?[?]( A&, int ); 227 // ... 228 // size_t ?`len( A& ); 229 // void __taglen( tag(C), tag(N) ); 230 // }; 231 232 // working around N's not being accepted as arguments to traits 233 234 #define ar(A, Tv, N) { \ 235 Tv& ?[?]( A&, zero_t ); \ 236 Tv& ?[?]( A&, one_t ); \ 237 Tv& ?[?]( A&, int ); \ 238 Tv& ?[?]( A&, unsigned int ); \ 239 Tv& ?[?]( A&, long int ); \ 240 Tv& ?[?]( A&, unsigned long int ); \ 241 size_t ?`len( A& ); \ 242 void __taglen( tag(A), tag(N) ); \ 243 } -
libcfa/src/heap.cfa
rb77f0e1 r63be3387 10 10 // Created On : Tue Dec 19 21:58:35 2017 11 11 // Last Modified By : Peter A. Buhr 12 // Last Modified On : Thu Oct 13 22:21:52202213 // Update Count : 15 5712 // Last Modified On : Sun Oct 30 20:56:20 2022 13 // Update Count : 1584 14 14 // 15 15 … … 43 43 44 44 #define FASTLOOKUP // use O(1) table lookup from allocation size to bucket size 45 #define RETURNSPIN // toggle spinlock / lockfree stack46 45 #define OWNERSHIP // return freed memory to owner thread 46 #define RETURNSPIN // toggle spinlock / lockfree queue 47 #if ! defined( OWNERSHIP ) && defined( RETURNSPIN ) 48 #warning "RETURNSPIN is ignored without OWNERSHIP; suggest commenting out RETURNSPIN" 49 #endif // ! OWNERSHIP && RETURNSPIN 47 50 48 51 #define CACHE_ALIGN 64 … … 109 112 110 113 111 //######################### Spin Lock ######################### 114 //######################### Helpers ######################### 115 116 117 // generic Bsearchl does not inline, so substitute with hand-coded binary-search. 118 inline __attribute__((always_inline)) 119 static size_t Bsearchl( unsigned int key, const unsigned int vals[], size_t dim ) { 120 size_t l = 0, m, h = dim; 121 while ( l < h ) { 122 m = (l + h) / 2; 123 if ( (unsigned int &)(vals[m]) < key ) { // cast away const 124 l = m + 1; 125 } else { 126 h = m; 127 } // if 128 } // while 129 return l; 130 } // Bsearchl 112 131 113 132 … … 206 225 207 226 208 #define SPINLOCK 0209 #define LOCKFREE 1210 #define BUCKETLOCK SPINLOCK211 #if BUCKETLOCK == SPINLOCK212 #elif BUCKETLOCK == LOCKFREE213 #include <stackLockFree.hfa>214 #else215 #error undefined lock type for bucket lock216 #endif // LOCKFREE217 218 227 // Recursive definitions: HeapManager needs size of bucket array and bucket area needs sizeof HeapManager storage. 219 228 // Break recursion by hardcoding number of buckets and statically checking number is correct after bucket array defined. 
… … 232 241 void * home; // allocated block points back to home locations (must overlay alignment) 233 242 size_t blockSize; // size for munmap (must overlay alignment) 234 #if BUCKETLOCK == SPINLOCK235 243 Storage * next; // freed block points to next freed block of same size 236 #endif // SPINLOCK237 244 }; 238 245 size_t size; // allocation size in bytes 239 246 }; 240 #if BUCKETLOCK == LOCKFREE241 Link(Storage) next; // freed block points next freed block of same size (double-wide)242 #endif // LOCKFREE243 247 }; 244 248 } real; // RealHeader … … 259 263 struct __attribute__(( aligned (8) )) FreeHeader { 260 264 size_t blockSize __attribute__(( aligned(8) )); // size of allocations on this list 261 #if BUCKETLOCK == SPINLOCK262 265 #ifdef OWNERSHIP 263 266 #ifdef RETURNSPIN … … 266 269 Storage * returnList; // other thread return list 267 270 #endif // OWNERSHIP 271 268 272 Storage * freeList; // thread free list 269 #else270 StackLF(Storage) freeList;271 #endif // BUCKETLOCK272 273 Heap * homeManager; // heap owner (free storage to bucket, from bucket to heap) 273 274 }; // FreeHeader … … 290 291 #endif // __STATISTICS__ 291 292 }; // Heap 292 293 #if BUCKETLOCK == LOCKFREE294 inline __attribute__((always_inline))295 static {296 Link(Heap.Storage) * ?`next( Heap.Storage * this ) { return &this->header.kind.real.next; }297 void ?{}( Heap.FreeHeader & ) {}298 void ^?{}( Heap.FreeHeader & ) {}299 } // distribution300 #endif // LOCKFREE301 293 302 294 … … 385 377 386 378 387 // generic Bsearchl does not inline, so substitute with hand-coded binary-search.388 inline __attribute__((always_inline))389 static size_t Bsearchl( unsigned int key, const unsigned int vals[], size_t dim ) {390 size_t l = 0, m, h = dim;391 while ( l < h ) {392 m = (l + h) / 2;393 if ( (unsigned int &)(vals[m]) < key ) { // cast away const394 l = m + 1;395 } else {396 h = m;397 } // if398 } // while399 return l;400 } // Bsearchl401 402 403 379 void heapMasterCtor() with( heapMaster ) { 404 380 // Singleton pattern to initialize heap master … … 409 385 __map_prot = PROT_READ | PROT_WRITE | PROT_EXEC; 410 386 411 ?{}( extLock );412 ?{}( mgrLock );387 extLock = 0; 388 mgrLock = 0; 413 389 414 390 char * end = (char *)sbrk( 0 ); … … 497 473 #ifdef OWNERSHIP 498 474 #ifdef RETURNSPIN 499 ?{}( freeLists[j].returnLock ); 475 freeLists[j].returnLock = 0; 476 freeLists[j].returnList = 0p; 500 477 #endif // RETURNSPIN 501 freeLists[j].returnList = 0p;502 478 #endif // OWNERSHIP 479 503 480 freeLists[j].freeList = 0p; 504 481 freeLists[j].homeManager = heap; 505 482 freeLists[j].blockSize = bucketSizes[j]; 506 483 } // for 507 484 508 485 heapBuffer = 0p; 509 486 heapReserve = 0; … … 522 499 if ( unlikely( ! heapMasterBootFlag ) ) heapMasterCtor(); 523 500 524 lock( heapMaster.mgrLock ); // protect heapMaster counters501 lock( heapMaster.mgrLock ); // protect heapMaster counters 525 502 526 503 // get storage for heap manager … … 710 687 // find the closest bucket size less than or equal to the mmapStart size 711 688 maxBucketsUsed = Bsearchl( mmapStart, bucketSizes, NoBucketSizes ); // binary search 689 712 690 verify( maxBucketsUsed < NoBucketSizes ); // subscript failure ? 713 691 verify( mmapStart <= bucketSizes[maxBucketsUsed] ); // search failure ? … … 832 810 833 811 size_t increase = ceiling2( size > heapExpand ? size : heapExpand, libAlign() ); 834 // Do not call abort or strerror( errno ) as they may call malloc.835 812 if ( unlikely( sbrk( increase ) == (void *)-1 ) ) { // failed, no memory ? 
836 813 unlock( extLock ); 837 abort( NO_MEMORY_MSG, size ); // no memory814 abort( NO_MEMORY_MSG, size ); // give up 838 815 } // if 839 816 … … 971 948 #endif // __STATISTICS__ 972 949 973 // Spin until the lock is acquired for this particular size of block.974 975 #if BUCKETLOCK == SPINLOCK976 950 block = freeHead->freeList; // remove node from stack 977 #else978 block = pop( freeHead->freeList );979 #endif // BUCKETLOCK980 951 if ( unlikely( block == 0p ) ) { // no free block ? 952 // Freelist for this size is empty, so check return list (OWNERSHIP), carve it out of the heap, if there 953 // is enough left, or get some more heap storage and carve it off. 981 954 #ifdef OWNERSHIP 982 // Freelist for that size is empty, so carve it out of the heap, if there is enough left, or get some more 983 // and then carve it off. 984 #ifdef RETURNSPIN 985 #if BUCKETLOCK == SPINLOCK 986 lock( freeHead->returnLock ); 987 block = freeHead->returnList; 988 freeHead->returnList = 0p; 989 unlock( freeHead->returnLock ); 990 #else 991 block = __atomic_exchange_n( &freeHead->returnList, nullptr, __ATOMIC_SEQ_CST ); 992 #endif // RETURNSPIN 993 994 if ( likely( block == 0p ) ) { // return list also empty? 955 if ( unlikely( freeHead->returnList ) ) { // race, get next time if lose race 956 #ifdef RETURNSPIN 957 lock( freeHead->returnLock ); 958 block = freeHead->returnList; 959 freeHead->returnList = 0p; 960 unlock( freeHead->returnLock ); 961 #else 962 block = __atomic_exchange_n( &freeHead->returnList, 0p, __ATOMIC_SEQ_CST ); 963 #endif // RETURNSPIN 964 965 verify( block ); 966 #ifdef __STATISTICS__ 967 stats.return_pulls += 1; 968 #endif // __STATISTICS__ 969 970 // OK TO BE PREEMPTED HERE AS heapManager IS NO LONGER ACCESSED. 971 972 freeHead->freeList = block->header.kind.real.next; // merge returnList into freeHead 973 } else { 995 974 #endif // OWNERSHIP 996 975 // Do not leave kernel thread as manager_extend accesses heapManager. … … 1002 981 1003 982 #ifdef __CFA_DEBUG__ 1004 // Scrub new memory so subsequent uninitialized usages might fail. Only scrub the first 1024bytes.983 // Scrub new memory so subsequent uninitialized usages might fail. Only scrub the first SCRUB_SIZE bytes. 1005 984 memset( block->data, SCRUB, min( SCRUB_SIZE, tsize - sizeof(Heap.Storage) ) ); 1006 985 #endif // __CFA_DEBUG__ 1007 #endif // BUCKETLOCK1008 986 #ifdef OWNERSHIP 1009 } else { // merge returnList into freeHead1010 #ifdef __STATISTICS__1011 stats.return_pulls += 1;1012 #endif // __STATISTICS__1013 1014 // OK TO BE PREEMPTED HERE AS heapManager IS NO LONGER ACCESSED.1015 1016 freeHead->freeList = block->header.kind.real.next;1017 987 } // if 1018 988 #endif // OWNERSHIP … … 1026 996 if ( unlikely( size > ULONG_MAX - __page_size ) ) return 0p; 1027 997 tsize = ceiling2( tsize, __page_size ); // must be multiple of page size 998 1028 999 #ifdef __STATISTICS__ 1029 1000 stats.counters[STAT_NAME].alloc += tsize; … … 1042 1013 if ( errno == ENOMEM ) abort( NO_MEMORY_MSG, tsize ); // no memory 1043 1014 // Do not call strerror( errno ) as it may call malloc. 
1044 abort( "**** Error **** attempt to allocate large object (> %zu) of size %zu bytes and mmap failed with errno %d.", size, heapMaster.mmapStart, errno ); 1015 abort( "**** Error **** attempt to allocate large object (> %zu) of size %zu bytes and mmap failed with errno %d.", 1016 size, heapMaster.mmapStart, errno ); 1045 1017 } // if 1046 1018 block->header.kind.real.blockSize = MarkMmappedBit( tsize ); // storage size for munmap 1047 1019 1048 1020 #ifdef __CFA_DEBUG__ 1049 // Scrub new memory so subsequent uninitialized usages might fail. Only scrub the first 1024 bytes. The rest of1050 // the storage set to 0 by mmap.1021 // Scrub new memory so subsequent uninitialized usages might fail. Only scrub the first SCRUB_SIZE bytes. The 1022 // rest of the storage set to 0 by mmap. 1051 1023 memset( block->data, SCRUB, min( SCRUB_SIZE, tsize - sizeof(Heap.Storage) ) ); 1052 1024 #endif // __CFA_DEBUG__ … … 1126 1098 #endif // __CFA_DEBUG__ 1127 1099 1100 #ifdef OWNERSHIP 1128 1101 if ( likely( heapManager == freeHead->homeManager ) ) { // belongs to this thread 1129 1102 header->kind.real.next = freeHead->freeList; // push on stack … … 1132 1105 verify( heapManager ); 1133 1106 1134 #ifdef OWNERSHIP1135 1107 #ifdef RETURNSPIN 1136 1108 lock( freeHead->returnLock ); … … 1141 1113 header->kind.real.next = freeHead->returnList; // link new node to top node 1142 1114 // CAS resets header->kind.real.next = freeHead->returnList on failure 1143 while ( ! __atomic_compare_exchange_n( &freeHead->returnList, &header->kind.real.next, header,1115 while ( ! __atomic_compare_exchange_n( &freeHead->returnList, &header->kind.real.next, (Heap.Storage *)header, 1144 1116 false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST ) ); 1145 1117 #endif // RETURNSPIN 1146 1147 #else // no OWNERSHIP 1148 1149 freeHead = &heap->freeLists[ClearStickyBits( header->kind.real.home ) - &freeHead->homeManager->freeLists[0]]; 1150 header->kind.real.next = freeHead->freeList; // push on stack 1151 freeHead->freeList = (Heap.Storage *)header; 1152 #endif // ! OWNERSHIP 1153 1154 #ifdef __U_STATISTICS__ 1155 stats.return_pushes += 1; 1156 stats.return_storage_request += rsize; 1157 stats.return_storage_alloc += size; 1158 #endif // __U_STATISTICS__ 1159 1160 // OK TO BE PREEMPTED HERE AS heapManager IS NO LONGER ACCESSED. 1161 } // if 1118 } // if 1119 1120 #else // no OWNERSHIP 1121 1122 // kind.real.home is address in owner thread's freeLists, so compute the equivalent position in this thread's freeList. 1123 freeHead = &freeLists[ClearStickyBits( (Heap.FreeHeader *)(header->kind.real.home) ) - &freeHead->homeManager->freeLists[0]]; 1124 header->kind.real.next = freeHead->freeList; // push on stack 1125 freeHead->freeList = (Heap.Storage *)header; 1126 #endif // ! OWNERSHIP 1127 1128 #ifdef __U_STATISTICS__ 1129 stats.return_pushes += 1; 1130 stats.return_storage_request += rsize; 1131 stats.return_storage_alloc += size; 1132 #endif // __U_STATISTICS__ 1133 1134 // OK TO BE PREEMPTED HERE AS heapManager IS NO LONGER ACCESSED. 
1162 1135 } // if 1163 1136 … … 1186 1159 #endif // __STATISTICS__ 1187 1160 1188 #if BUCKETLOCK == SPINLOCK1189 1161 for ( Heap.Storage * p = freeLists[i].freeList; p != 0p; p = p->header.kind.real.next ) { 1190 #else1191 for(;;) {1192 // for ( Heap.Storage * p = top( freeLists[i].freeList ); p != 0p; p = (p)`next->top ) {1193 // for ( Heap.Storage * p = top( freeLists[i].freeList ); p != 0p; /* p = getNext( p )->top */) {1194 // Heap.Storage * temp = p->header.kind.real.next.top; // FIX ME: direct assignent fails, initialization works`1195 // typeof(p) temp = (( p )`next)->top; // FIX ME: direct assignent fails, initialization works`1196 // p = temp;1197 #endif // BUCKETLOCK1198 1162 total += size; 1199 1163 #ifdef __STATISTICS__ -
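Under OWNERSHIP, a free from a non-owning thread goes onto the bucket's return list, either under a spinlock (RETURNSPIN) or with the CAS loop shown above; the owning thread later drains the whole list at once. A compressed C sketch of the lock-free variant using the same GCC atomic builtins (simplified, hypothetical types; the real heap stores the link inside the block header):

#include <stdio.h>
#include <stdbool.h>

struct block { struct block * next; };

static struct block * return_list;    /* one per free-list bucket in the real heap */

static void remote_free( struct block * b ) {
    b->next = return_list;            /* seed the expected head */
    /* on failure the CAS writes the observed head back into b->next and retries */
    while ( ! __atomic_compare_exchange_n( &return_list, &b->next, b,
                                           false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST ) );
}

static struct block * owner_pull( void ) {
    /* take the entire list in one shot; remote frees after this start a new list */
    return __atomic_exchange_n( &return_list, NULL, __ATOMIC_SEQ_CST );
}

int main(void) {
    struct block a, b;
    remote_free( &a );
    remote_free( &b );
    for ( struct block * p = owner_pull(); p; p = p->next )
        printf( "owner pulled block %p\n", (void *)p );
    return 0;
}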
libcfa/src/interpose.cfa
rb77f0e1 r63be3387 42 42 43 43 typedef void (* generic_fptr_t)(void); 44 static generic_fptr_t do_interpose_symbol( void * library, const char symbol[], const char version[] ) { 45 const char * error; 46 47 union { generic_fptr_t fptr; void * ptr; } originalFunc; 48 49 #if defined( _GNU_SOURCE ) 50 if ( version ) { 51 originalFunc.ptr = dlvsym( library, symbol, version ); 52 } else { 53 originalFunc.ptr = dlsym( library, symbol ); 54 } 55 #else 56 originalFunc.ptr = dlsym( library, symbol ); 57 #endif // _GNU_SOURCE 58 59 error = dlerror(); 60 if ( error ) abort( "interpose_symbol : internal error, %s\n", error ); 61 62 return originalFunc.fptr; 63 } 64 44 65 static generic_fptr_t interpose_symbol( const char symbol[], const char version[] ) { 45 66 const char * error; 46 67 47 68 static void * library; 69 static void * pthread_library; 48 70 if ( ! library ) { 49 71 #if defined( RTLD_NEXT ) … … 58 80 #endif 59 81 } // if 60 61 union { generic_fptr_t fptr; void * ptr; } originalFunc; 62 63 #if defined( _GNU_SOURCE ) 64 if ( version ) { 65 originalFunc.ptr = dlvsym( library, symbol, version ); 66 } else { 67 originalFunc.ptr = dlsym( library, symbol ); 68 } 69 #else 70 originalFunc.ptr = dlsym( library, symbol ); 71 #endif // _GNU_SOURCE 72 73 error = dlerror(); 74 if ( error ) abort( "interpose_symbol : internal error, %s\n", error ); 75 76 return originalFunc.fptr; 82 if ( ! pthread_library ) { 83 #if defined( RTLD_NEXT ) 84 pthread_library = RTLD_NEXT; 85 #else 86 // missing RTLD_NEXT => must hard-code library name, assuming libstdc++ 87 pthread_library = dlopen( "libpthread.so", RTLD_LAZY ); 88 error = dlerror(); 89 if ( error ) { 90 abort( "interpose_symbol : failed to open libpthread, %s\n", error ); 91 } 92 #endif 93 } // if 94 95 return do_interpose_symbol(library, symbol, version); 77 96 } 78 97 … … 97 116 98 117 extern "C" { 118 void __cfathreadabi_interpose_startup( generic_fptr_t (*do_interpose_symbol)( void * library, const char symbol[], const char version[] ) ) __attribute__((weak)); 99 119 void __cfaabi_interpose_startup( void ) { 100 120 const char *version = 0p; … … 108 128 INTERPOSE_LIBC( exit , version ); 109 129 #pragma GCC diagnostic pop 130 131 if(__cfathreadabi_interpose_startup) __cfathreadabi_interpose_startup( do_interpose_symbol ); 110 132 111 133 // As a precaution (and necessity), errors that result in termination are delivered on a separate stack because
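The refactored interposition keeps the same underlying idiom: resolve the "real" symbol with dlsym/dlvsym (against RTLD_NEXT or an explicitly dlopen'ed library) through a union to sidestep the object-to-function pointer cast, then call through the saved pointer; the weak __cfathreadabi_interpose_startup hook lets the thread library reuse the same lookup helper. A minimal stand-alone C illustration of the lookup step (link with -ldl on older glibc):

#define _GNU_SOURCE
#include <stdio.h>
#include <dlfcn.h>

typedef size_t (*strlen_fptr)( const char * );

int main(void) {
    union { strlen_fptr fptr; void * ptr; } real;   /* avoids casting void* to a function pointer */

    dlerror();                                      /* clear any stale error state */
    real.ptr = dlsym( RTLD_NEXT, "strlen" );
    const char * error = dlerror();
    if ( error ) { fprintf( stderr, "dlsym failed: %s\n", error ); return 1; }

    printf( "real strlen(\"hello\") = %zu\n", real.fptr( "hello" ) );
    return 0;
}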