Timestamp:
Jan 7, 2021, 3:27:00 PM
Author:
Thierry Delisle <tdelisle@…>
Branches:
ADT, arm-eh, ast-experimental, enum, forall-pointer-decay, jacob/cs343-translation, master, new-ast-unique-expr, pthread-emulation, qualifiedEnum
Children:
2b4daf2, 64aeca0
Parents:
3c64c668 (diff), eef8dfb (diff)
Note: this is a merge changeset, the changes displayed below correspond to the merge itself.
Use the (diff) links above to see all the changes relative to each parent.
Message:

Merge branch 'master' into park_unpark

Location:
libcfa/src/concurrency
Files:
20 added
18 edited
1 moved

  • libcfa/src/concurrency/CtxSwitch-arm32.S

    r3c64c668 r58fe85a  
    1         @ 32 bit ARM context switch
    2         @ This function assumes that r9 has no special meaning on the platform it's
    3         @ being built on.
    4         @ If r9 is special, uncomment the following line and it will be left alone
     1        # 32 bit ARM context switch
     2        # This function assumes that r9 has no special meaning on the platform it's
     3        # being built on.
     4        # If r9 is special, uncomment the following line and it will be left alone
    55
    6         @ #define R9_SPECIAL
     6        # #define R9_SPECIAL
    77
    88        #define PTR_BYTE        4
     
    1717
    1818__cfactx_switch:
    19         @ save callee-saved registers: r4-r8, r10, r11, r13(sp) (plus r9 depending on platform specification)
    20         @ I've seen reference to 31 registers on 64-bit, if this is the case, more need to be saved
    21         @ save thread state registers: r14(lr)
    22         @ r12(ip) is intra-procedure-call scratch register, does not need saving between function calls
     19        # save callee-saved registers: r4-r8, r10, r11, r13(sp) (plus r9 depending on platform specification)
      20        # I've seen references to 31 registers on 64-bit; if so, more need to be saved
     21        # save thread state registers: r14(lr)
     22        # r12(ip) is intra-procedure-call scratch register, does not need saving between function calls
    2323
    2424        #ifdef R9_SPECIAL
     
    2828        #endif // R9_SPECIAL
    2929
    30         @ save floating point registers: s16-s31
     30        # save floating point registers: s16-s31
    3131        vstmdb r13!, {s16-s31}
    3232
    33         @ save frame pointer and stack pointer to outgoing datastructure
     33        # save frame pointer and stack pointer to outgoing datastructure
    3434        str sp, [r0, #SP_OFFSET]
    3535        str fp, [r0, #FP_OFFSET]
    3636
    37         @ restore frame pointer and stack pointer from incoming datastructure
     37        # restore frame pointer and stack pointer from incoming datastructure
    3838        ldr fp, [r1, #FP_OFFSET]
    3939        ldr sp, [r1, #SP_OFFSET]
    4040
    41         @ restore floating point registers: s16-s31
     41        # restore floating point registers: s16-s31
    4242        vldm r13!, {s16-s31}
    43         @ restore r14(lr)
    44         @ restore 64-bit extra registers?
    45         @ restore callee-saved registers: r4-r8, r10, r11, r13
     43        # restore r14(lr)
     44        # restore 64-bit extra registers?
     45        # restore callee-saved registers: r4-r8, r10, r11, r13
    4646
    4747        #ifdef R9_SPECIAL
    4848        ldmfd r13!, {r4-r8,r10,r11,r15}
    4949        #else
    50         ldmfd r13!, {r4-r11,r14}    @ loading r14 back into r15 returns
     50        ldmfd r13!, {r4-r11,r14}    # loading r14 back into r15 returns
    5151
    5252        mov r15, r14
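
    Note: the SP_OFFSET/FP_OFFSET constants used by the str/ldr pairs above must mirror the
    field order of __stack_context_t in invoke.h. A minimal C sketch, assuming the structure
    holds SP then FP as the stores above imply, that fails the build if the assembly offsets
    drift from the C layout:

        #include <stddef.h>

        struct __stack_context_t {
                void * SP;
                void * FP;
        };

        #define PTR_BYTE   4                    /* 4 on arm32/i386, 8 on x86_64 */
        #define SP_OFFSET  ( 0 * PTR_BYTE )
        #define FP_OFFSET  ( 1 * PTR_BYTE )

        /* compile-time check that the assembly offsets match the C layout */
        _Static_assert( offsetof(struct __stack_context_t, SP) == SP_OFFSET, "SP offset mismatch" );
        _Static_assert( offsetof(struct __stack_context_t, FP) == FP_OFFSET, "FP offset mismatch" );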
  • libcfa/src/concurrency/CtxSwitch-i386.S

    r3c64c668 r58fe85a  
    1010// Created On       : Tue Dec 6 12:27:26 2016
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Fri Jul 21 22:29:25 2017
    13 // Update Count     : 1
    14 //
    15 // This  library is free  software; you  can redistribute  it and/or  modify it
    16 // under the terms of the GNU Lesser General Public License as published by the
    17 // Free Software  Foundation; either  version 2.1 of  the License, or  (at your
    18 // option) any later version.
    19 //
    20 // This library is distributed in the  hope that it will be useful, but WITHOUT
    21 // ANY  WARRANTY;  without even  the  implied  warranty  of MERCHANTABILITY  or
    22 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
    23 // for more details.
    24 //
    25 // You should  have received a  copy of the  GNU Lesser General  Public License
    26 // along  with this library.
     12// Last Modified On : Sun Sep  6 18:23:37 2020
     13// Update Count     : 5
    2714//
    2815
    29 // This context switch routine depends on the fact that the stack of a new
    30 // thread has been set up to look like the thread has saved its context in
    31 // the normal manner.
    32 //
    33 // void CtxSwitch( machine_context *from, machine_context *to );
      16// The context switch routine requires the initial stack of a thread to
      17// look like the thread has saved its context in the normal manner.
    3418
    35 // Offsets in the context structure. This needs to be synchronized with the
    36 // high level code a little better.
      19// Offsets must be synchronized with the __stack_context_t in invoke.h.
    3720
    3821#define PTR_BYTE        4
    3922#define SP_OFFSET       ( 0 * PTR_BYTE )
    4023#define FP_OFFSET       ( 1 * PTR_BYTE )
    41 #define PC_OFFSET       ( 2 * PTR_BYTE )
    4224
     25// Context switch between coroutines/tasks.
     26//   void __cfactx_switch( struct __stack_context_t * from, struct __stack_context_t * to ) ;
     27// Arguments "from" in register 4(%esp), "to" in register 20(%esp)
     28
     29        .file "CtxSwitch-i386.S"
    4330        .text
    4431        .align 2
    45         .globl __cfactx_switch
    46         .type  __cfactx_switch, @function
     32        .global __cfactx_switch
     33        .type __cfactx_switch, @function
    4734__cfactx_switch:
    4835
    4936        // Copy the "from" context argument from the stack to register eax
    50         // Return address is at 0(%esp), with parameters following
     37        // Return address is at 0(%esp), with parameters following.
    5138
    5239        movl 4(%esp),%eax
     
    6350        movl %ebp,FP_OFFSET(%eax)
    6451
    65         // Copy the "to" context argument from the stack to register eax
    66         // Having pushed three words (= 12 bytes) on the stack, the
    67         // argument is now at 8 + 12 = 20(%esp)
     52        // Copy the "to" context argument from the stack to register eax. Having
     53        // pushed 3 words (= 12 bytes) on the stack, the argument is now at
     54        // 8 + 12 = 20(%esp).
    6855
    6956        movl 20(%esp),%eax
     
    8370
    8471        ret
    85         .size  __cfactx_switch, .-__cfactx_switch
     72        .size __cfactx_switch, .-__cfactx_switch
    8673
    8774// Local Variables: //
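
    Note: the 20(%esp) above falls out of the cdecl frame layout: on entry the return address
    is at 0(%esp), "from" at 4(%esp), and "to" at 8(%esp); pushing three callee-saved words
    moves "to" down by 12 bytes to 20(%esp). A small self-checking sketch of that arithmetic:

        #include <stdio.h>

        int main(void) {
                const int word = 4;          /* i386 stack slot size */
                int from_entry = 1 * word;   /* "from" is read before any pushes: 4(%esp) */
                int to_entry   = 2 * word;   /* "to" sits at 8(%esp) on entry */
                int pushed     = 3 * word;   /* three saved registers = 12 bytes */

                /* prints: from at 4(%esp), to at 20(%esp) */
                printf( "from at %d(%%esp), to at %d(%%esp)\n", from_entry, to_entry + pushed );
                return 0;
        }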
  • libcfa/src/concurrency/CtxSwitch-x86_64.S

    r3c64c668 r58fe85a  
    77// CtxSwitch-x86_64.S --
    88//
    9 // Author           : Thierry Delisle
    10 // Created On       : Mon Nov 28 12:27:26 2016
     9// Author           : Peter A. Buhr
     10// Created On       : Mon Aug 10 08:10:26 2020
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Fri Jul 21 22:28:11 2017
    13 // Update Count     : 1
    14 //
    15 // This  library is free  software; you  can redistribute  it and/or  modify it
    16 // under the terms of the GNU Lesser General Public License as published by the
    17 // Free Software  Foundation; either  version 2.1 of  the License, or  (at your
    18 // option) any later version.
    19 //
    20 // This library is distributed in the  hope that it will be useful, but WITHOUT
    21 // ANY  WARRANTY;  without even  the  implied  warranty  of MERCHANTABILITY  or
    22 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
    23 // for more details.
    24 //
    25 // You should  have received a  copy of the  GNU Lesser General  Public License
    26 // along  with this library.
     12// Last Modified On : Sat Oct 24 14:36:25 2020
     13// Update Count     : 10
    2714//
    2815
    29 // This context switch routine depends on the fact that the stack of a new
    30 // thread has been set up to look like the thread has saved its context in
    31 // the normal manner.
    32 //
    33 // void CtxSwitch( machine_context *from, machine_context *to );
      16// The context switch routine requires the initial stack of a thread to
      17// look like the thread has saved its context in the normal manner.
    3418
    35 // Offsets in the context structure. This needs to be synchronized with the
    36 // high level code a little better.
      19// Offsets must be synchronized with the __stack_context_t in invoke.h.
    3720
    3821#define PTR_BYTE        8
     
    4023#define FP_OFFSET       ( 1 * PTR_BYTE )
    4124
    42 //-----------------------------------------------------------------------------
    43 // Regular context switch routine which enables switching from one context to anouther
     25// Context switch between coroutines/tasks.
     26//   void __cfactx_switch( struct __stack_context_t * from, struct __stack_context_t * to ) ;
     27// Arguments "from" in register rdi, "to" in register rsi.
     28
     29        .file "CtxSwitch-x86_64.S"
    4430        .text
    4531        .align 2
    46         .globl __cfactx_switch
    47         .type  __cfactx_switch, @function
     32        .global __cfactx_switch
     33        .type __cfactx_switch, @function
    4834__cfactx_switch:
    4935
     
    7763
    7864        ret
    79         .size  __cfactx_switch, .-__cfactx_switch
     65        .size __cfactx_switch, .-__cfactx_switch
    8066
    81 //-----------------------------------------------------------------------------
    82 // Stub used to create new stacks which are ready to be context switched to
     67// Stub to create new stacks which can be context switched to
     68//   void __cfactx_invoke_stub( void );
     69
    8370        .text
    8471        .align 2
    85         .globl __cfactx_invoke_stub
    86         .type    __cfactx_invoke_stub, @function
     72        .global __cfactx_invoke_stub
     73        .type __cfactx_invoke_stub, @function
    8774__cfactx_invoke_stub:
    88         movq %rbx, %rdi
     75        movq %rbx, %rdi                                         // move main and this to first two arguments
    8976        movq %r12, %rsi
    90         jmp *%r13
    91         .size  __cfactx_invoke_stub, .-__cfactx_invoke_stub
     77        jmp *%r13                                                       // jmp to invoke
     78        .size __cfactx_invoke_stub, .-__cfactx_invoke_stub
    9279
    9380// Local Variables: //
    94 // mode: c //
     81// mode: asm //
    9582// tab-width: 4 //
    9683// End: //
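
    Note: __cfactx_invoke_stub works because the System V ABI passes the first two integer
    arguments in rdi and rsi; the seeded stack leaves "main" in rbx and "this" in r12, so two
    moves and an indirect jump turn the saved registers into a normal call. A hypothetical C
    analogue of what the stub achieves (names invented for illustration, not libcfa code):

        /* sketch only: the real stub is the three assembly instructions above */
        typedef void (*invoke_fn)( void (*main_fn)(void *), void * this_arg );

        static void stub_analogue( void * saved_rbx, void * saved_r12, invoke_fn saved_r13 ) {
                /* movq %rbx,%rdi ; movq %r12,%rsi ; jmp *%r13 */
                saved_r13( (void (*)(void *))saved_rbx, saved_r12 );
        }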
  • libcfa/src/concurrency/alarm.cfa

    r3c64c668 r58fe85a  
    1010// Created On       : Fri Jun 2 11:31:25 2017
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Sun Jan  5 08:41:36 2020
    13 // Update Count     : 69
     12// Last Modified On : Wed Jun 17 16:11:35 2020
     13// Update Count     : 75
    1414//
    1515
    1616#define __cforall_thread__
    1717
    18 extern "C" {
    1918#include <errno.h>
    2019#include <stdio.h>
     20#include <unistd.h>
    2121#include <string.h>
    22 #include <unistd.h>
    2322#include <sys/time.h>
    24 }
    2523
    2624#include "alarm.hfa"
    27 #include "kernel_private.hfa"
     25#include "kernel/fwd.hfa"
    2826#include "preemption.hfa"
    2927
     
    4745//=============================================================================================
    4846
    49 void ?{}( alarm_node_t & this, $thread * thrd, Time alarm, Duration period ) with( this ) {
     47void ?{}( alarm_node_t & this, $thread * thrd, Time alarm, Duration period) with( this ) {
    5048        this.thrd = thrd;
    5149        this.alarm = alarm;
    5250        this.period = period;
    53         next = 0;
    5451        set = false;
    55         kernel_alarm = false;
     52        type = User;
    5653}
    5754
    58 void ?{}( alarm_node_t & this, processor   * proc, Time alarm, Duration period ) with( this ) {
     55void ?{}( alarm_node_t & this, processor * proc, Time alarm, Duration period ) with( this ) {
    5956        this.proc = proc;
    6057        this.alarm = alarm;
    6158        this.period = period;
    62         next = 0;
    6359        set = false;
    64         kernel_alarm = true;
     60        type = Kernel;
     61}
     62void ?{}( alarm_node_t & this, Alarm_Callback callback, Time alarm, Duration period ) with( this ) {
     63        this.alarm = alarm;
     64        this.period = period;
     65        this.callback = callback;
     66        set = false;
     67        type = Callback;
    6568}
    6669
     
    7174}
    7275
    73 #if !defined(NDEBUG) && (defined(__CFA_DEBUG__) || defined(__CFA_VERIFY__))
    74 bool validate( alarm_list_t * this ) {
    75         alarm_node_t ** it = &this->head;
    76         while( (*it) ) {
    77                 it = &(*it)->next;
     76void insert( alarm_list_t * this, alarm_node_t * n ) {
     77        alarm_node_t * it = & (*this)`first;
     78        while( it && (n->alarm > it->alarm) ) {
     79                it = & (*it)`next;
     80        }
     81        if ( it ) {
     82                insert_before( *it, *n );
     83        } else {
     84                insert_last(*this, *n);
    7885        }
    7986
    80         return it == this->tail;
    81 }
    82 #endif
    83 
    84 static inline void insert_at( alarm_list_t * this, alarm_node_t * n, __alarm_it_t p ) {
    85         verify( !n->next );
    86         if( p == this->tail ) {
    87                 this->tail = &n->next;
    88         }
    89         else {
    90                 n->next = *p;
    91         }
    92         *p = n;
    93 
    94         verify( validate( this ) );
    95 }
    96 
    97 void insert( alarm_list_t * this, alarm_node_t * n ) {
    98         alarm_node_t ** it = &this->head;
    99         while( (*it) && (n->alarm > (*it)->alarm) ) {
    100                 it = &(*it)->next;
    101         }
    102 
    103         insert_at( this, n, it );
    104 
    105         verify( validate( this ) );
     87        verify( validate( *this ) );
    10688}
    10789
    10890alarm_node_t * pop( alarm_list_t * this ) {
    109         alarm_node_t * head = this->head;
     91        verify( validate( *this ) );
     92        alarm_node_t * head = & (*this)`first;
    11093        if( head ) {
    111                 this->head = head->next;
    112                 if( !head->next ) {
    113                         this->tail = &this->head;
    114                 }
    115                 head->next = 0p;
     94                remove(*head);
    11695        }
    117         verify( validate( this ) );
     96        verify( validate( *this ) );
    11897        return head;
    11998}
    12099
    121 static inline void remove_at( alarm_list_t * this, alarm_node_t * n, __alarm_it_t it ) {
    122         verify( it );
    123         verify( (*it) == n );
    124 
    125         (*it) = n->next;
    126         if( !n-> next ) {
    127                 this->tail = it;
    128         }
    129         n->next = 0p;
    130 
    131         verify( validate( this ) );
    132 }
    133 
    134 static inline void remove( alarm_list_t * this, alarm_node_t * n ) {
    135         alarm_node_t ** it = &this->head;
    136         while( (*it) && (*it) != n ) {
    137                 it = &(*it)->next;
    138         }
    139 
    140         verify( validate( this ) );
    141 
    142         if( *it ) { remove_at( this, n, it ); }
    143 
    144         verify( validate( this ) );
    145 }
    146 
    147100void register_self( alarm_node_t * this ) {
    148         alarm_list_t * alarms = &event_kernel->alarms;
     101        alarm_list_t & alarms = event_kernel->alarms;
    149102
    150103        disable_interrupts();
     
    152105        {
    153106                verify( validate( alarms ) );
    154                 bool first = !alarms->head;
     107                bool first = ! & alarms`first;
    155108
    156                 insert( alarms, this );
     109                insert( &alarms, this );
    157110                if( first ) {
    158                         __kernel_set_timer( alarms->head->alarm - __kernel_get_time() );
     111                        __kernel_set_timer( alarms`first.alarm - __kernel_get_time() );
    159112                }
    160113        }
     
    168121        lock( event_kernel->lock __cfaabi_dbg_ctx2 );
    169122        {
    170                 verify( validate( &event_kernel->alarms ) );
    171                 remove( &event_kernel->alarms, this );
     123                verify( validate( event_kernel->alarms ) );
     124                remove( *this );
    172125        }
    173126        unlock( event_kernel->lock );
     
    176129}
    177130
     131//=============================================================================================
     132// Utilities
     133//=============================================================================================
     134
     135void sleep( Duration duration ) {
     136        alarm_node_t node = { active_thread(), __kernel_get_time() + duration, 0`s };
     137
     138        register_self( &node );
     139        park();
     140
     141        /* paranoid */ verify( !node.set );
     142        /* paranoid */ verify( & node`next == 0p );
     143        /* paranoid */ verify( & node`prev == 0p );
     144}
     145
    178146// Local Variables: //
    179147// mode: c //
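
    Note: the new sleep builds a one-shot alarm node for the current thread on its own stack,
    registers it, and parks; the preemption handler unparks the thread when the alarm fires.
    A minimal usage sketch, assuming the standard libcfa thread headers expose sleep:

        #include <thread.hfa>

        int main() {
                sleep( 1`s );    // register a one-shot alarm, park(), resume after ~1 second
        }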
  • libcfa/src/concurrency/alarm.hfa

    r3c64c668 r58fe85a  
    2323#include "time.hfa"
    2424
     25#include "containers/list.hfa"
     26
    2527struct $thread;
    2628struct processor;
     
    3739//=============================================================================================
    3840
     41enum alarm_type{ Kernel = 0, User = 1, Callback = 2 };
     42
     43struct alarm_node_t;
     44
     45typedef void (*Alarm_Callback)(alarm_node_t & );
     46
    3947struct alarm_node_t {
    4048        Time alarm;                             // time when alarm goes off
    4149        Duration period;                        // if > 0 => period of alarm
    42         alarm_node_t * next;            // intrusive link list field
     50
     51        DLISTED_MGD_IMPL_IN(alarm_node_t)
    4352
    4453        union {
    45                 $thread * thrd; // thrd who created event
    46                 processor * proc;               // proc who created event
     54                $thread * thrd;                                 // thrd who created event
     55                processor * proc;                               // proc who created event
     56                Alarm_Callback callback;                // callback to handle event
    4757        };
    4858
     4959        bool set                :1;             // whether or not the alarm has been registered
    50         bool kernel_alarm       :1;             // true if this is not a user defined alarm
      60        enum alarm_type type;           // which union member is active: Kernel, User, or Callback alarm
    5161};
    52 
    53 typedef alarm_node_t ** __alarm_it_t;
     62DLISTED_MGD_IMPL_OUT(alarm_node_t)
    5463
    5564void ?{}( alarm_node_t & this, $thread * thrd, Time alarm, Duration period );
    5665void ?{}( alarm_node_t & this, processor   * proc, Time alarm, Duration period );
     66void ?{}( alarm_node_t & this, Alarm_Callback callback, Time alarm, Duration period );
    5767void ^?{}( alarm_node_t & this );
    5868
    59 struct alarm_list_t {
    60         alarm_node_t * head;
    61         __alarm_it_t tail;
    62 };
    63 
    64 static inline void ?{}( alarm_list_t & this ) with( this ) {
    65         head = 0;
    66         tail = &head;
    67 }
     69typedef dlist(alarm_node_t, alarm_node_t) alarm_list_t;
    6870
    6971void insert( alarm_list_t * this, alarm_node_t * n );
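
    Note: with the move to the managed dlist, insert keeps the list ordered by expiry time:
    walk to the first node with a later alarm and link in front of it, otherwise append. A
    plain-C sketch of the same ordered insert over an intrusive doubly-linked list (types
    hypothetical, not the dlist API):

        struct node { struct node * next, * prev; long alarm; };
        struct list { struct node * head, * tail; };

        static void insert_sorted( struct list * l, struct node * n ) {
                struct node * it = l->head;
                while ( it && n->alarm > it->alarm ) it = it->next;  /* first later alarm */
                if ( it ) {                                          /* insert_before( it, n ) */
                        n->next = it; n->prev = it->prev;
                        if ( it->prev ) it->prev->next = n; else l->head = n;
                        it->prev = n;
                } else {                                             /* insert_last */
                        n->next = 0; n->prev = l->tail;
                        if ( l->tail ) l->tail->next = n; else l->head = n;
                        l->tail = n;
                }
        }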
  • libcfa/src/concurrency/coroutine.cfa

    r3c64c668 r58fe85a  
    1010// Created On       : Mon Nov 28 12:27:26 2016
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Tue Feb  4 12:29:25 2020
    13 // Update Count     : 16
     12// Last Modified On : Tue Dec 15 12:06:04 2020
     13// Update Count     : 23
    1414//
    1515
     
    1818#include "coroutine.hfa"
    1919
    20 extern "C" {
    2120#include <stddef.h>
    2221#include <malloc.h>
     
    2423#include <string.h>
    2524#include <unistd.h>
    26 // use this define to make unwind.h play nice, definetely a hack
    27 #define HIDE_EXPORTS
     25#include <sys/mman.h>                                                                   // mprotect
    2826#include <unwind.h>
    29 #undef HIDE_EXPORTS
    30 #include <sys/mman.h>
    31 }
    3227
    3328#include "kernel_private.hfa"
     29#include "exception.hfa"
     30#include "math.hfa"
     31
     32#define CFA_COROUTINE_USE_MMAP 0
    3433
    3534#define __CFA_INVOKE_PRIVATE__
     
    4746
    4847//-----------------------------------------------------------------------------
     48FORALL_DATA_INSTANCE(CoroutineCancelled, (dtype coroutine_t), (coroutine_t))
     49
     50forall(dtype T)
     51void mark_exception(CoroutineCancelled(T) *) {}
     52
     53forall(dtype T)
     54void copy(CoroutineCancelled(T) * dst, CoroutineCancelled(T) * src) {
     55        dst->virtual_table = src->virtual_table;
     56        dst->the_coroutine = src->the_coroutine;
     57        dst->the_exception = src->the_exception;
     58}
     59
     60forall(dtype T)
     61const char * msg(CoroutineCancelled(T) *) {
     62        return "CoroutineCancelled(...)";
     63}
     64
     65// This code should not be inlined. It is the error path on resume.
     66forall(dtype T | is_coroutine(T))
     67void __cfaehm_cancelled_coroutine( T & cor, $coroutine * desc ) {
     68        verify( desc->cancellation );
     69        desc->state = Cancelled;
     70        exception_t * except = __cfaehm_cancellation_exception( desc->cancellation );
     71
      72        // TODO: Remove explicit vtable set once trac#186 is fixed.
      73        CoroutineCancelled(T) cancelled;
      74        cancelled.virtual_table = &get_exception_vtable(&cancelled);
      75        cancelled.the_coroutine = &cor;
      76        cancelled.the_exception = except;
      77        throwResume cancelled;
     78
     79        except->virtual_table->free( except );
     80        free( desc->cancellation );
     81        desc->cancellation = 0p;
     82}
     83
     84//-----------------------------------------------------------------------------
    4985// Global state variables
    5086
    5187// minimum feasible stack size in bytes
    52 #define MinStackSize 1000
     88static const size_t MinStackSize = 1000;
    5389extern size_t __page_size;                              // architecture pagesize HACK, should go in proper runtime singleton
     90extern int __map_prot;
    5491
    5592void __stack_prepare( __stack_info_t * this, size_t create_size );
     93void __stack_clean  ( __stack_info_t * this );
    5694
    5795//-----------------------------------------------------------------------------
     
    74112        bool userStack = ((intptr_t)this.storage & 0x1) != 0;
    75113        if ( ! userStack && this.storage ) {
    76                 __attribute__((may_alias)) intptr_t * istorage = (intptr_t *)&this.storage;
    77                 *istorage &= (intptr_t)-1;
    78 
    79                 void * storage = this.storage->limit;
    80                 __cfaabi_dbg_debug_do(
    81                         storage = (char*)(storage) - __page_size;
    82                         if ( mprotect( storage, __page_size, PROT_READ | PROT_WRITE ) == -1 ) {
    83                                 abort( "(coStack_t *)%p.^?{}() : internal error, mprotect failure, error(%d) %s.", &this, errno, strerror( errno ) );
    84                         }
    85                 );
    86                 __cfaabi_dbg_print_safe("Kernel : Deleting stack %p\n", storage);
    87                 free( storage );
     114                __stack_clean( &this );
    88115        }
    89116}
     
    101128void ^?{}($coroutine& this) {
    102129        if(this.state != Halted && this.state != Start && this.state != Primed) {
    103                 $coroutine * src = TL_GET( this_thread )->curr_cor;
     130                $coroutine * src = active_coroutine();
    104131                $coroutine * dst = &this;
    105132
     
    134161        assert(__page_size != 0l);
    135162        size_t size = libCeiling( storageSize, 16 ) + stack_data_size;
     163        size = ceiling(size, __page_size);
    136164
    137165        // If we are running debug, we also need to allocate a guardpage to catch stack overflows.
    138166        void * storage;
    139         __cfaabi_dbg_debug_do(
    140                 storage = memalign( __page_size, size + __page_size );
    141         );
    142         __cfaabi_dbg_no_debug_do(
    143                 storage = (void*)malloc(size);
    144         );
    145 
     167        #if CFA_COROUTINE_USE_MMAP
     168                storage = mmap(0p, size + __page_size, PROT_EXEC | PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
     169                if(storage == ((void*)-1)) {
     170                        abort( "coroutine stack creation : internal error, mmap failure, error(%d) %s.", errno, strerror( errno ) );
     171                }
     172                if ( mprotect( storage, __page_size, PROT_NONE ) == -1 ) {
     173                        abort( "coroutine stack creation : internal error, mprotect failure, error(%d) %s.", errno, strerror( errno ) );
     174                } // if
     175                storage = (void *)(((intptr_t)storage) + __page_size);
     176        #else
     177                __cfaabi_dbg_debug_do(
     178                        storage = memalign( __page_size, size + __page_size );
     179                );
     180                __cfaabi_dbg_no_debug_do(
     181                        storage = (void*)malloc(size);
     182                );
     183
     184                __cfaabi_dbg_debug_do(
     185                        if ( mprotect( storage, __page_size, PROT_NONE ) == -1 ) {
     186                                abort( "__stack_alloc : internal error, mprotect failure, error(%d) %s.", (int)errno, strerror( (int)errno ) );
     187                        }
     188                        storage = (void *)(((intptr_t)storage) + __page_size);
     189                );
     190        #endif
    146191        __cfaabi_dbg_print_safe("Kernel : Created stack %p of size %zu\n", storage, size);
    147         __cfaabi_dbg_debug_do(
    148                 if ( mprotect( storage, __page_size, PROT_NONE ) == -1 ) {
    149                         abort( "__stack_alloc : internal error, mprotect failure, error(%d) %s.", (int)errno, strerror( (int)errno ) );
    150                 }
    151                 storage = (void *)(((intptr_t)storage) + __page_size);
    152         );
    153192
    154193        verify( ((intptr_t)storage & (libAlign() - 1)) == 0ul );
    155194        return [storage, size];
     195}
     196
     197void __stack_clean  ( __stack_info_t * this ) {
     198        size_t size = ((intptr_t)this->storage->base) - ((intptr_t)this->storage->limit) + sizeof(__stack_t);
     199        void * storage = this->storage->limit;
     200
     201        #if CFA_COROUTINE_USE_MMAP
     202                storage = (void *)(((intptr_t)storage) - __page_size);
     203                if(munmap(storage, size + __page_size) == -1) {
     204                        abort( "coroutine stack destruction : internal error, munmap failure, error(%d) %s.", errno, strerror( errno ) );
     205                }
     206        #else
     207                __cfaabi_dbg_debug_do(
     208                        storage = (char*)(storage) - __page_size;
     209                        if ( mprotect( storage, __page_size, __map_prot ) == -1 ) {
     210                                abort( "(coStack_t *)%p.^?{}() : internal error, mprotect failure, error(%d) %s.", &this, errno, strerror( errno ) );
     211                        }
     212                );
     213
     214                free( storage );
     215        #endif
     216        __cfaabi_dbg_print_safe("Kernel : Deleting stack %p\n", storage);
    156217}
    157218
     
    175236                size = libFloor(create_size - stack_data_size - diff, libAlign());
    176237        } // if
    177         assertf( size >= MinStackSize, "Stack size %zd provides less than minimum of %d bytes for a stack.", size, MinStackSize );
    178 
    179         this->storage = (__stack_t *)((intptr_t)storage + size);
     238        assertf( size >= MinStackSize, "Stack size %zd provides less than minimum of %zd bytes for a stack.", size, MinStackSize );
     239
     240        this->storage = (__stack_t *)((intptr_t)storage + size - sizeof(__stack_t));
    180241        this->storage->limit = storage;
    181         this->storage->base  = (void*)((intptr_t)storage + size);
     242        this->storage->base  = (void*)((intptr_t)storage + size - sizeof(__stack_t));
     243        this->storage->exception_context.top_resume = 0p;
     244        this->storage->exception_context.current_exception = 0p;
    182245        __attribute__((may_alias)) intptr_t * istorage = (intptr_t*)&this->storage;
    183246        *istorage |= userStack ? 0x1 : 0x0;
     
    205268
    206269        struct $coroutine * __cfactx_cor_finish(void) {
    207                 struct $coroutine * cor = kernelTLS.this_thread->curr_cor;
     270                struct $coroutine * cor = active_coroutine();
    208271
    209272                if(cor->state == Primed) {
    210                         suspend();
     273                        __cfactx_suspend();
    211274                }
    212275
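
    Note: the mmap path reserves one extra page below the usable stack and protects it
    PROT_NONE, so an overflow faults immediately instead of corrupting adjacent memory. A
    standalone C sketch of the technique (flags simplified; the code above additionally
    requests PROT_EXEC and reports failures through abort):

        #include <stdio.h>
        #include <stdlib.h>
        #include <unistd.h>
        #include <sys/mman.h>

        static void * alloc_guarded_stack( size_t size ) {
                size_t page = sysconf( _SC_PAGESIZE );
                void * storage = mmap( 0, size + page, PROT_READ | PROT_WRITE,
                                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0 );
                if ( storage == MAP_FAILED ) { perror( "mmap" ); exit( 1 ); }
                /* lowest page becomes the guard: any overflow past the limit faults */
                if ( mprotect( storage, page, PROT_NONE ) == -1 ) { perror( "mprotect" ); exit( 1 ); }
                return (char *)storage + page;   /* usable stack starts above the guard */
        }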
  • libcfa/src/concurrency/coroutine.hfa

    r3c64c668 r58fe85a  
    1818#include <assert.h>
    1919#include "invoke.h"
     20#include "../exception.hfa"
     21
     22//-----------------------------------------------------------------------------
     23// Exception thrown from resume when a coroutine stack is cancelled.
     24FORALL_DATA_EXCEPTION(CoroutineCancelled, (dtype coroutine_t), (coroutine_t)) (
     25        coroutine_t * the_coroutine;
     26        exception_t * the_exception;
     27);
     28
     29forall(dtype T)
     30void copy(CoroutineCancelled(T) * dst, CoroutineCancelled(T) * src);
     31
     32forall(dtype T)
     33const char * msg(CoroutineCancelled(T) *);
    2034
    2135//-----------------------------------------------------------------------------
     
    2337// Anything that implements this trait can be resumed.
    2438// Anything that is resumed is a coroutine.
    25 trait is_coroutine(dtype T) {
    26       void main(T & this);
    27       $coroutine * get_coroutine(T & this);
     39trait is_coroutine(dtype T | IS_RESUMPTION_EXCEPTION(CoroutineCancelled, (T))) {
     40        void main(T & this);
     41        $coroutine * get_coroutine(T & this);
    2842};
    2943
     
    4660//-----------------------------------------------------------------------------
    4761// Public coroutine API
    48 static inline void suspend(void);
    49 
    50 forall(dtype T | is_coroutine(T))
    51 static inline T & resume(T & cor);
    52 
    5362forall(dtype T | is_coroutine(T))
    5463void prime(T & cor);
    5564
    56 static inline struct $coroutine * active_coroutine() { return TL_GET( this_thread )->curr_cor; }
     65static inline struct $coroutine * active_coroutine() { return active_thread()->curr_cor; }
    5766
    5867//-----------------------------------------------------------------------------
     
    7584static inline void $ctx_switch( $coroutine * src, $coroutine * dst ) __attribute__((nonnull (1, 2))) {
    7685        // set state of current coroutine to inactive
    77         src->state = src->state == Halted ? Halted : Inactive;
     86        src->state = src->state == Halted ? Halted : Blocked;
    7887
    7988        // set new coroutine that task is executing
    80         TL_GET( this_thread )->curr_cor = dst;
     89        active_thread()->curr_cor = dst;
    8190
    8291        // context switch to specified coroutine
     
    93102}
    94103
    95 extern void __stack_prepare   ( __stack_info_t * this, size_t size /* ignored if storage already allocated */);
     104extern void __stack_prepare( __stack_info_t * this, size_t size /* ignored if storage already allocated */);
     105extern void __stack_clean  ( __stack_info_t * this );
     106
    96107
    97108// Suspend implementation inlined for performance
    98 static inline void suspend(void) {
    99         // optimization : read TLS once and reuse it
    100         // Safety note: this is preemption safe since if
    101         // preemption occurs after this line, the pointer
    102         // will also migrate which means this value will
    103         // stay in syn with the TLS
    104         $coroutine * src = TL_GET( this_thread )->curr_cor;
     109extern "C" {
     110        static inline void __cfactx_suspend(void) {
     111                // optimization : read TLS once and reuse it
     112                // Safety note: this is preemption safe since if
     113                // preemption occurs after this line, the pointer
     114                // will also migrate which means this value will
      115                // stay in sync with the TLS
     116                $coroutine * src = active_coroutine();
    105117
    106         assertf( src->last != 0,
    107                 "Attempt to suspend coroutine \"%.256s\" (%p) that has never been resumed.\n"
    108                 "Possible cause is a suspend executed in a member called by a coroutine user rather than by the coroutine main.",
    109                 src->name, src );
    110         assertf( src->last->state != Halted,
    111                 "Attempt by coroutine \"%.256s\" (%p) to suspend back to terminated coroutine \"%.256s\" (%p).\n"
    112                 "Possible cause is terminated coroutine's main routine has already returned.",
    113                 src->name, src, src->last->name, src->last );
     118                assertf( src->last != 0,
     119                        "Attempt to suspend coroutine \"%.256s\" (%p) that has never been resumed.\n"
     120                        "Possible cause is a suspend executed in a member called by a coroutine user rather than by the coroutine main.",
     121                        src->name, src );
     122                assertf( src->last->state != Halted,
     123                        "Attempt by coroutine \"%.256s\" (%p) to suspend back to terminated coroutine \"%.256s\" (%p).\n"
     124                        "Possible cause is terminated coroutine's main routine has already returned.",
     125                        src->name, src, src->last->name, src->last );
    114126
    115         $ctx_switch( src, src->last );
     127                $ctx_switch( src, src->last );
     128        }
    116129}
     130
     131forall(dtype T | is_coroutine(T))
     132void __cfaehm_cancelled_coroutine( T & cor, $coroutine * desc );
    117133
    118134// Resume implementation inlined for performance
     
    124140        // will also migrate which means this value will
     125141        // stay in sync with the TLS
    126         $coroutine * src = TL_GET( this_thread )->curr_cor;
     142        $coroutine * src = active_coroutine();
    127143        $coroutine * dst = get_coroutine(cor);
    128144
    129145        if( unlikely(dst->context.SP == 0p) ) {
    130                 TL_GET( this_thread )->curr_cor = dst;
    131146                __stack_prepare(&dst->stack, 65000);
    132147                __cfactx_start(main, dst, cor, __cfactx_invoke_coroutine);
    133                 TL_GET( this_thread )->curr_cor = src;
    134148        }
    135149
     
    148162        // always done for performance testing
    149163        $ctx_switch( src, dst );
     164        if ( unlikely(dst->cancellation) ) {
     165                __cfaehm_cancelled_coroutine( cor, dst );
     166        }
    150167
    151168        return cor;
     
    158175        // will also migrate which means this value will
     159176        // stay in sync with the TLS
    160         $coroutine * src = TL_GET( this_thread )->curr_cor;
     177        $coroutine * src = active_coroutine();
    161178
    162179        // not resuming self ?
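
    Note: resume now carries the cancellation check on its exit path. Condensed control flow
    of the inlined resume above (not verbatim; forall constraints elided):

        T & resume( T & cor ) {
                $coroutine * src = active_coroutine();
                $coroutine * dst = get_coroutine( cor );

                if ( dst->context.SP == 0p ) {                  // first resume: no stack yet
                        __stack_prepare( &dst->stack, 65000 );  // allocate and seed the stack
                        __cfactx_start( main, dst, cor, __cfactx_invoke_coroutine );
                }

                $ctx_switch( src, dst );                        // run the coroutine
                if ( dst->cancellation )                        // it ended with a pending exception
                        __cfaehm_cancelled_coroutine( cor, dst );
                return cor;
        }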
  • libcfa/src/concurrency/invoke.c

    r3c64c668 r58fe85a  
    1010// Created On       : Tue Jan 17 12:27:26 2016
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Fri Feb  9 16:37:42 2018
    13 // Update Count     : 5
     12// Last Modified On : Sat Oct 24 14:35:28 2020
     13// Update Count     : 32
    1414//
    1515
     
    109109
    110110        struct FakeStack {
    111             void *fixedRegisters[3];              // fixed registers ebx, edi, esi (popped on 1st uSwitch, values unimportant)
    112             void *rturn;                          // where to go on return from uSwitch
    113             void *dummyReturn;                    // fake return compiler would have pushed on call to uInvoke
    114             void *argument[3];                    // for 16-byte ABI, 16-byte alignment starts here
    115             void *padding;                        // padding to force 16-byte alignment, as "base" is 16-byte aligned
     111            void *fixedRegisters[3];                                            // fixed registers ebx, edi, esi (popped on 1st uSwitch, values unimportant)
     112            void *rturn;                                                                        // where to go on return from uSwitch
     113            void *dummyReturn;                                                          // fake return compiler would have pushed on call to uInvoke
     114            void *argument[3];                                                          // for 16-byte ABI, 16-byte alignment starts here
     115            void *padding;                                                                      // padding to force 16-byte alignment, as "base" is 16-byte aligned
    116116        };
    117117
     
    122122
    123123        fs->dummyReturn = NULL;
    124         fs->argument[0] = main;     // argument to invoke
    125         fs->argument[1] = this;     // argument to invoke
     124        fs->argument[0] = main;                                                         // argument to invoke
     125        fs->argument[1] = this;                                                         // argument to invoke
    126126        fs->rturn = invoke;
    127127
     
    129129
    130130        struct FakeStack {
    131                 void *fixedRegisters[5];            // fixed registers rbx, r12, r13, r14, r15
    132                 void *rturn;                        // where to go on return from uSwitch
    133                 void *dummyReturn;                  // NULL return address to provide proper alignment
     131                void *fixedRegisters[5];                                                // fixed registers rbx, r12, r13, r14, r15
     132                void *rturn;                                                                    // where to go on return from uSwitch
     133                void *dummyReturn;                                                              // NULL return address to provide proper alignment
    134134        };
    135135
    136136        cor->context.SP = (char *)stack->base - sizeof( struct FakeStack );
    137         cor->context.FP = NULL;         // terminate stack with NULL fp
     137        cor->context.FP = NULL;                                                         // terminate stack with NULL fp
    138138
    139139        struct FakeStack *fs = (struct FakeStack *)cor->context.SP;
     
    141141        fs->dummyReturn = NULL;
    142142        fs->rturn = __cfactx_invoke_stub;
    143         fs->fixedRegisters[0] = main;
    144         fs->fixedRegisters[1] = this;
     143        fs->fixedRegisters[0] = main;                                           // argument to invoke
     144        fs->fixedRegisters[1] = this;                                           // argument to invoke
    145145        fs->fixedRegisters[2] = invoke;
    146146
    147 #elif defined( __ARM_ARCH )
    148 #error ARM needs to be upgrade to use to parameters like X86/X64 (A.K.A. : I broke this and do not know how to fix it)
     147#elif defined( __ARM_ARCH_32 )
      148#error ARM needs to be upgraded to use two parameters like X86/X64 (A.K.A. : I broke this and do not know how to fix it)
     149        // More details about the error:
     150        // To avoid the thunk problem, I changed the invoke routine to pass the main explicitly
     151        // instead of relying on an assertion. This effectively hoists any required thunk one level
     152        // which was enough to get to global scope in most cases.
     153        // This means that __cfactx_invoke_... now takes two parameters and the FakeStack needs
     154        // to be adjusted as a consequence of that.
     155        // I don't know how to do that for ARM, hence the #error
     156
    149157        struct FakeStack {
    150                 float fpRegs[16];                       // floating point registers
    151                 void *intRegs[9];                       // integer/pointer registers
    152                 void *arg[2];                           // placeholder for this pointer
     158                float fpRegs[16];                                                               // floating point registers
     159                void * intRegs[9];                                                              // integer/pointer registers
     160                void * arg[2];                                                                  // placeholder for this pointer
    153161        };
    154162
     
    162170        fs->arg[1] = invoke;
    163171
     172#elif defined( __ARM_ARCH )
     173        struct FakeStack {
     174                void * intRegs[12];                                                             // x19-x30 integer registers
     175                double fpRegs[8];                                                               // v8-v15 floating point
     176        };
     177
     178        cor->context.SP = (char *)stack->base - sizeof( struct FakeStack );
     179        cor->context.FP = NULL;
     180
     181        struct FakeStack *fs = (struct FakeStack *)cor->context.SP;
     182
     183        fs->intRegs[0] = main;                                                          // argument to invoke x19 => x0
     184        fs->intRegs[1] = this;                                                          // argument to invoke x20 => x1
     185        fs->intRegs[2] = invoke;
     186        fs->intRegs[11] = __cfactx_invoke_stub;                         // link register x30 => ret moves to pc
    164187#else
     165188        #error unknown hardware architecture
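
    Note: each FakeStack layout works by making a brand-new stack indistinguishable from one
    on which __cfactx_switch has already saved a context: the first switch "restores" the
    seeded registers and returns into __cfactx_invoke_stub. Sketch of the x86_64 case,
    mirroring the structure above:

        /* the first __cfactx_switch pops fixedRegisters into rbx,r12,r13,r14,r15,
         * then ret pops rturn into rip, landing in __cfactx_invoke_stub */
        struct FakeStack {
                void * fixedRegisters[5];   /* rbx = main, r12 = this, r13 = invoke, ... */
                void * rturn;               /* __cfactx_invoke_stub */
                void * dummyReturn;         /* NULL, keeps 16-byte alignment */
        };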
  • libcfa/src/concurrency/invoke.h

    r3c64c668 r58fe85a  
    1717#include "bits/defs.hfa"
    1818#include "bits/locks.hfa"
     19#include "kernel/fwd.hfa"
    1920
    2021#ifdef __cforall
     
    2627#define _INVOKE_H_
    2728
    28 #ifdef __ARM_ARCH
    29         // function prototypes are only really used by these macros on ARM
    30         void disable_global_interrupts();
    31         void enable_global_interrupts();
    32 
    33         #define TL_GET( member ) ( { __typeof__( kernelTLS.member ) target; \
    34                 disable_global_interrupts(); \
    35                 target = kernelTLS.member; \
    36                 enable_global_interrupts(); \
    37                 target; } )
    38         #define TL_SET( member, value ) disable_global_interrupts(); \
    39                 kernelTLS.member = value; \
    40                 enable_global_interrupts();
    41 #else
    42         #define TL_GET( member ) kernelTLS.member
    43         #define TL_SET( member, value ) kernelTLS.member = value;
    44 #endif
    45 
    46         #ifdef __cforall
    47         extern "Cforall" {
    48                 extern __attribute__((aligned(128))) thread_local struct KernelThreadData {
    49                         struct $thread    * volatile this_thread;
    50                         struct processor      * volatile this_processor;
    51 
    52                         struct {
    53                                 volatile unsigned short disable_count;
    54                                 volatile bool enabled;
    55                                 volatile bool in_progress;
    56                         } preemption_state;
    57 
    58                         uint32_t rand_seed;
    59                 } kernelTLS __attribute__ ((tls_model ( "initial-exec" )));
    60         }
    61         #endif
     29        struct __cfaehm_try_resume_node;
     30        struct __cfaehm_base_exception_t;
     31        struct exception_context_t {
     32                struct __cfaehm_try_resume_node * top_resume;
     33                struct __cfaehm_base_exception_t * current_exception;
     34        };
    6235
    6336        struct __stack_context_t {
     
    8558                // base of stack
    8659                void * base;
     60
     61                // Information for exception handling.
     62                struct exception_context_t exception_context;
    8763        };
    8864
     
    9268        };
    9369
    94         enum coroutine_state { Halted, Start, Primed, Inactive, Active, Rerun };
    95         enum __Preemption_Reason { __NO_PREEMPTION, __ALARM_PREEMPTION, __POLL_PREEMPTION, __MANUAL_PREEMPTION };
     70        enum __Coroutine_State { Halted, Start, Primed, Blocked, Ready, Active, Cancelled, Halting };
    9671
    9772        struct $coroutine {
     
    10681
    10782                // current execution status for coroutine
    108                 enum coroutine_state state;
     83                enum __Coroutine_State state;
    10984
    11085                // first coroutine to resume this one
     
    11893
    11994        };
     95        // Wrapper for gdb
     96        struct cfathread_coroutine_t { struct $coroutine debug; };
     97
     98        static inline struct __stack_t * __get_stack( struct $coroutine * cor ) {
     99                return (struct __stack_t*)(((uintptr_t)cor->stack.storage) & ((uintptr_t)-2));
     100        }
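
    Note: __get_stack strips a tag: bit 0 of stack.storage records whether the stack is
    user-supplied, which is safe because the storage pointer is at least 2-byte aligned;
    masking with (uintptr_t)-2 recovers the real pointer. A tiny C sketch of the scheme:

        #include <stdint.h>

        static inline void * untag( void * storage )         { return (void *)((uintptr_t)storage & ~(uintptr_t)1); }
        static inline int    is_user_stack( void * storage ) { return (int)((uintptr_t)storage & 1); }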
    120101
    121102        // struct which calls the monitor is accepting
     
    150131                struct __condition_node_t * dtor_node;
    151132        };
     133        // Wrapper for gdb
     134        struct cfathread_monitor_t { struct $monitor debug; };
    152135
    153136        struct __monitor_group_t {
     
    157140                // last function that acquired monitors
    158141                fptr_t func;
     142        };
     143
      144        // Linked list fields
      145        // intrusive link field for threads
     146        struct __thread_desc_link {
     147                struct $thread * next;
     148                struct $thread * prev;
     149                volatile unsigned long long ts;
     150                int preferred;
    159151        };
    160152
     
    165157
    166158                // current execution status for coroutine
    167                 volatile int state;
    168                 enum __Preemption_Reason preempted;
     159                // Possible values are:
     160                //    - TICKET_BLOCKED (-1) thread is blocked
     161                //    - TICKET_RUNNING ( 0) thread is running
     162                //    - TICKET_UNBLOCK ( 1) thread should ignore next block
     163                volatile int ticket;
     164                enum __Coroutine_State state:8;
     165                enum __Preemption_Reason preempted:8;
    169166
     170167                //SKULLDUGGERY: errno is not saved in the thread data structure because returnToKernel appears to be the only function that requires saving and restoring it
     168
     169                // pointer to the cluster on which the thread is running
     170                struct cluster * curr_cluster;
     171
      172        // Linked list fields
      173        // intrusive link field for threads
     174                struct __thread_desc_link link;
    171175
    172176                // coroutine body used to store context
     
    182186                struct $monitor *  self_mon_p;
    183187
    184                 // pointer to the cluster on which the thread is running
    185                 struct cluster * curr_cluster;
    186 
    187188                // monitors currently held by this thread
    188189                struct __monitor_group_t monitors;
    189190
    190                 // Link lists fields
    191                 // instrusive link field for threads
    192                 struct $thread * next;
     191                // used to put threads on user data structures
     192                struct {
     193                        struct $thread * next;
     194                        struct $thread * back;
     195                } seqable;
    193196
    194197                struct {
     
    196199                        struct $thread * prev;
    197200                } node;
    198         };
     201
     202                #if defined( __CFA_WITH_VERIFY__ )
     203                        void * canary;
     204                #endif
     205        };
     206        // Wrapper for gdb
     207        struct cfathread_thread_t { struct $thread debug; };
     208
     209        #ifdef __CFA_DEBUG__
     210                void __cfaabi_dbg_record_thrd($thread & this, bool park, const char prev_name[]);
     211        #else
     212                #define __cfaabi_dbg_record_thrd(x, y, z)
     213        #endif
    199214
    200215        #ifdef __cforall
    201216        extern "Cforall" {
     217
    202218                static inline $thread *& get_next( $thread & this ) __attribute__((const)) {
    203                         return this.next;
     219                        return this.link.next;
    204220                }
    205221
    206222                static inline [$thread *&, $thread *& ] __get( $thread & this ) __attribute__((const)) {
    207223                        return this.node.[next, prev];
     224                }
     225
     226                static inline $thread *& Back( $thread * this ) __attribute__((const)) {
     227                        return this->seqable.back;
     228                }
     229
     230                static inline $thread *& Next( $thread * this ) __attribute__((const)) {
     231                        return this->seqable.next;
     232                }
     233
     234                static inline bool listed( $thread * this ) {
     235                        return this->seqable.next != 0p;
    208236                }
    209237
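
    Note: the ticket field replaces the old state-based handshake for park/unpark: an unpark
    that races ahead of the park leaves TICKET_UNBLOCK behind, so the next park returns
    immediately instead of sleeping. A hedged sketch of the idea using GCC atomics
    (block_self/wake are hypothetical scheduler hooks, not libcfa's implementation):

        #define TICKET_BLOCKED (-1)   /* thread is blocked          */
        #define TICKET_RUNNING ( 0)   /* thread is running          */
        #define TICKET_UNBLOCK ( 1)   /* next park should not block */

        extern void block_self( void );   /* hypothetical: yield to the scheduler */
        extern void wake( void );         /* hypothetical: make the thread runnable */

        void park_sketch( volatile int * ticket ) {
                int old = __atomic_fetch_sub( ticket, 1, __ATOMIC_SEQ_CST );
                if ( old == TICKET_UNBLOCK ) return;   /* consumed a pending unpark */
                block_self();                          /* ticket is now BLOCKED: sleep */
        }

        void unpark_sketch( volatile int * ticket ) {
                int old = __atomic_fetch_add( ticket, 1, __ATOMIC_SEQ_CST );
                if ( old == TICKET_BLOCKED ) wake();   /* target was asleep: wake it */
                /* old == TICKET_RUNNING: ticket now UNBLOCK, next park returns early */
        }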
  • libcfa/src/concurrency/kernel.cfa

    r3c64c668 r58fe85a  
    1010// Created On       : Tue Jan 17 12:27:26 2017
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Tue Feb  4 13:03:15 2020
    13 // Update Count     : 58
     12// Last Modified On : Mon Aug 31 07:08:20 2020
     13// Update Count     : 71
    1414//
    1515
    1616#define __cforall_thread__
     17// #define __CFA_DEBUG_PRINT_RUNTIME_CORE__
    1718
    1819//C Includes
    19 #include <stddef.h>
    2020#include <errno.h>
    21 #include <string.h>
    22 extern "C" {
    2321#include <stdio.h>
    24 #include <fenv.h>
    25 #include <sys/resource.h>
    2622#include <signal.h>
    2723#include <unistd.h>
    28 #include <limits.h>                                                                             // PTHREAD_STACK_MIN
    29 #include <sys/mman.h>                                                                   // mprotect
    30 }
    3124
    3225//CFA Includes
    33 #include "time.hfa"
    3426#include "kernel_private.hfa"
    3527#include "preemption.hfa"
    36 #include "startup.hfa"
    3728
    3829//Private includes
     
    4031#include "invoke.h"
    4132
     33
    4234//-----------------------------------------------------------------------------
    4335// Some assembly required
    4436#if defined( __i386 )
    45         #define CtxGet( ctx )        \
    46                 __asm__ volatile (     \
    47                         "movl %%esp,%0\n"\
    48                         "movl %%ebp,%1\n"\
    49                         : "=rm" (ctx.SP),\
    50                                 "=rm" (ctx.FP) \
    51                 )
    52 
    5337        // mxcr : SSE Status and Control bits (control bits are preserved across function calls)
    5438        // fcw  : X87 FPU control word (preserved across function calls)
     
    7256
    7357#elif defined( __x86_64 )
    74         #define CtxGet( ctx )        \
    75                 __asm__ volatile (     \
    76                         "movq %%rsp,%0\n"\
    77                         "movq %%rbp,%1\n"\
    78                         : "=rm" (ctx.SP),\
    79                                 "=rm" (ctx.FP) \
    80                 )
    81 
    8258        #define __x87_store         \
    8359                uint32_t __mxcr;      \
     
    9874                )
    9975
    100 
    101 #elif defined( __ARM_ARCH )
    102 #define CtxGet( ctx ) __asm__ ( \
    103                 "mov %0,%%sp\n"   \
    104                 "mov %1,%%r11\n"   \
    105         : "=rm" (ctx.SP), "=rm" (ctx.FP) )
     76#elif defined( __arm__ )
     77        #define __x87_store
     78        #define __x87_load
     79
     80#elif defined( __aarch64__ )
     81        #define __x87_store              \
     82                uint32_t __fpcntl[2];    \
     83                __asm__ volatile (    \
     84                        "mrs x9, FPCR\n" \
     85                        "mrs x10, FPSR\n"  \
     86                        "stp x9, x10, %0\n"  \
     87                        : "=m" (__fpcntl) : : "x9", "x10" \
     88                )
     89
     90        #define __x87_load         \
     91                __asm__ volatile (    \
     92                        "ldp x9, x10, %0\n"  \
     93                        "msr FPSR, x10\n"  \
     94                        "msr FPCR, x9\n" \
     95                : "=m" (__fpcntl) : : "x9", "x10" \
     96                )
     97
    10698#else
    107         #error unknown hardware architecture
     99        #error unsupported hardware architecture
    108100#endif
    109101
     102extern $thread * mainThread;
     103extern processor * mainProcessor;
     104
    110105//-----------------------------------------------------------------------------
    111 //Start and stop routine for the kernel, declared first to make sure they run first
    112 static void __kernel_startup (void) __attribute__(( constructor( STARTUP_PRIORITY_KERNEL ) ));
    113 static void __kernel_shutdown(void) __attribute__(( destructor ( STARTUP_PRIORITY_KERNEL ) ));
    114 
    115 //-----------------------------------------------------------------------------
    116 // Kernel storage
    117 KERNEL_STORAGE(cluster,         mainCluster);
    118 KERNEL_STORAGE(processor,       mainProcessor);
    119 KERNEL_STORAGE($thread, mainThread);
    120 KERNEL_STORAGE(__stack_t,       mainThreadCtx);
    121 
    122 cluster     * mainCluster;
    123 processor   * mainProcessor;
    124 $thread * mainThread;
    125 
    126 extern "C" {
    127         struct { __dllist_t(cluster) list; __spinlock_t lock; } __cfa_dbg_global_clusters;
    128 }
    129 
    130 size_t __page_size = 0;
    131 
    132 //-----------------------------------------------------------------------------
    133 // Global state
    134 thread_local struct KernelThreadData kernelTLS __attribute__ ((tls_model ( "initial-exec" ))) = {
    135         NULL,                                                                                           // cannot use 0p
    136         NULL,
    137         { 1, false, false },
     138         6u // this should be seeded better, but due to a bug calling rdtsc doesn't work
    139 };
    140 
    141 //-----------------------------------------------------------------------------
    142 // Struct to steal stack
    143 struct current_stack_info_t {
    144         __stack_t * storage;                                                            // pointer to stack object
    145         void * base;                                                                            // base of stack
    146         void * limit;                                                                           // stack grows towards stack limit
    147         void * context;                                                                         // address of cfa_context_t
    148 };
    149 
    150 void ?{}( current_stack_info_t & this ) {
    151         __stack_context_t ctx;
    152         CtxGet( ctx );
    153         this.base = ctx.FP;
    154 
    155         rlimit r;
    156         getrlimit( RLIMIT_STACK, &r);
    157         size_t size = r.rlim_cur;
    158 
    159         this.limit = (void *)(((intptr_t)this.base) - size);
    160         this.context = &storage_mainThreadCtx;
    161 }
    162 
    163 //-----------------------------------------------------------------------------
    164 // Main thread construction
    165 
    166 void ?{}( $coroutine & this, current_stack_info_t * info) with( this ) {
    167         stack.storage = info->storage;
    168         with(*stack.storage) {
    169                 limit     = info->limit;
    170                 base      = info->base;
    171         }
    172         __attribute__((may_alias)) intptr_t * istorage = (intptr_t*) &stack.storage;
    173         *istorage |= 0x1;
    174         name = "Main Thread";
    175         state = Start;
    176         starter = 0p;
    177         last = 0p;
    178         cancellation = 0p;
    179 }
    180 
    181 void ?{}( $thread & this, current_stack_info_t * info) with( this ) {
    182         state = Start;
    183         self_cor{ info };
    184         curr_cor = &self_cor;
    185         curr_cluster = mainCluster;
    186         self_mon.owner = &this;
    187         self_mon.recursion = 1;
    188         self_mon_p = &self_mon;
    189         next = 0p;
    190 
    191         node.next = 0p;
    192         node.prev = 0p;
    193         doregister(curr_cluster, this);
    194 
    195         monitors{ &self_mon_p, 1, (fptr_t)0 };
    196 }
    197 
    198 //-----------------------------------------------------------------------------
    199 // Processor coroutine
    200 void ?{}(processorCtx_t & this) {
    201 
    202 }
    203 
    204 // Construct the processor context of non-main processors
    205 static void ?{}(processorCtx_t & this, processor * proc, current_stack_info_t * info) {
    206         (this.__cor){ info };
    207         this.proc = proc;
    208 }
    209 
    210 static void * __invoke_processor(void * arg);
    211 
    212 void ?{}(processor & this, const char name[], cluster & cltr) with( this ) {
    213         this.name = name;
    214         this.cltr = &cltr;
    215         terminated{ 0 };
    216         destroyer = 0p;
    217         do_terminate = false;
    218         preemption_alarm = 0p;
    219         pending_preemption = false;
    220         runner.proc = &this;
    221 
    222         idleLock{};
    223 
    224         __cfaabi_dbg_print_safe("Kernel : Starting core %p\n", &this);
    225 
    226         this.stack = __create_pthread( &this.kernel_thread, __invoke_processor, (void *)&this );
    227 
    228         __cfaabi_dbg_print_safe("Kernel : core %p started\n", &this);
    229 }
    230 
    231 void ^?{}(processor & this) with( this ){
    232         if( ! __atomic_load_n(&do_terminate, __ATOMIC_ACQUIRE) ) {
    233                 __cfaabi_dbg_print_safe("Kernel : core %p signaling termination\n", &this);
    234 
    235                 __atomic_store_n(&do_terminate, true, __ATOMIC_RELAXED);
    236                 wake( &this );
    237 
    238                 P( terminated );
    239                 verify( kernelTLS.this_processor != &this);
    240         }
    241 
    242         pthread_join( kernel_thread, 0p );
    243         free( this.stack );
    244 }
    245 
    246 void ?{}(cluster & this, const char name[], Duration preemption_rate) with( this ) {
    247         this.name = name;
    248         this.preemption_rate = preemption_rate;
    249         ready_queue{};
    250         ready_queue_lock{};
    251 
    252         procs{ __get };
    253         idles{ __get };
    254         threads{ __get };
    255 
    256         doregister(this);
    257 }
    258 
    259 void ^?{}(cluster & this) {
    260         unregister(this);
    261 }
     106// Kernel Scheduling logic
     107static $thread * __next_thread(cluster * this);
     108static $thread * __next_thread_slow(cluster * this);
     109static void __run_thread(processor * this, $thread * dst);
     110static void __wake_one(cluster * cltr);
     111
     112static void push  (__cluster_idles & idles, processor & proc);
     113static void remove(__cluster_idles & idles, processor & proc);
     114static [unsigned idle, unsigned total, * processor] query( & __cluster_idles idles );
     115
    262116
    263117//=============================================================================================
    264118// Kernel Scheduling logic
    265119//=============================================================================================
    266 static $thread * __next_thread(cluster * this);
    267 static void __run_thread(processor * this, $thread * dst);
    268 static void __halt(processor * this);
    269 
    270120//Main of the processor contexts
    271121void main(processorCtx_t & runner) {
     272122        // Because of a bug, we couldn't initialize the seed on construction
    273123        // Do it here
    274         kernelTLS.rand_seed ^= rdtscl();
     124        __cfaabi_tls.rand_seed ^= rdtscl();
     125        __cfaabi_tls.ready_rng.fwd_seed = 25214903917_l64u * (rdtscl() ^ (uintptr_t)&runner);
     126        __tls_rand_advance_bck();
    275127
    276128        processor * this = runner.proc;
    277129        verify(this);
    278130
    279         __cfaabi_dbg_print_safe("Kernel : core %p starting\n", this);
    280 
    281         doregister(this->cltr, this);
     131        __cfadbg_print_safe(runtime_core, "Kernel : core %p starting\n", this);
     132        #if !defined(__CFA_NO_STATISTICS__)
     133                if( this->print_halts ) {
     134                        __cfaabi_bits_print_safe( STDOUT_FILENO, "Processor : %d - %s (%p)\n", this->id, this->name, (void*)this);
     135                }
     136        #endif
    282137
    283138        {
     
    285140                preemption_scope scope = { this };
    286141
    287                 __cfaabi_dbg_print_safe("Kernel : core %p started\n", this);
     142                __cfadbg_print_safe(runtime_core, "Kernel : core %p started\n", this);
    288143
    289144                $thread * readyThread = 0p;
    290                 for( unsigned int spin_count = 0; ! __atomic_load_n(&this->do_terminate, __ATOMIC_SEQ_CST); spin_count++ ) {
     145                MAIN_LOOP:
     146                for() {
     147                        // Try to get the next thread
    291148                        readyThread = __next_thread( this->cltr );
    292149
    293                         if(readyThread) {
    294                                 /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
    295                                 /* paranoid */ verifyf( readyThread->state == Inactive || readyThread->state == Start || readyThread->preempted != __NO_PREEMPTION, "state : %d, preempted %d\n", readyThread->state, readyThread->preempted);
    296                                 /* paranoid */ verifyf( readyThread->next == 0p, "Expected null got %p", readyThread->next );
    297 
    298                                 __run_thread(this, readyThread);
    299 
    300                                 /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
    301 
    302                                 spin_count = 0;
    303                         } else {
    304                                 // spin(this, &spin_count);
    305                                 __halt(this);
     150                        if( !readyThread ) {
     151                                readyThread = __next_thread_slow( this->cltr );
    306152                        }
    307                 }
    308 
    309                 __cfaabi_dbg_print_safe("Kernel : core %p stopping\n", this);
    310         }
    311 
    312         unregister(this->cltr, this);
     153
     154                        HALT:
     155                        if( !readyThread ) {
     156                                // Don't block if we are done
     157                                if( __atomic_load_n(&this->do_terminate, __ATOMIC_SEQ_CST) ) break MAIN_LOOP;
     158
     159                                #if !defined(__CFA_NO_STATISTICS__)
     160                                        __tls_stats()->ready.sleep.halts++;
     161                                #endif
     162
     163                                // Push self to idle stack
     164                                push(this->cltr->idles, * this);
     165
     166                                // Confirm the ready-queue is empty
     167                                readyThread = __next_thread_slow( this->cltr );
     168                                if( readyThread ) {
     169                                        // A thread was found, cancel the halt
     170                                        remove(this->cltr->idles, * this);
     171
     172                                        #if !defined(__CFA_NO_STATISTICS__)
     173                                                __tls_stats()->ready.sleep.cancels++;
     174                                        #endif
     175
      176                                        // continue the main loop
     177                                        break HALT;
     178                                }
     179
     180                                #if !defined(__CFA_NO_STATISTICS__)
     181                                        if(this->print_halts) {
     182                                                __cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 0\n", this->id, rdtscl());
     183                                        }
     184                                #endif
     185
     186                                wait( this->idle );
     187
     188                                #if !defined(__CFA_NO_STATISTICS__)
     189                                        if(this->print_halts) {
     190                                                __cfaabi_bits_print_safe( STDOUT_FILENO, "PH:%d - %lld 1\n", this->id, rdtscl());
     191                                        }
     192                                #endif
     193
     194                                // We were woken up, remove self from idle
     195                                remove(this->cltr->idles, * this);
     196
     197                                // DON'T just proceed, start looking again
     198                                continue MAIN_LOOP;
     199                        }
     200
     201                        /* paranoid */ verify( readyThread );
     202
      203                // We found a thread, run it
     204                        __run_thread(this, readyThread);
     205
     206                        // Are we done?
     207                        if( __atomic_load_n(&this->do_terminate, __ATOMIC_SEQ_CST) ) break MAIN_LOOP;
     208                }
     209
     210                __cfadbg_print_safe(runtime_core, "Kernel : core %p stopping\n", this);
     211        }
    313212
    314213        V( this->terminated );
    315214
    316         __cfaabi_dbg_print_safe("Kernel : core %p terminated\n", this);
     215        if(this == mainProcessor) {
     216                // HACK : the coroutine context switch expects this_thread to be set
      217                // and it makes sense for it to be set in all other cases except here
     218                // fake it
     219                __cfaabi_tls.this_thread = mainThread;
     220        }
     221
     222        __cfadbg_print_safe(runtime_core, "Kernel : core %p terminated\n", this);
    317223}
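
The HALT path above is the classic sleeper/waker handshake: publish yourself as idle first, re-check the queue second, and only then block, so a thread pushed between the check and the sleep can never be lost. A minimal sketch of the same protocol in plain C (try_pop, publish_idle, retract_idle and proc_t are stand-ins, not the runtime's API):

        #include <semaphore.h>

        typedef struct {
                sem_t idle;                            // per-processor sleep semaphore
        } proc_t;

        extern void * try_pop( void );                 // hypothetical: 0 when queue empty
        extern void publish_idle( proc_t * );          // hypothetical: push self to idle list
        extern void retract_idle( proc_t * );          // hypothetical: remove self from it

        static void * next_or_sleep( proc_t * this ) {
                void * work = try_pop();
                if( work ) return work;

                publish_idle( this );                  // step 1: become visible to wakers
                work = try_pop();                      // step 2: re-confirm emptiness
                if( work ) {
                        retract_idle( this );          // raced with a push: cancel the halt
                        return work;
                }
                sem_wait( &this->idle );               // step 3: block until a waker posts
                retract_idle( this );                  // woken: go look again
                return 0;
        }

Note that the main loop above deliberately continues after waking instead of assuming a thread is available; a wakeup only means "look again".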
    318224
     
    324230// from the processor coroutine to the target thread
    325231static void __run_thread(processor * this, $thread * thrd_dst) {
     232        /* paranoid */ verify( ! __preemption_enabled() );
     233        /* paranoid */ verifyf( thrd_dst->state == Ready || thrd_dst->preempted != __NO_PREEMPTION, "state : %d, preempted %d\n", thrd_dst->state, thrd_dst->preempted);
     234        /* paranoid */ verifyf( thrd_dst->link.next == 0p, "Expected null got %p", thrd_dst->link.next );
     235        __builtin_prefetch( thrd_dst->context.SP );
     236
    326237        $coroutine * proc_cor = get_coroutine(this->runner);
    327 
    328         // Update global state
    329         kernelTLS.this_thread = thrd_dst;
    330238
    331239        // set state of processor coroutine to inactive
    332240        verify(proc_cor->state == Active);
    333         proc_cor->state = Inactive;
     241        proc_cor->state = Blocked;
    334242
    335243        // Actually run the thread
    336244        RUNNING:  while(true) {
    337                 if(unlikely(thrd_dst->preempted)) {
    338                         thrd_dst->preempted = __NO_PREEMPTION;
    339                         verify(thrd_dst->state == Active || thrd_dst->state == Rerun);
    340                 } else {
    341                         verify(thrd_dst->state == Start || thrd_dst->state == Primed || thrd_dst->state == Inactive);
    342                         thrd_dst->state = Active;
    343                 }
    344 
    345                 /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
     245                thrd_dst->preempted = __NO_PREEMPTION;
     246                thrd_dst->state = Active;
     247
     248                // Update global state
     249                kernelTLS().this_thread = thrd_dst;
     250
     251                /* paranoid */ verify( ! __preemption_enabled() );
     252                /* paranoid */ verify( kernelTLS().this_thread == thrd_dst );
     253                /* paranoid */ verify( thrd_dst->curr_cluster == this->cltr );
     254                /* paranoid */ verify( thrd_dst->context.SP );
     255                /* paranoid */ verify( thrd_dst->state != Halted );
     256                /* paranoid */ verifyf( ((uintptr_t)thrd_dst->context.SP) < ((uintptr_t)__get_stack(thrd_dst->curr_cor)->base ) || thrd_dst->curr_cor == proc_cor, "ERROR : Destination $thread %p has been corrupted.\n StackPointer too small.\n", thrd_dst ); // add escape condition if we are setting up the processor
     257                /* paranoid */ verifyf( ((uintptr_t)thrd_dst->context.SP) > ((uintptr_t)__get_stack(thrd_dst->curr_cor)->limit) || thrd_dst->curr_cor == proc_cor, "ERROR : Destination $thread %p has been corrupted.\n StackPointer too large.\n", thrd_dst ); // add escape condition if we are setting up the processor
     258                /* paranoid */ verify( 0x0D15EA5E0D15EA5Ep == thrd_dst->canary );
     259
     260
    346261
    347262                // set context switch to the thread that the processor is executing
    348                 verify( thrd_dst->context.SP );
    349263                __cfactx_switch( &proc_cor->context, &thrd_dst->context );
    350264                // when __cfactx_switch returns we are back in the processor coroutine
    351265
    352                 /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
    353 
     266                /* paranoid */ verify( 0x0D15EA5E0D15EA5Ep == thrd_dst->canary );
     267                /* paranoid */ verifyf( ((uintptr_t)thrd_dst->context.SP) > ((uintptr_t)__get_stack(thrd_dst->curr_cor)->limit), "ERROR : Destination $thread %p has been corrupted.\n StackPointer too large.\n", thrd_dst );
     268                /* paranoid */ verifyf( ((uintptr_t)thrd_dst->context.SP) < ((uintptr_t)__get_stack(thrd_dst->curr_cor)->base ), "ERROR : Destination $thread %p has been corrupted.\n StackPointer too small.\n", thrd_dst );
     269                /* paranoid */ verify( thrd_dst->context.SP );
     270                /* paranoid */ verify( thrd_dst->curr_cluster == this->cltr );
     271                /* paranoid */ verify( kernelTLS().this_thread == thrd_dst );
     272                /* paranoid */ verify( ! __preemption_enabled() );
     273
     274                // Reset global state
     275                kernelTLS().this_thread = 0p;
    354276
    355277                // We just finished running a thread, there are a few things that could have happened.
     356278                // 1 - Regular case : the thread has blocked and no one has scheduled it yet.
    357279                // 2 - Racy case    : the thread has blocked but someone has already tried to schedule it.
    358                 // 3 - Polite Racy case : the thread has blocked, someone has already tried to schedule it, but the thread is nice and wants to go through the ready-queue any way
    359280                // 4 - Preempted
    360281                // In case 1, we may have won a race so we can't write to the state again.
    361282                // In case 2, we lost the race so we now own the thread.
    362                 // In case 3, we lost the race but can just reschedule the thread.
    363283
    364284                if(unlikely(thrd_dst->preempted != __NO_PREEMPTION)) {
     
    368288                }
    369289
     290                if(unlikely(thrd_dst->state == Halting)) {
     291                        // The thread has halted, it should never be scheduled/run again
     292                        // finish the thread
     293                        __thread_finish( thrd_dst );
     294                        break RUNNING;
     295                }
     296
     297                /* paranoid */ verify( thrd_dst->state == Active );
     298                thrd_dst->state = Blocked;
     299
    370300                // set state of processor coroutine to active and the thread to inactive
    371                 static_assert(sizeof(thrd_dst->state) == sizeof(int));
    372                 enum coroutine_state old_state = __atomic_exchange_n(&thrd_dst->state, Inactive, __ATOMIC_SEQ_CST);
    373                 switch(old_state) {
    374                         case Halted:
    375                                 // The thread has halted, it should never be scheduled/run again, leave it back to Halted and move on
    376                                 thrd_dst->state = Halted;
    377 
     378                                 // We may need to wake someone up here
    379                                 unpark( this->destroyer );
    380                                 this->destroyer = 0p;
    381                                 break RUNNING;
    382                         case Active:
     301                int old_ticket = __atomic_fetch_sub(&thrd_dst->ticket, 1, __ATOMIC_SEQ_CST);
     302                switch(old_ticket) {
     303                        case TICKET_RUNNING:
    383304                                // This is case 1, the regular case, nothing more is needed
    384305                                break RUNNING;
    385                         case Rerun:
     306                        case TICKET_UNBLOCK:
    386307                                // This is case 2, the racy case, someone tried to run this thread before it finished blocking
    387308                                // In this case, just run it again.
     
    389310                        default:
     390311                                // This makes no sense, something is wrong, abort
    391                                 abort("Finished running a thread that was Inactive/Start/Primed %d\n", old_state);
     312                                abort();
    392313                }
    393314        }
     
    395316        // Just before returning to the processor, set the processor coroutine to active
    396317        proc_cor->state = Active;
     318
     319        /* paranoid */ verify( ! __preemption_enabled() );
    397320}
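
The ticket arithmetic above resolves the park/unpark race with a single atomic counter: the parking side decrements, the waking side increments, and whichever side moves the ticket second learns it lost the race and takes responsibility for the thread. A sketch with assumed encodings TICKET_BLOCKED = 0, TICKET_RUNNING = 1, TICKET_UNBLOCK = 2 (the real values live in the runtime headers):

        #include <stdatomic.h>
        #include <stdbool.h>
        #include <stdlib.h>

        enum { TICKET_BLOCKED = 0, TICKET_RUNNING = 1, TICKET_UNBLOCK = 2 };

        // runner side: called once the thread has stopped running
        static bool stay_blocked( atomic_int * ticket ) {
                int old = atomic_fetch_sub( ticket, 1 );
                if( old == TICKET_RUNNING ) return true;   // case 1: park won, leave it blocked
                if( old == TICKET_UNBLOCK ) return false;  // case 2: unpark already came, rerun it
                abort();                                   // corrupted ticket
        }

        // waker side: unpark
        static bool must_schedule( atomic_int * ticket ) {
                int old = atomic_fetch_add( ticket, 1 );
                if( old == TICKET_RUNNING ) return false;  // thread not blocked yet; it will see
                                                           // UNBLOCK when it tries to block
                if( old == TICKET_BLOCKED ) return true;   // it is blocked; the waker now owns it
                abort();                                   // double unpark
        }

Because each side uses one atomic read-modify-write, exactly one of them observes the other's move, which is what makes cases 1 and 2 mutually exclusive.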
    398321
    399322// KERNEL_ONLY
    400323void returnToKernel() {
    401         /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
    402         $coroutine * proc_cor = get_coroutine(kernelTLS.this_processor->runner);
    403         $thread * thrd_src = kernelTLS.this_thread;
     324        /* paranoid */ verify( ! __preemption_enabled() );
     325        $coroutine * proc_cor = get_coroutine(kernelTLS().this_processor->runner);
     326        $thread * thrd_src = kernelTLS().this_thread;
     327
     328        #if !defined(__CFA_NO_STATISTICS__)
     329                struct processor * last_proc = kernelTLS().this_processor;
     330        #endif
    404331
    405332        // Run the thread on this processor
     
    409336                        __x87_store;
    410337                #endif
    411                 verify( proc_cor->context.SP );
     338                /* paranoid */ verify( proc_cor->context.SP );
     339                /* paranoid */ verify( 0x0D15EA5E0D15EA5Ep == thrd_src->canary );
    412340                __cfactx_switch( &thrd_src->context, &proc_cor->context );
     341                /* paranoid */ verify( 0x0D15EA5E0D15EA5Ep == thrd_src->canary );
    413342                #if defined( __i386 ) || defined( __x86_64 )
    414343                        __x87_load;
     
    417346        }
    418347
    419         /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
    420 }
    421 
    422 // KERNEL_ONLY
    423 // Context invoker for processors
    424 // This is the entry point for processors (kernel threads)
    425 // It effectively constructs a coroutine by stealing the pthread stack
    426 static void * __invoke_processor(void * arg) {
    427         processor * proc = (processor *) arg;
    428         kernelTLS.this_processor = proc;
    429         kernelTLS.this_thread    = 0p;
    430         kernelTLS.preemption_state.[enabled, disable_count] = [false, 1];
    431         // SKULLDUGGERY: We want to create a context for the processor coroutine
    432         // which is needed for the 2-step context switch. However, there is no reason
     433         // to waste the perfectly valid stack created by pthread.
    434         current_stack_info_t info;
    435         __stack_t ctx;
    436         info.storage = &ctx;
    437         (proc->runner){ proc, &info };
    438 
    439         __cfaabi_dbg_print_safe("Coroutine : created stack %p\n", get_coroutine(proc->runner)->stack.storage);
    440 
    441         //Set global state
    442         kernelTLS.this_thread = 0p;
    443 
    444         //We now have a proper context from which to schedule threads
    445         __cfaabi_dbg_print_safe("Kernel : core %p created (%p, %p)\n", proc, &proc->runner, &ctx);
    446 
    447         // SKULLDUGGERY: Since the coroutine doesn't have its own stack, we can't
     448         // resume it to start it as we normally would; it would just context switch
    449         // back to here. Instead directly call the main since we already are on the
    450         // appropriate stack.
    451         get_coroutine(proc->runner)->state = Active;
    452         main( proc->runner );
    453         get_coroutine(proc->runner)->state = Halted;
    454 
    455         // Main routine of the core returned, the core is now fully terminated
    456         __cfaabi_dbg_print_safe("Kernel : core %p main ended (%p)\n", proc, &proc->runner);
    457 
    458         return 0p;
    459 }
    460 
    461 static void Abort( int ret, const char func[] ) {
    462         if ( ret ) {                                                                            // pthread routines return errno values
    463                 abort( "%s : internal error, error(%d) %s.", func, ret, strerror( ret ) );
    464         } // if
    465 } // Abort
    466 
    467 void * __create_pthread( pthread_t * pthread, void * (*start)(void *), void * arg ) {
    468         pthread_attr_t attr;
    469 
    470         Abort( pthread_attr_init( &attr ), "pthread_attr_init" ); // initialize attribute
    471 
    472         size_t stacksize;
    473         // default stack size, normally defined by shell limit
    474         Abort( pthread_attr_getstacksize( &attr, &stacksize ), "pthread_attr_getstacksize" );
    475         assert( stacksize >= PTHREAD_STACK_MIN );
    476 
    477         void * stack;
    478         __cfaabi_dbg_debug_do(
    479                 stack = memalign( __page_size, stacksize + __page_size );
     480                 // pthread has no mechanism to create the guard page in a user-supplied stack.
    481                 if ( mprotect( stack, __page_size, PROT_NONE ) == -1 ) {
    482                         abort( "mprotect : internal error, mprotect failure, error(%d) %s.", errno, strerror( errno ) );
    483                 } // if
    484         );
    485         __cfaabi_dbg_no_debug_do(
    486                 stack = malloc( stacksize );
    487         );
    488 
    489         Abort( pthread_attr_setstack( &attr, stack, stacksize ), "pthread_attr_setstack" );
    490 
    491         Abort( pthread_create( pthread, &attr, start, arg ), "pthread_create" );
    492         return stack;
    493 }
    494 
    495 // KERNEL_ONLY
    496 static void __kernel_first_resume( processor * this ) {
    497         $thread * src = mainThread;
    498         $coroutine * dst = get_coroutine(this->runner);
    499 
    500         verify( ! kernelTLS.preemption_state.enabled );
    501 
    502         kernelTLS.this_thread->curr_cor = dst;
    503         __stack_prepare( &dst->stack, 65000 );
    504         __cfactx_start(main, dst, this->runner, __cfactx_invoke_coroutine);
    505 
    506         verify( ! kernelTLS.preemption_state.enabled );
    507 
    508         dst->last = &src->self_cor;
    509         dst->starter = dst->starter ? dst->starter : &src->self_cor;
    510 
    511         // set state of current coroutine to inactive
    512         src->state = src->state == Halted ? Halted : Inactive;
    513 
    514         // context switch to specified coroutine
    515         verify( dst->context.SP );
    516         __cfactx_switch( &src->context, &dst->context );
    517         // when __cfactx_switch returns we are back in the src coroutine
    518 
    519         mainThread->curr_cor = &mainThread->self_cor;
    520 
    521         // set state of new coroutine to active
    522         src->state = Active;
    523 
    524         verify( ! kernelTLS.preemption_state.enabled );
    525 }
    526 
    527 // KERNEL_ONLY
    528 static void __kernel_last_resume( processor * this ) {
    529         $coroutine * src = &mainThread->self_cor;
    530         $coroutine * dst = get_coroutine(this->runner);
    531 
    532         verify( ! kernelTLS.preemption_state.enabled );
    533         verify( dst->starter == src );
    534         verify( dst->context.SP );
    535 
    536         // context switch to the processor
    537         __cfactx_switch( &src->context, &dst->context );
     348        #if !defined(__CFA_NO_STATISTICS__)
     349                if(last_proc != kernelTLS().this_processor) {
     350                        __tls_stats()->ready.threads.migration++;
     351                }
     352        #endif
     353
     354        /* paranoid */ verify( ! __preemption_enabled() );
     355        /* paranoid */ verifyf( ((uintptr_t)thrd_src->context.SP) < ((uintptr_t)__get_stack(thrd_src->curr_cor)->base ), "ERROR : Returning $thread %p has been corrupted.\n StackPointer too small.\n", thrd_src );
     356        /* paranoid */ verifyf( ((uintptr_t)thrd_src->context.SP) > ((uintptr_t)__get_stack(thrd_src->curr_cor)->limit), "ERROR : Returning $thread %p has been corrupted.\n StackPointer too large.\n", thrd_src );
    538357}
    539358
     
    541360// Scheduler routines
    542361// KERNEL ONLY
    543 void __schedule_thread( $thread * thrd ) with( *thrd->curr_cluster ) {
    544         /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
     362void __schedule_thread( $thread * thrd ) {
     363        /* paranoid */ verify( ! __preemption_enabled() );
     364        /* paranoid */ verify( kernelTLS().this_proc_id );
     365        /* paranoid */ verify( thrd );
     366        /* paranoid */ verify( thrd->state != Halted );
     367        /* paranoid */ verify( thrd->curr_cluster );
    545368        /* paranoid */ #if defined( __CFA_WITH_VERIFY__ )
    546         /* paranoid */ if( thrd->state == Inactive || thrd->state == Start ) assertf( thrd->preempted == __NO_PREEMPTION,
    547                           "Error inactive thread marked as preempted, state %d, preemption %d\n", thrd->state, thrd->preempted );
    548         /* paranoid */ if( thrd->preempted != __NO_PREEMPTION ) assertf(thrd->state == Active || thrd->state == Rerun,
    549                           "Error preempted thread marked as not currently running, state %d, preemption %d\n", thrd->state, thrd->preempted );
     369        /* paranoid */  if( thrd->state == Blocked || thrd->state == Start ) assertf( thrd->preempted == __NO_PREEMPTION,
     370                                        "Error inactive thread marked as preempted, state %d, preemption %d\n", thrd->state, thrd->preempted );
     371        /* paranoid */  if( thrd->preempted != __NO_PREEMPTION ) assertf(thrd->state == Active,
     372                                        "Error preempted thread marked as not currently running, state %d, preemption %d\n", thrd->state, thrd->preempted );
    550373        /* paranoid */ #endif
    551         /* paranoid */ verifyf( thrd->next == 0p, "Expected null got %p", thrd->next );
    552 
    553         lock  ( ready_queue_lock __cfaabi_dbg_ctx2 );
    554         bool was_empty = !(ready_queue != 0);
    555         append( ready_queue, thrd );
    556         unlock( ready_queue_lock );
    557 
    558         if(was_empty) {
    559                 lock      (proc_list_lock __cfaabi_dbg_ctx2);
    560                 if(idles) {
    561                         wake_fast(idles.head);
    562                 }
    563                 unlock    (proc_list_lock);
    564         }
    565         else if( struct processor * idle = idles.head ) {
    566                 wake_fast(idle);
    567         }
    568 
    569         /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
     374        /* paranoid */ verifyf( thrd->link.next == 0p, "Expected null got %p", thrd->link.next );
     375        /* paranoid */ verify( 0x0D15EA5E0D15EA5Ep == thrd->canary );
     376
     377
     378        if (thrd->preempted == __NO_PREEMPTION) thrd->state = Ready;
     379
     380        ready_schedule_lock();
     381                // Dereference the thread now because once we push it, there is not guaranteed it's still valid.
     382                struct cluster * cl = thrd->curr_cluster;
     383
     384                // push the thread to the cluster ready-queue
     385                push( cl, thrd );
     386
     387                // variable thrd is no longer safe to use
     388
      389                // wake the cluster using the saved variable.
     390                __wake_one( cl );
     391        ready_schedule_unlock();
     392
     393        /* paranoid */ verify( ! __preemption_enabled() );
    570394}
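
The ordering inside the locked region above matters: every field of thrd that the scheduler still needs must be read before the push, because another processor can pop and run the thread the instant it lands in the ready-queue, after which it may block, migrate, or exit. A sketch of that discipline, with hypothetical push/wake helpers:

        struct cluster;
        struct thread_desc { struct cluster * curr_cluster; /* ... */ };

        extern void push( struct cluster *, struct thread_desc * );  // hypothetical
        extern void wake_one( struct cluster * );                    // hypothetical

        static void schedule( struct thread_desc * thrd ) {
                struct cluster * cl = thrd->curr_cluster;  // read BEFORE publishing
                push( cl, thrd );                          // thrd escapes here
                // touching thrd->anything past this point is a use-after-publish race
                wake_one( cl );                            // so wake with the saved pointer
        }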
    571395
    572396// KERNEL ONLY
    573 static $thread * __next_thread(cluster * this) with( *this ) {
    574         /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
    575 
    576         lock( ready_queue_lock __cfaabi_dbg_ctx2 );
    577         $thread * head = pop_head( ready_queue );
    578         unlock( ready_queue_lock );
    579 
    580         /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
    581         return head;
     397static inline $thread * __next_thread(cluster * this) with( *this ) {
     398        /* paranoid */ verify( ! __preemption_enabled() );
     399        /* paranoid */ verify( kernelTLS().this_proc_id );
     400
     401        ready_schedule_lock();
     402                $thread * thrd = pop( this );
     403        ready_schedule_unlock();
     404
     405        /* paranoid */ verify( kernelTLS().this_proc_id );
     406        /* paranoid */ verify( ! __preemption_enabled() );
     407        return thrd;
     408}
     409
     410// KERNEL ONLY
     411static inline $thread * __next_thread_slow(cluster * this) with( *this ) {
     412        /* paranoid */ verify( ! __preemption_enabled() );
     413        /* paranoid */ verify( kernelTLS().this_proc_id );
     414
     415        ready_schedule_lock();
     416                $thread * thrd = pop_slow( this );
     417        ready_schedule_unlock();
     418
     419        /* paranoid */ verify( kernelTLS().this_proc_id );
     420        /* paranoid */ verify( ! __preemption_enabled() );
     421        return thrd;
    582422}
    583423
     
    585425        if( !thrd ) return;
    586426
    587         disable_interrupts();
    588         static_assert(sizeof(thrd->state) == sizeof(int));
    589         enum coroutine_state old_state = __atomic_exchange_n(&thrd->state, Rerun, __ATOMIC_SEQ_CST);
    590         switch(old_state) {
    591                 case Active:
     427        int old_ticket = __atomic_fetch_add(&thrd->ticket, 1, __ATOMIC_SEQ_CST);
     428        switch(old_ticket) {
     429                case TICKET_RUNNING:
    592430                        // Wake won the race, the thread will reschedule/rerun itself
    593431                        break;
    594                 case Inactive:
     432                case TICKET_BLOCKED:
    595433                        /* paranoid */ verify( ! thrd->preempted != __NO_PREEMPTION );
    596 
    597                         // Wake lost the race,
    598                         thrd->state = Inactive;
    599                         __schedule_thread( thrd );
     434                        /* paranoid */ verify( thrd->state == Blocked );
     435
     436                        {
     437                                /* paranoid */ verify( publicTLS_get(this_proc_id) );
     438                                bool full = publicTLS_get(this_proc_id)->full_proc;
     439                                if(full) disable_interrupts();
     440
     441                                /* paranoid */ verify( ! __preemption_enabled() );
     442
     443                                // Wake lost the race,
     444                                __schedule_thread( thrd );
     445
     446                                /* paranoid */ verify( ! __preemption_enabled() );
     447
     448                                if(full) enable_interrupts( __cfaabi_dbg_ctx );
     449                                /* paranoid */ verify( publicTLS_get(this_proc_id) );
     450                        }
     451
    600452                        break;
    601                 case Rerun:
    602                         abort("More than one thread attempted to schedule thread %p\n", thrd);
    603                         break;
    604                 case Halted:
    605                 case Start:
    606                 case Primed:
    607453                default:
     608454                        // This makes no sense, something is wrong, abort
    609                         abort();
    610         }
     455                        abort("Thread %p (%s) has mismatch park/unpark\n", thrd, thrd->self_cor.name);
     456        }
     457}
     458
     459void park( void ) {
     460        /* paranoid */ verify( __preemption_enabled() );
     461        disable_interrupts();
     462        /* paranoid */ verify( ! __preemption_enabled() );
     463        /* paranoid */ verify( kernelTLS().this_thread->preempted == __NO_PREEMPTION );
     464
     465        returnToKernel();
     466
     467        /* paranoid */ verify( ! __preemption_enabled() );
    611468        enable_interrupts( __cfaabi_dbg_ctx );
    612 }
    613 
    614 void park( void ) {
    615         /* paranoid */ verify( kernelTLS.preemption_state.enabled );
    616         disable_interrupts();
    617         /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
    618         /* paranoid */ verify( kernelTLS.this_thread->preempted == __NO_PREEMPTION );
    619 
    620         returnToKernel();
    621 
    622         /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
    623         enable_interrupts( __cfaabi_dbg_ctx );
    624         /* paranoid */ verify( kernelTLS.preemption_state.enabled );
    625 
    626 }
    627 
    628 // KERNEL ONLY
    629 void __leave_thread() {
    630         /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
    631         returnToKernel();
    632         abort();
     469        /* paranoid */ verify( __preemption_enabled() );
     470
     471}
     472
     473extern "C" {
     474        // Leave the thread monitor
     475        // last routine called by a thread.
     476        // Should never return
     477        void __cfactx_thrd_leave() {
     478                $thread * thrd = active_thread();
     479                $monitor * this = &thrd->self_mon;
     480
     481                // Lock the monitor now
     482                lock( this->lock __cfaabi_dbg_ctx2 );
     483
     484                disable_interrupts();
     485
     486                /* paranoid */ verify( ! __preemption_enabled() );
     487                /* paranoid */ verify( thrd->state == Active );
     488                /* paranoid */ verify( 0x0D15EA5E0D15EA5Ep == thrd->canary );
     489                /* paranoid */ verify( kernelTLS().this_thread == thrd );
     490                /* paranoid */ verify( thrd->context.SP );
     491                /* paranoid */ verifyf( ((uintptr_t)thrd->context.SP) > ((uintptr_t)__get_stack(thrd->curr_cor)->limit), "ERROR : $thread %p has been corrupted.\n StackPointer too large.\n", thrd );
     492                /* paranoid */ verifyf( ((uintptr_t)thrd->context.SP) < ((uintptr_t)__get_stack(thrd->curr_cor)->base ), "ERROR : $thread %p has been corrupted.\n StackPointer too small.\n", thrd );
     493
     494                thrd->state = Halting;
     495                if( TICKET_RUNNING != thrd->ticket ) { abort( "Thread terminated with pending unpark" ); }
     496                if( thrd != this->owner ) { abort( "Thread internal monitor has incorrect owner" ); }
     497                if( this->recursion != 1) { abort( "Thread internal monitor has unbalanced recursion" ); }
     498
     499                // Leave the thread
     500                returnToKernel();
     501
     502                // Control flow should never reach here!
     503                abort();
     504        }
    633505}
    634506
    635507// KERNEL ONLY
    636508bool force_yield( __Preemption_Reason reason ) {
    637         /* paranoid */ verify( kernelTLS.preemption_state.enabled );
     509        /* paranoid */ verify( __preemption_enabled() );
    638510        disable_interrupts();
    639         /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
    640 
    641         $thread * thrd = kernelTLS.this_thread;
    642         /* paranoid */ verify(thrd->state == Active || thrd->state == Rerun);
     511        /* paranoid */ verify( ! __preemption_enabled() );
     512
     513        $thread * thrd = kernelTLS().this_thread;
     514        /* paranoid */ verify(thrd->state == Active);
    643515
    644516        // SKULLDUGGERY: It is possible that we are preempting this thread just before
     
    647519        // If that is the case, abandon the preemption.
    648520        bool preempted = false;
    649         if(thrd->next == 0p) {
     521        if(thrd->link.next == 0p) {
    650522                preempted = true;
    651523                thrd->preempted = reason;
     
    653525        }
    654526
    655         /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
     527        /* paranoid */ verify( ! __preemption_enabled() );
    656528        enable_interrupts_noPoll();
    657         /* paranoid */ verify( kernelTLS.preemption_state.enabled );
     529        /* paranoid */ verify( __preemption_enabled() );
    658530
    659531        return preempted;
     
    661533
    662534//=============================================================================================
    663 // Kernel Setup logic
     535// Kernel Idle Sleep
    664536//=============================================================================================
    665 //-----------------------------------------------------------------------------
    666 // Kernel boot procedures
    667 static void __kernel_startup(void) {
    668         verify( ! kernelTLS.preemption_state.enabled );
    669         __cfaabi_dbg_print_safe("Kernel : Starting\n");
    670 
    671         __page_size = sysconf( _SC_PAGESIZE );
    672 
    673         __cfa_dbg_global_clusters.list{ __get };
    674         __cfa_dbg_global_clusters.lock{};
    675 
    676         // Initialize the main cluster
    677         mainCluster = (cluster *)&storage_mainCluster;
    678         (*mainCluster){"Main Cluster"};
    679 
    680         __cfaabi_dbg_print_safe("Kernel : Main cluster ready\n");
    681 
    682         // Start by initializing the main thread
    683         // SKULLDUGGERY: the mainThread steals the process main thread
    684         // which will then be scheduled by the mainProcessor normally
    685         mainThread = ($thread *)&storage_mainThread;
    686         current_stack_info_t info;
    687         info.storage = (__stack_t*)&storage_mainThreadCtx;
    688         (*mainThread){ &info };
    689 
    690         __cfaabi_dbg_print_safe("Kernel : Main thread ready\n");
    691 
    692 
    693 
    694         // Construct the processor context of the main processor
    695         void ?{}(processorCtx_t & this, processor * proc) {
    696                 (this.__cor){ "Processor" };
    697                 this.__cor.starter = 0p;
    698                 this.proc = proc;
    699         }
    700 
    701         void ?{}(processor & this) with( this ) {
    702                 name = "Main Processor";
    703                 cltr = mainCluster;
    704                 terminated{ 0 };
    705                 do_terminate = false;
    706                 preemption_alarm = 0p;
    707                 pending_preemption = false;
    708                 kernel_thread = pthread_self();
    709 
    710                 runner{ &this };
    711                 __cfaabi_dbg_print_safe("Kernel : constructed main processor context %p\n", &runner);
    712         }
    713 
    714         // Initialize the main processor and the main processor ctx
    715         // (the coroutine that contains the processing control flow)
    716         mainProcessor = (processor *)&storage_mainProcessor;
    717         (*mainProcessor){};
    718 
    719         //initialize the global state variables
    720         kernelTLS.this_processor = mainProcessor;
    721         kernelTLS.this_thread    = mainThread;
    722 
    723         // Enable preemption
    724         kernel_start_preemption();
    725 
    726         // Add the main thread to the ready queue
    727         // once resume is called on mainProcessor->runner the mainThread needs to be scheduled like any normal thread
    728         __schedule_thread(mainThread);
    729 
    730         // SKULLDUGGERY: Force a context switch to the main processor to set the main thread's context to the current UNIX
    731         // context. Hence, the main thread does not begin through __cfactx_invoke_thread, like all other threads. The trick here is that
    732         // mainThread is on the ready queue when this call is made.
    733         __kernel_first_resume( kernelTLS.this_processor );
    734 
    735 
    736 
    737         // THE SYSTEM IS NOW COMPLETELY RUNNING
    738         __cfaabi_dbg_print_safe("Kernel : Started\n--------------------------------------------------\n\n");
    739 
    740         verify( ! kernelTLS.preemption_state.enabled );
     537// Wake a thread from the front if there are any
     538static void __wake_one(cluster * this) {
     539        /* paranoid */ verify( ! __preemption_enabled() );
     540        /* paranoid */ verify( ready_schedule_islocked() );
     541
     542        // Check if there is a sleeping processor
     543        processor * p;
     544        unsigned idle;
     545        unsigned total;
     546        [idle, total, p] = query(this->idles);
     547
     548        // If no one is sleeping, we are done
     549        if( idle == 0 ) return;
     550
     551        // We found a processor, wake it up
     552        post( p->idle );
     553
     554        #if !defined(__CFA_NO_STATISTICS__)
     555                __tls_stats()->ready.sleep.wakes++;
     556        #endif
     557
     558        /* paranoid */ verify( ready_schedule_islocked() );
     559        /* paranoid */ verify( ! __preemption_enabled() );
     560
     561        return;
     562}
     563
      564// Unconditionally wake a thread
     565void __wake_proc(processor * this) {
     566        __cfadbg_print_safe(runtime_core, "Kernel : waking Processor %p\n", this);
     567
     568        disable_interrupts();
     569                /* paranoid */ verify( ! __preemption_enabled() );
     570                post( this->idle );
    741571        enable_interrupts( __cfaabi_dbg_ctx );
    742         verify( TL_GET( preemption_state.enabled ) );
    743 }
    744 
    745 static void __kernel_shutdown(void) {
    746         __cfaabi_dbg_print_safe("\n--------------------------------------------------\nKernel : Shutting down\n");
    747 
    748         verify( TL_GET( preemption_state.enabled ) );
    749         disable_interrupts();
    750         verify( ! kernelTLS.preemption_state.enabled );
    751 
     752         // SKULLDUGGERY: Notify the mainProcessor it needs to terminate.
     753         // When its coroutine terminates, it returns control to the mainThread
    754         // which is currently here
    755         __atomic_store_n(&mainProcessor->do_terminate, true, __ATOMIC_RELEASE);
    756         __kernel_last_resume( kernelTLS.this_processor );
    757         mainThread->self_cor.state = Halted;
    758 
    759         // THE SYSTEM IS NOW COMPLETELY STOPPED
    760 
    761         // Disable preemption
    762         kernel_stop_preemption();
    763 
    764         // Destroy the main processor and its context in reverse order of construction
    765         // These were manually constructed so we need manually destroy them
    766         ^(mainProcessor->runner){};
    767         ^(mainProcessor){};
    768 
    769         // Final step, destroy the main thread since it is no longer needed
     770         // Since we provided a stack to this task it will not destroy anything
    771         ^(mainThread){};
    772 
    773         ^(__cfa_dbg_global_clusters.list){};
    774         ^(__cfa_dbg_global_clusters.lock){};
    775 
    776         __cfaabi_dbg_print_safe("Kernel : Shutdown complete\n");
    777 }
    778 
    779 //=============================================================================================
    780 // Kernel Quiescing
    781 //=============================================================================================
    782 static void __halt(processor * this) with( *this ) {
    783         // verify( ! __atomic_load_n(&do_terminate, __ATOMIC_SEQ_CST) );
    784 
    785         with( *cltr ) {
    786                 lock      (proc_list_lock __cfaabi_dbg_ctx2);
    787                 remove    (procs, *this);
    788                 push_front(idles, *this);
    789                 unlock    (proc_list_lock);
    790         }
    791 
    792         __cfaabi_dbg_print_safe("Kernel : Processor %p ready to sleep\n", this);
    793 
    794         wait( idleLock );
    795 
    796         __cfaabi_dbg_print_safe("Kernel : Processor %p woke up and ready to run\n", this);
    797 
    798         with( *cltr ) {
    799                 lock      (proc_list_lock __cfaabi_dbg_ctx2);
    800                 remove    (idles, *this);
    801                 push_front(procs, *this);
    802                 unlock    (proc_list_lock);
     572}
     573
     574static void push  (__cluster_idles & this, processor & proc) {
     575        /* paranoid */ verify( ! __preemption_enabled() );
     576        lock( this );
     577                this.idle++;
     578                /* paranoid */ verify( this.idle <= this.total );
     579
     580                insert_first(this.list, proc);
     581        unlock( this );
     582        /* paranoid */ verify( ! __preemption_enabled() );
     583}
     584
     585static void remove(__cluster_idles & this, processor & proc) {
     586        /* paranoid */ verify( ! __preemption_enabled() );
     587        lock( this );
     588                this.idle--;
     589                /* paranoid */ verify( this.idle >= 0 );
     590
     591                remove(proc);
     592        unlock( this );
     593        /* paranoid */ verify( ! __preemption_enabled() );
     594}
     595
     596static [unsigned idle, unsigned total, * processor] query( & __cluster_idles this ) {
     597        for() {
     598                uint64_t l = __atomic_load_n(&this.lock, __ATOMIC_SEQ_CST);
     599                if( 1 == (l % 2) ) { Pause(); continue; }
     600                unsigned idle    = this.idle;
     601                unsigned total   = this.total;
     602                processor * proc = &this.list`first;
     603                // Compiler fence is unnecessary, but gcc-8 and older incorrectly reorder code without it
     604                asm volatile("": : :"memory");
     605                if(l != __atomic_load_n(&this.lock, __ATOMIC_SEQ_CST)) { Pause(); continue; }
     606                return [idle, total, proc];
    803607        }
    804608}
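
query() above is the read side of a seqlock: the lock word is odd while a writer is inside, and a reader retries until it observes the same even value before and after copying the payload. A generic sketch of the reader, assuming a matching writer that increments the counter once on entry and once on exit:

        #include <stdatomic.h>
        #include <stdint.h>

        typedef struct {
                atomic_uint_fast64_t seq;   // even: stable, odd: write in progress
                unsigned idle, total;       // protected payload
        } idles_t;

        static void read_idles( idles_t * this, unsigned * idle, unsigned * total ) {
                for(;;) {
                        uint_fast64_t l = atomic_load( &this->seq );
                        if( l % 2 == 1 ) continue;                      // writer active, retry
                        unsigned i = this->idle;                        // speculative copies
                        unsigned t = this->total;
                        atomic_thread_fence( memory_order_acquire );    // order copies vs. re-check
                        if( l != atomic_load( &this->seq ) ) continue;  // a writer raced us, retry
                        *idle = i; *total = t;                          // copies are consistent
                        return;
                }
        }

The asm volatile memory clobber in the original plays the role of this fence at the compiler level, which is all the accompanying comment claims is needed.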
     
    814618        // the globalAbort flag is true.
    815619        lock( kernel_abort_lock __cfaabi_dbg_ctx2 );
     620
     621        // disable interrupts, it no longer makes sense to try to interrupt this processor
     622        disable_interrupts();
    816623
    817624        // first task to abort ?
     
    831638        }
    832639
    833         return kernelTLS.this_thread;
     640        return __cfaabi_tls.this_thread;
    834641}
    835642
    836643void kernel_abort_msg( void * kernel_data, char * abort_text, int abort_text_size ) {
    837         $thread * thrd = kernel_data;
     644        $thread * thrd = ( $thread * ) kernel_data;
    838645
    839646        if(thrd) {
     
    856663
    857664int kernel_abort_lastframe( void ) __attribute__ ((__nothrow__)) {
    858         return get_coroutine(kernelTLS.this_thread) == get_coroutine(mainThread) ? 4 : 2;
     665        return get_coroutine(kernelTLS().this_thread) == get_coroutine(mainThread) ? 4 : 2;
    859666}
    860667
     
    883690void ^?{}(semaphore & this) {}
    884691
    885 void P(semaphore & this) with( this ){
     692bool P(semaphore & this) with( this ){
    886693        lock( lock __cfaabi_dbg_ctx2 );
    887694        count -= 1;
    888695        if ( count < 0 ) {
    889696                // queue current task
    890                 append( waiting, kernelTLS.this_thread );
     697                append( waiting, active_thread() );
    891698
    892699                // atomically release spin lock and block
    893700                unlock( lock );
    894701                park();
     702                return true;
    895703        }
    896704        else {
    897705            unlock( lock );
    898         }
    899 }
    900 
    901 void V(semaphore & this) with( this ) {
     706            return false;
     707        }
     708}
     709
     710bool V(semaphore & this) with( this ) {
    902711        $thread * thrd = 0p;
    903712        lock( lock __cfaabi_dbg_ctx2 );
     
    912721        // make new owner
    913722        unpark( thrd );
    914 }
    915 
    916 //-----------------------------------------------------------------------------
    917 // Global Queues
    918 void doregister( cluster     & cltr ) {
    919         lock      ( __cfa_dbg_global_clusters.lock __cfaabi_dbg_ctx2);
    920         push_front( __cfa_dbg_global_clusters.list, cltr );
    921         unlock    ( __cfa_dbg_global_clusters.lock );
    922 }
    923 
    924 void unregister( cluster     & cltr ) {
    925         lock  ( __cfa_dbg_global_clusters.lock __cfaabi_dbg_ctx2);
    926         remove( __cfa_dbg_global_clusters.list, cltr );
    927         unlock( __cfa_dbg_global_clusters.lock );
    928 }
    929 
    930 void doregister( cluster * cltr, $thread & thrd ) {
    931         lock      (cltr->thread_list_lock __cfaabi_dbg_ctx2);
    932         cltr->nthreads += 1;
    933         push_front(cltr->threads, thrd);
    934         unlock    (cltr->thread_list_lock);
    935 }
    936 
    937 void unregister( cluster * cltr, $thread & thrd ) {
    938         lock  (cltr->thread_list_lock __cfaabi_dbg_ctx2);
    939         remove(cltr->threads, thrd );
    940         cltr->nthreads -= 1;
    941         unlock(cltr->thread_list_lock);
    942 }
    943 
    944 void doregister( cluster * cltr, processor * proc ) {
    945         lock      (cltr->proc_list_lock __cfaabi_dbg_ctx2);
    946         cltr->nprocessors += 1;
    947         push_front(cltr->procs, *proc);
    948         unlock    (cltr->proc_list_lock);
    949 }
    950 
    951 void unregister( cluster * cltr, processor * proc ) {
    952         lock  (cltr->proc_list_lock __cfaabi_dbg_ctx2);
    953         remove(cltr->procs, *proc );
    954         cltr->nprocessors -= 1;
    955         unlock(cltr->proc_list_lock);
     723
     724        return thrd != 0p;
     725}
     726
     727bool V(semaphore & this, unsigned diff) with( this ) {
     728        $thread * thrd = 0p;
     729        lock( lock __cfaabi_dbg_ctx2 );
      730        int release = min(-count, (int)diff);
     731        count += diff;
     732        for(release) {
      733                unpark( thrd = pop_head( waiting ) );
     734        }
     735
     736        unlock( lock );
     737
     738        return thrd != 0p;
    956739}
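
For comparison, the counted V above written as a stand-alone C semaphore over a mutex and condition variable; a sketch only, assuming the same "negative count = number of waiters" convention the CFA version uses:

        #include <pthread.h>
        #include <stdbool.h>

        typedef struct {
                pthread_mutex_t lock;
                pthread_cond_t  cond;
                int count;                  // < 0 : -count threads are waiting
        } csem_t;

        static bool V_n( csem_t * this, unsigned diff ) {
                pthread_mutex_lock( &this->lock );
                int waiting = this->count < 0 ? -this->count : 0;
                int release = waiting < (int)diff ? waiting : (int)diff;  // min(waiters, diff)
                this->count += (int)diff;
                for( int i = 0; i < release; i += 1 )
                        pthread_cond_signal( &this->cond );   // one waiter per granted permit
                pthread_mutex_unlock( &this->lock );
                return release > 0;                           // did anyone actually wake?
        }

The min is the important part: granting diff permits can wake at most diff waiters, and never more than are actually queued.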
    957740
     
    960743__cfaabi_dbg_debug_do(
    961744        extern "C" {
    962                 void __cfaabi_dbg_record(__spinlock_t & this, const char prev_name[]) {
     745                void __cfaabi_dbg_record_lock(__spinlock_t & this, const char prev_name[]) {
    963746                        this.prev_name = prev_name;
    964                         this.prev_thrd = kernelTLS.this_thread;
     747                        this.prev_thrd = kernelTLS().this_thread;
    965748                }
    966749        }
     
    972755        return true;
    973756}
     757
     758//-----------------------------------------------------------------------------
     759// Statistics
     760#if !defined(__CFA_NO_STATISTICS__)
     761        void print_halts( processor & this ) {
     762                this.print_halts = true;
     763        }
     764
     765        void print_stats_now( cluster & this, int flags ) {
     766                __print_stats( this.stats, this.print_stats, "Cluster", this.name, (void*)&this );
     767        }
     768
     769        extern int __print_alarm_stats;
     770        void print_alarm_stats() {
     771                __print_alarm_stats = -1;
     772        }
     773#endif
    974774// Local Variables: //
    975775// mode: c //
  • libcfa/src/concurrency/kernel.hfa

    r3c64c668 r58fe85a  
    1616#pragma once
    1717
    18 #include <stdbool.h>
    19 
    2018#include "invoke.h"
    2119#include "time_t.hfa"
    2220#include "coroutine.hfa"
    2321
     22#include "containers/list.hfa"
     23
    2424extern "C" {
    25 #include <pthread.h>
    26 #include <semaphore.h>
     25        #include <bits/pthreadtypes.h>
     26        #include <linux/types.h>
    2727}
    2828
     
    3737void  ?{}(semaphore & this, int count = 1);
    3838void ^?{}(semaphore & this);
    39 void   P (semaphore & this);
    40 void   V (semaphore & this);
     39bool   P (semaphore & this);
     40bool   V (semaphore & this);
     41bool   V (semaphore & this, unsigned count);
    4142
    4243
     
    4546extern struct cluster * mainCluster;
    4647
    47 // Processor
     48// Processor id, required for scheduling threads
     49struct __processor_id_t {
     50        unsigned id:24;
     51        bool full_proc:1;
     52
     53        #if !defined(__CFA_NO_STATISTICS__)
     54                struct __stats_t * stats;
     55        #endif
     56};
     57
    4858coroutine processorCtx_t {
    4959        struct processor * proc;
     
    5161
    5262// Wrapper around kernel threads
    53 struct processor {
     63struct __attribute__((aligned(128))) processor {
    5464        // Main state
     65        inline __processor_id_t;
     66
     67        // Cluster from which to get threads
     68        struct cluster * cltr;
     69
      70        // Set to true to notify the processor that it should terminate
     71        volatile bool do_terminate;
     72
     5674        // Coroutine ctx that keeps the state of the processor
    5674        struct processorCtx_t runner;
    5775
    58         // Cluster from which to get threads
    59         struct cluster * cltr;
    60 
    6176        // Name of the processor
    6277        const char * name;
     
    6479        // Handle to pthreads
    6580        pthread_t kernel_thread;
    66 
    67         // RunThread data
     68         // Action to do after a thread is run
    69         $thread * destroyer;
    7081
    7182        // Preemption data
     
    7687        bool pending_preemption;
    7788
    78         // Idle lock
    79         __bin_sem_t idleLock;
    80 
    81         // Termination
    82         // Set to true to notify the processor should terminate
    83         volatile bool do_terminate;
    84 
    85         // Termination synchronisation
     89        // Idle lock (kernel semaphore)
     90        __bin_sem_t idle;
     91
     92        // Termination synchronisation (user semaphore)
    8693        semaphore terminated;
    8794
     
    9097
    9198        // Link lists fields
    92         struct __dbg_node_proc {
    93                 struct processor * next;
    94                 struct processor * prev;
    95         } node;
     99        DLISTED_MGD_IMPL_IN(processor)
     100
     101        #if !defined(__CFA_NO_STATISTICS__)
     102                int print_stats;
     103                bool print_halts;
     104        #endif
    96105
    97106#ifdef __CFA_DEBUG__
     
    108117static inline void  ?{}(processor & this, const char name[]) { this{name, *mainCluster }; }
    109118
    110 static inline [processor *&, processor *& ] __get( processor & this ) __attribute__((const)) { return this.node.[next, prev]; }
     119DLISTED_MGD_IMPL_OUT(processor)
     120
     121//-----------------------------------------------------------------------------
     122// I/O
     123struct __io_data;
     124
     125// IO poller user-thread
     126// Not using the "thread" keyword because we want to control
     127// more carefully when to start/stop it
     128struct $io_ctx_thread {
     129        struct __io_data * ring;
     130        single_sem sem;
     131        volatile bool done;
     132        $thread self;
     133};
     134
     135
     136struct io_context {
     137        $io_ctx_thread thrd;
     138};
     139
     140struct io_context_params {
     141        int num_entries;
     142        int num_ready;
     143        int submit_aff;
     144        bool eager_submits:1;
     145        bool poller_submits:1;
     146        bool poll_submit:1;
     147        bool poll_complete:1;
     148};
     149
     150void  ?{}(io_context_params & this);
     151
     152void  ?{}(io_context & this, struct cluster & cl);
     153void  ?{}(io_context & this, struct cluster & cl, const io_context_params & params);
     154void ^?{}(io_context & this);
     155
     156struct io_cancellation {
     157        __u64 target;
     158};
     159
     160static inline void  ?{}(io_cancellation & this) { this.target = -1u; }
     161static inline void ^?{}(io_cancellation &) {}
     162bool cancel(io_cancellation & this);
     163
     164//-----------------------------------------------------------------------------
     165// Cluster Tools
     166
     167// Intrusives lanes which are used by the relaxed ready queue
     168struct __attribute__((aligned(128))) __intrusive_lane_t;
     169void  ?{}(__intrusive_lane_t & this);
     170void ^?{}(__intrusive_lane_t & this);
     171
      172// Counter used for whether or not the lanes are all empty (SNZI: scalable non-zero indicator)
     173struct __attribute__((aligned(128))) __snzi_node_t;
     174struct __snzi_t {
     175        unsigned mask;
     176        int root;
     177        __snzi_node_t * nodes;
     178};
     179
     180void  ?{}( __snzi_t & this, unsigned depth );
     181void ^?{}( __snzi_t & this );
     182
      183// TODO: adjust cache size to ARCHITECTURE
     184// Structure holding the relaxed ready queue
     185struct __ready_queue_t {
     186        // Data tracking how many/which lanes are used
     187        // Aligned to 128 for cache locality
     188        __snzi_t snzi;
     189
     190        // Data tracking the actual lanes
      191        // On a separate cacheline from the used struct since
     192        // used can change on each push/pop but this data
     193        // only changes on shrink/grow
     194        struct {
      195                // Array of lanes
     196                __intrusive_lane_t * volatile data;
     197
     198                // Number of lanes (empty or not)
     199                volatile size_t count;
     200        } lanes;
     201};
     202
     203void  ?{}(__ready_queue_t & this);
     204void ^?{}(__ready_queue_t & this);
     205
     206// Idle Sleep
     207struct __cluster_idles {
     208        // Spin lock protecting the queue
     209        volatile uint64_t lock;
     210
     211        // Total number of processors
     212        unsigned total;
     213
     214        // Total number of idle processors
     215        unsigned idle;
     216
     217        // List of idle processors
     218        dlist(processor, processor) list;
     219};
    111220
    112221//-----------------------------------------------------------------------------
    113222// Cluster
    114 struct cluster {
    115         // Ready queue locks
    116         __spinlock_t ready_queue_lock;
    117 
     223struct __attribute__((aligned(128))) cluster {
    118224        // Ready queue for threads
    119         __queue_t($thread) ready_queue;
     225        __ready_queue_t ready_queue;
    120226
    121227        // Name of the cluster
     
    125231        Duration preemption_rate;
    126232
    127         // List of processors
    128         __spinlock_t proc_list_lock;
    129         __dllist_t(struct processor) procs;
    130         __dllist_t(struct processor) idles;
    131         unsigned int nprocessors;
     233        // List of idle processors
     234        __cluster_idles idles;
    132235
    133236        // List of threads
     
    141244                cluster * prev;
    142245        } node;
     246
     247        struct {
     248                io_context * ctxs;
     249                unsigned cnt;
     250        } io;
     251
     252        #if !defined(__CFA_NO_STATISTICS__)
     253                struct __stats_t * stats;
     254                int print_stats;
     255        #endif
    143256};
    144257extern Duration default_preemption();
    145258
    146 void ?{} (cluster & this, const char name[], Duration preemption_rate);
     259void ?{} (cluster & this, const char name[], Duration preemption_rate, unsigned num_io, const io_context_params & io_params);
    147260void ^?{}(cluster & this);
    148261
    149 static inline void ?{} (cluster & this)                           { this{"Anonymous Cluster", default_preemption()}; }
    150 static inline void ?{} (cluster & this, Duration preemption_rate) { this{"Anonymous Cluster", preemption_rate}; }
    151 static inline void ?{} (cluster & this, const char name[])        { this{name, default_preemption()}; }
     262static inline void ?{} (cluster & this)                                            { io_context_params default_params;    this{"Anonymous Cluster", default_preemption(), 1, default_params}; }
     263static inline void ?{} (cluster & this, Duration preemption_rate)                  { io_context_params default_params;    this{"Anonymous Cluster", preemption_rate, 1, default_params}; }
     264static inline void ?{} (cluster & this, const char name[])                         { io_context_params default_params;    this{name, default_preemption(), 1, default_params}; }
     265static inline void ?{} (cluster & this, unsigned num_io)                           { io_context_params default_params;    this{"Anonymous Cluster", default_preemption(), num_io, default_params}; }
     266static inline void ?{} (cluster & this, Duration preemption_rate, unsigned num_io) { io_context_params default_params;    this{"Anonymous Cluster", preemption_rate, num_io, default_params}; }
     267static inline void ?{} (cluster & this, const char name[], unsigned num_io)        { io_context_params default_params;    this{name, default_preemption(), num_io, default_params}; }
     268static inline void ?{} (cluster & this, const io_context_params & io_params)                                            { this{"Anonymous Cluster", default_preemption(), 1, io_params}; }
     269static inline void ?{} (cluster & this, Duration preemption_rate, const io_context_params & io_params)                  { this{"Anonymous Cluster", preemption_rate, 1, io_params}; }
     270static inline void ?{} (cluster & this, const char name[], const io_context_params & io_params)                         { this{name, default_preemption(), 1, io_params}; }
     271static inline void ?{} (cluster & this, unsigned num_io, const io_context_params & io_params)                           { this{"Anonymous Cluster", default_preemption(), num_io, io_params}; }
     272static inline void ?{} (cluster & this, Duration preemption_rate, unsigned num_io, const io_context_params & io_params) { this{"Anonymous Cluster", preemption_rate, num_io, io_params}; }
     273static inline void ?{} (cluster & this, const char name[], unsigned num_io, const io_context_params & io_params)        { this{name, default_preemption(), num_io, io_params}; }
    152274
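All of these forwarding constructors funnel into the four-argument form above; a hedged construction sketch (the cluster name and field values are illustrative, only the identifiers come from this header):

	io_context_params params;                  // defaulted by ?{}(io_context_params &)
	params.num_entries = 256;                  // illustrative ring size
	cluster cl = { "MyCluster", 2, params };   // name, num_io, io parameters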
    153275static inline [cluster *&, cluster *& ] __get( cluster & this ) __attribute__((const)) { return this.node.[next, prev]; }
    154276
    155 static inline struct processor * active_processor() { return TL_GET( this_processor ); } // UNSAFE
    156 static inline struct cluster   * active_cluster  () { return TL_GET( this_processor )->cltr; }
     277static inline struct processor * active_processor() { return publicTLS_get( this_processor ); } // UNSAFE
     278static inline struct cluster   * active_cluster  () { return publicTLS_get( this_processor )->cltr; }
     279
     280#if !defined(__CFA_NO_STATISTICS__)
     281        void print_stats_now( cluster & this, int flags );
     282
     283        static inline void print_stats_at_exit( cluster & this, int flags ) {
     284                this.print_stats |= flags;
     285        }
     286
     287        static inline void print_stats_at_exit( processor & this, int flags ) {
     288                this.print_stats |= flags;
     289        }
     290
     291        void print_halts( processor & this );
     292#endif
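A hedged sketch of these statistics hooks (the flag value and cluster name are illustrative):

	cluster cl = { "StatsDemo" };
	print_stats_at_exit( cl, 1 );   // accumulate and print when cl is destroyed
	print_stats_now( cl, 1 );       // print a snapshot immediately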
    157293
    158294// Local Variables: //
  • libcfa/src/concurrency/kernel_private.hfa

    r3c64c668 r58fe85a  
    1010// Created On       : Mon Feb 13 12:27:26 2017
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Sat Nov 30 19:25:02 2019
    13 // Update Count     : 8
     12// Last Modified On : Wed Aug 12 08:21:33 2020
     13// Update Count     : 9
    1414//
    1515
     
    2020
    2121#include "alarm.hfa"
    22 
     22#include "stats.hfa"
    2323
    2424//-----------------------------------------------------------------------------
    2525// Scheduler
     26
     27struct __attribute__((aligned(128))) __scheduler_lock_id_t;
    2628
    2729extern "C" {
     
    3133}
    3234
    33 void __schedule_thread( $thread * ) __attribute__((nonnull (1)));
    34 
    35 //Block current thread and release/wake-up the following resources
    36 void __leave_thread() __attribute__((noreturn));
     35void __schedule_thread( $thread * )
     36#if defined(NDEBUG) || (!defined(__CFA_DEBUG__) && !defined(__CFA_VERIFY__))
     37        __attribute__((nonnull (1)))
     38#endif
     39;
     40
     41extern bool __preemption_enabled();
     42
     43//release/wake-up the following resources
     44void __thread_finish( $thread * thrd );
    3745
    3846//-----------------------------------------------------------------------------
     
    4149
    4250void * __create_pthread( pthread_t *, void * (*)(void *), void * );
    43 
    44 static inline void wake_fast(processor * this) {
    45         __cfaabi_dbg_print_safe("Kernel : Waking up processor %p\n", this);
    46         post( this->idleLock );
    47 }
    48 
    49 static inline void wake(processor * this) {
    50         disable_interrupts();
    51         wake_fast(this);
    52         enable_interrupts( __cfaabi_dbg_ctx );
    53 }
    54 
    55 struct event_kernel_t {
    56         alarm_list_t alarms;
    57         __spinlock_t lock;
    58 };
    59 
    60 extern event_kernel_t * event_kernel;
    61 
    62 struct __cfa_kernel_preemption_state_t {
    63         bool enabled;
    64         bool in_progress;
    65         unsigned short disable_count;
    66 };
    67 
    68 extern volatile thread_local __cfa_kernel_preemption_state_t preemption_state __attribute__ ((tls_model ( "initial-exec" )));
     51void __destroy_pthread( pthread_t pthread, void * stack, void ** retval );
     52
     53
     54
     55extern cluster * mainCluster;
    6956
    7057//-----------------------------------------------------------------------------
     
    7966)
    8067
     68#define TICKET_BLOCKED (-1) // thread is blocked
     69#define TICKET_RUNNING ( 0) // thread is running
     70#define TICKET_UNBLOCK ( 1) // thread should ignore next block
     71
    8172//-----------------------------------------------------------------------------
    8273// Utils
    83 #define KERNEL_STORAGE(T,X) static char storage_##X[sizeof(T)]
    84 
    85 static inline uint32_t __tls_rand() {
    86         kernelTLS.rand_seed ^= kernelTLS.rand_seed << 6;
    87         kernelTLS.rand_seed ^= kernelTLS.rand_seed >> 21;
    88         kernelTLS.rand_seed ^= kernelTLS.rand_seed << 7;
    89         return kernelTLS.rand_seed;
    90 }
    91 
    92 
    93 void doregister( struct cluster & cltr );
    94 void unregister( struct cluster & cltr );
    95 
    9674void doregister( struct cluster * cltr, struct $thread & thrd );
    9775void unregister( struct cluster * cltr, struct $thread & thrd );
    9876
    99 void doregister( struct cluster * cltr, struct processor * proc );
    100 void unregister( struct cluster * cltr, struct processor * proc );
     77//-----------------------------------------------------------------------------
     78// I/O
     79void ^?{}(io_context & this, bool );
     80
     81//=======================================================================
     82// Cluster lock API
     83//=======================================================================
      84// Cells used by the reader-writer lock
      85// while not generic, it relies only on an opaque pointer
     86struct __attribute__((aligned(128))) __scheduler_lock_id_t {
     87        // Spin lock used as the underlying lock
     88        volatile bool lock;
     89
     90        // Handle pointing to the proc owning this cell
     91        // Used for allocating cells and debugging
     92        __processor_id_t * volatile handle;
     93
     94        #ifdef __CFA_WITH_VERIFY__
     95                // Debug, check if this is owned for reading
     96                bool owned;
     97        #endif
     98};
     99
     100static_assert( sizeof(struct __scheduler_lock_id_t) <= __alignof(struct __scheduler_lock_id_t));
     101
     102// Lock-Free registering/unregistering of threads
     103// Register a processor to a given cluster and get its unique id in return
     104unsigned doregister( struct __processor_id_t * proc );
     105
     106// Unregister a processor from a given cluster using its id, getting back the original pointer
     107void     unregister( struct __processor_id_t * proc );
     108
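A hedged sketch of the registration lifecycle (the wrapper functions are hypothetical; the cast works because processor inlines __processor_id_t):

	void register_proc( processor & this ) {
		this.id = doregister( (__processor_id_t *)&this );   // unique slot id
	}
	void deregister_proc( processor & this ) {
		unregister( (__processor_id_t *)&this );             // release the slot
	}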
     109//-----------------------------------------------------------------------
     110// Cluster idle lock/unlock
     111static inline void lock(__cluster_idles & this) {
     112        for() {
     113                uint64_t l = this.lock;
     114                if(
     115                        (0 == (l % 2))
     116                        && __atomic_compare_exchange_n(&this.lock, &l, l + 1, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
     117                ) return;
     118                Pause();
     119        }
     120}
     121
     122static inline void unlock(__cluster_idles & this) {
     123        /* paranoid */ verify( 1 == (this.lock % 2) );
     124        __atomic_fetch_add( &this.lock, 1, __ATOMIC_SEQ_CST );
     125}
     126
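The idle lock encodes its state in the counter's parity: an even value is unlocked and an odd value is locked, so release is a single atomic increment. A hedged usage sketch around the idle list (the variable names and the list operation are illustrative):

	lock( cltr->idles );                      // spins until even -> odd succeeds
	// ... push or pop an idle processor on cltr->idles.list ...
	unlock( cltr->idles );                    // odd -> even, one atomic add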
     127//=======================================================================
     128// Reader-writer lock implementation
     129// Concurrent with doregister/unregister,
     130//    i.e., threads can be added at any point during or between the entry/exit
     131
     132//-----------------------------------------------------------------------
     133// simple spinlock underlying the RWLock
     134// Blocking acquire
     135static inline void __atomic_acquire(volatile bool * ll) {
     136        while( __builtin_expect(__atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST), false) ) {
     137                while(__atomic_load_n(ll, (int)__ATOMIC_RELAXED))
     138                        Pause();
     139        }
     140        /* paranoid */ verify(*ll);
     141}
     142
     143// Non-Blocking acquire
     144static inline bool __atomic_try_acquire(volatile bool * ll) {
     145        return !__atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST);
     146}
     147
     148// Release
     149static inline void __atomic_unlock(volatile bool * ll) {
     150        /* paranoid */ verify(*ll);
     151        __atomic_store_n(ll, (bool)false, __ATOMIC_RELEASE);
     152}
     153
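These three helpers form a test-and-test-and-set spinlock over a single bool; a minimal usage sketch (the flag is illustrative):

	static volatile bool flag = false;
	__atomic_acquire( &flag );              // blocking: spin-read, then exchange
	// ... critical section ...
	__atomic_unlock( &flag );
	if( __atomic_try_acquire( &flag ) ) {   // non-blocking variant
		__atomic_unlock( &flag );
	}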
     154//-----------------------------------------------------------------------
     155// Reader-Writer lock protecting the ready-queues
      156// while this lock is mostly generic, some aspects
      157// have been hard-coded for the ready-queue, for
      158// simplicity and performance
     159struct __scheduler_RWLock_t {
     160        // total cachelines allocated
     161        unsigned int max;
     162
     163        // cachelines currently in use
     164        volatile unsigned int alloc;
     165
      166        // cachelines ready to iterate over
     167        // (!= to alloc when thread is in second half of doregister)
     168        volatile unsigned int ready;
     169
     170        // writer lock
     171        volatile bool lock;
     172
     173        // data pointer
     174        __scheduler_lock_id_t * data;
     175};
     176
     177void  ?{}(__scheduler_RWLock_t & this);
     178void ^?{}(__scheduler_RWLock_t & this);
     179
     180extern __scheduler_RWLock_t * __scheduler_lock;
     181
     182//-----------------------------------------------------------------------
     183// Reader side : acquire when using the ready queue to schedule but not
     184//  creating/destroying queues
     185static inline void ready_schedule_lock(void) with(*__scheduler_lock) {
     186        /* paranoid */ verify( ! __preemption_enabled() );
     187        /* paranoid */ verify( kernelTLS().this_proc_id );
     188
     189        unsigned iproc = kernelTLS().this_proc_id->id;
     190        /*paranoid*/ verify(data[iproc].handle == kernelTLS().this_proc_id);
     191        /*paranoid*/ verify(iproc < ready);
     192
     193        // Step 1 : make sure no writer are in the middle of the critical section
     194        while(__atomic_load_n(&lock, (int)__ATOMIC_RELAXED))
     195                Pause();
     196
     197        // Fence needed because we don't want to start trying to acquire the lock
     198        // before we read a false.
     199        // Not needed on x86
     200        // std::atomic_thread_fence(std::memory_order_seq_cst);
     201
     202        // Step 2 : acquire our local lock
     203        __atomic_acquire( &data[iproc].lock );
     204        /*paranoid*/ verify(data[iproc].lock);
     205
     206        #ifdef __CFA_WITH_VERIFY__
     207                // Debug, check if this is owned for reading
     208                data[iproc].owned = true;
     209        #endif
     210}
     211
     212static inline void ready_schedule_unlock(void) with(*__scheduler_lock) {
     213        /* paranoid */ verify( ! __preemption_enabled() );
     214        /* paranoid */ verify( kernelTLS().this_proc_id );
     215
     216        unsigned iproc = kernelTLS().this_proc_id->id;
     217        /*paranoid*/ verify(data[iproc].handle == kernelTLS().this_proc_id);
     218        /*paranoid*/ verify(iproc < ready);
     219        /*paranoid*/ verify(data[iproc].lock);
     220        /*paranoid*/ verify(data[iproc].owned);
     221        #ifdef __CFA_WITH_VERIFY__
     222                // Debug, check if this is owned for reading
     223                data[iproc].owned = false;
     224        #endif
     225        __atomic_unlock(&data[iproc].lock);
     226}
     227
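Reader-side acquisition must sit inside a non-preemptible region, as the paranoid verifies insist; a hedged sketch of the expected bracketing (the queue operation is illustrative):

	disable_interrupts();
	ready_schedule_lock();
	push( cltr, thrd );             // any scheduling-path ready-queue operation
	ready_schedule_unlock();
	enable_interrupts( __cfaabi_dbg_ctx );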
     228#ifdef __CFA_WITH_VERIFY__
     229        static inline bool ready_schedule_islocked(void) {
     230                /* paranoid */ verify( ! __preemption_enabled() );
     231                /*paranoid*/ verify( kernelTLS().this_proc_id );
     232                __processor_id_t * proc = kernelTLS().this_proc_id;
     233                return __scheduler_lock->data[proc->id].owned;
     234        }
     235
     236        static inline bool ready_mutate_islocked() {
     237                return __scheduler_lock->lock;
     238        }
     239#endif
     240
     241//-----------------------------------------------------------------------
     242// Writer side : acquire when changing the ready queue, e.g. adding more
     243//  queues or removing them.
     244uint_fast32_t ready_mutate_lock( void );
     245
     246void ready_mutate_unlock( uint_fast32_t /* value returned by lock */ );
     247
     248//=======================================================================
     249// Ready-Queue API
     250//-----------------------------------------------------------------------
      251// query the ready queue of a cluster
      252// returns true if it contains threads, false otherwise
     253__attribute__((hot)) bool query(struct cluster * cltr);
     254
     255//-----------------------------------------------------------------------
     256// push thread onto a ready queue for a cluster
     257// returns true if the list was previously empty, false otherwise
     258__attribute__((hot)) bool push(struct cluster * cltr, struct $thread * thrd);
     259
     260//-----------------------------------------------------------------------
     261// pop thread from the ready queue of a cluster
     262// returns 0p if empty
     263// May return 0p spuriously
     264__attribute__((hot)) struct $thread * pop(struct cluster * cltr);
     265
     266//-----------------------------------------------------------------------
     267// pop thread from the ready queue of a cluster
     268// returns 0p if empty
     269// guaranteed to find any threads added before this call
     270__attribute__((hot)) struct $thread * pop_slow(struct cluster * cltr);
     271
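A hedged sketch of the push/pop contract, showing how a caller copes with pop's permitted spurious failures:

	if( push( cltr, thrd ) ) {
		// queue was previously empty: an idle processor may need waking
	}
	$thread * t = pop( cltr );              // fast path, may fail spuriously
	if( 0p == t ) t = pop_slow( cltr );     // slow path, finds earlier pushes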
     272//-----------------------------------------------------------------------
     273// remove thread from the ready queue of a cluster
      274// returns whether the thread was found and removed
     275bool remove_head(struct cluster * cltr, struct $thread * thrd);
     276
     277//-----------------------------------------------------------------------
     278// Increase the width of the ready queue (number of lanes) by 4
     279void ready_queue_grow  (struct cluster * cltr, int target);
     280
     281//-----------------------------------------------------------------------
     282// Decrease the width of the ready queue (number of lanes) by 4
     283void ready_queue_shrink(struct cluster * cltr, int target);
     284
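Resizing is a structural change, so it presumably runs under the writer side of the scheduler RWLock declared above; a hedged sketch (the target value is illustrative):

	uint_fast32_t last = ready_mutate_lock();   // exclude all readers
	ready_queue_grow( cltr, 4 );                // e.g. four more lanes
	ready_mutate_unlock( last );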
    101285
    102286// Local Variables: //
  • libcfa/src/concurrency/monitor.cfa

    r3c64c668 r58fe85a  
    8282// Enter single monitor
    8383static void __enter( $monitor * this, const __monitor_group_t & group ) {
     84        $thread * thrd = active_thread();
     85
    8486        // Lock the monitor spinlock
    8587        lock( this->lock __cfaabi_dbg_ctx2 );
    86         // Interrupts disable inside critical section
    87         $thread * thrd = kernelTLS.this_thread;
    8888
    8989        __cfaabi_dbg_print_safe( "Kernel : %10p Entering mon %p (%p)\n", thrd, this, this->owner);
    9090
    91         if( !this->owner ) {
     91        if( unlikely(0 != (0x1 & (uintptr_t)this->owner)) ) {
     92                abort( "Attempt by thread \"%.256s\" (%p) to access joined monitor %p.", thrd->self_cor.name, thrd, this );
     93        }
     94        else if( !this->owner ) {
    9295                // No one has the monitor, just take it
    9396                __set_owner( this, thrd );
     
    114117
    115118                // Some one else has the monitor, wait in line for it
    116                 /* paranoid */ verify( thrd->next == 0p );
     119                /* paranoid */ verify( thrd->link.next == 0p );
    117120                append( this->entry_queue, thrd );
    118                 /* paranoid */ verify( thrd->next == 1p );
     121                /* paranoid */ verify( thrd->link.next == 1p );
    119122
    120123                unlock( this->lock );
     
    123126                __cfaabi_dbg_print_safe( "Kernel : %10p Entered  mon %p\n", thrd, this);
    124127
    125                 /* paranoid */ verifyf( kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
     128                /* paranoid */ verifyf( active_thread() == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this );
    126129                return;
    127130        }
     
    129132        __cfaabi_dbg_print_safe( "Kernel : %10p Entered  mon %p\n", thrd, this);
    130133
    131         /* paranoid */ verifyf( kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
     134        /* paranoid */ verifyf( active_thread() == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this );
    132135        /* paranoid */ verify( this->lock.lock );
    133136
     
    137140}
    138141
    139 static void __dtor_enter( $monitor * this, fptr_t func ) {
     142static void __dtor_enter( $monitor * this, fptr_t func, bool join ) {
     143        $thread * thrd = active_thread();
     144        #if defined( __CFA_WITH_VERIFY__ )
     145                bool is_thrd = this == &thrd->self_mon;
     146        #endif
     147
    140148        // Lock the monitor spinlock
    141149        lock( this->lock __cfaabi_dbg_ctx2 );
    142         // Interrupts disable inside critical section
    143         $thread * thrd = kernelTLS.this_thread;
    144150
    145151        __cfaabi_dbg_print_safe( "Kernel : %10p Entering dtor for mon %p (%p)\n", thrd, this, this->owner);
     
    152158                __set_owner( this, thrd );
    153159
    154                 verifyf( kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
     160                /* paranoid */ verifyf( active_thread() == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this );
     161                /* paranoid */ verify( !is_thrd || thrd->state == Halted || thrd->state == Cancelled );
    155162
    156163                unlock( this->lock );
    157164                return;
    158165        }
    159         else if( this->owner == thrd) {
     166        else if( this->owner == thrd && !join) {
     160167                // We already have the monitor... but we're about to destroy it, so the nesting will fail
    161168                // Abort!
    162169                abort( "Attempt to destroy monitor %p by thread \"%.256s\" (%p) in nested mutex.", this, thrd->self_cor.name, thrd );
    163170        }
      171        // SKULLDUGGERY: join acts as a dtor, so it would normally trigger the above check,
      172        // because join does not release the monitor after it has executed.
      173        // To avoid that, it sets the owner to the special value thrd | 1p before exiting.
     174        else if( this->owner == ($thread*)(1 | (uintptr_t)thrd) ) {
     175                // restore the owner and just return
     176                __cfaabi_dbg_print_safe( "Kernel : Destroying free mon %p\n", this);
     177
     178                // No one has the monitor, just take it
     179                __set_owner( this, thrd );
     180
     181                /* paranoid */ verifyf( active_thread() == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this );
     182                /* paranoid */ verify( !is_thrd || thrd->state == Halted || thrd->state == Cancelled );
     183
     184                unlock( this->lock );
     185                return;
     186        }
     187
      188        // The monitor is busy; if this is a thread and the thread owns itself, it had better be active
     189        /* paranoid */ verify( !is_thrd || this->owner != thrd || (thrd->state != Halted && thrd->state != Cancelled) );
    164190
    165191        __lock_size_t count = 1;
     
    183209
    184210                // Release the next thread
    185                 /* paranoid */ verifyf( urgent->owner->waiting_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
     211                /* paranoid */ verifyf( urgent->owner->waiting_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this );
    186212                unpark( urgent->owner->waiting_thread );
    187213
     
    190216
     191217        // Someone was waiting for us, enter
    192                 /* paranoid */ verifyf( kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
     218                /* paranoid */ verifyf( active_thread() == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this );
     219
     220                __cfaabi_dbg_print_safe( "Kernel : Destroying %p\n", this);
     221                return;
    193222        }
    194223        else {
     
    199228
     200229                // Someone else has the monitor, wait in line for it
    201                 /* paranoid */ verify( thrd->next == 0p );
     230                /* paranoid */ verify( thrd->link.next == 0p );
    202231                append( this->entry_queue, thrd );
    203                 /* paranoid */ verify( thrd->next == 1p );
     232                /* paranoid */ verify( thrd->link.next == 1p );
    204233                unlock( this->lock );
    205234
     
    207236                park();
    208237
    209                 /* paranoid */ verifyf( kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
     238                /* paranoid */ verifyf( active_thread() == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this );
    210239                return;
    211240        }
    212 
    213         __cfaabi_dbg_print_safe( "Kernel : Destroying %p\n", this);
    214 
    215241}
    216242
     
    220246        lock( this->lock __cfaabi_dbg_ctx2 );
    221247
    222         __cfaabi_dbg_print_safe( "Kernel : %10p Leaving mon %p (%p)\n", kernelTLS.this_thread, this, this->owner);
    223 
    224         /* paranoid */ verifyf( kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
     248        __cfaabi_dbg_print_safe( "Kernel : %10p Leaving mon %p (%p)\n", active_thread(), this, this->owner);
     249
     250        /* paranoid */ verifyf( active_thread() == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this );
    225251
    226252        // Leaving a recursion level, decrement the counter
     
    251277
    252278// Leave single monitor for the last time
    253 void __dtor_leave( $monitor * this ) {
     279void __dtor_leave( $monitor * this, bool join ) {
    254280        __cfaabi_dbg_debug_do(
    255                 if( TL_GET( this_thread ) != this->owner ) {
    256                         abort( "Destroyed monitor %p has inconsistent owner, expected %p got %p.\n", this, TL_GET( this_thread ), this->owner);
     281                if( active_thread() != this->owner ) {
     282                        abort( "Destroyed monitor %p has inconsistent owner, expected %p got %p.\n", this, active_thread(), this->owner);
    257283                }
    258                 if( this->recursion != 1 ) {
     284                if( this->recursion != 1  && !join ) {
    259285                        abort( "Destroyed monitor %p has %d outstanding nested calls.\n", this, this->recursion - 1);
    260286                }
    261287        )
    262 }
    263 
    264 extern "C" {
    265         // Leave the thread monitor
    266         // last routine called by a thread.
    267         // Should never return
    268         void __cfactx_thrd_leave() {
    269                 $thread * thrd = TL_GET( this_thread );
    270                 $monitor * this = &thrd->self_mon;
    271 
    272                 // Lock the monitor now
    273                 lock( this->lock __cfaabi_dbg_ctx2 );
    274 
    275                 disable_interrupts();
    276 
    277                 thrd->state = Halted;
    278 
    279                 /* paranoid */ verifyf( thrd == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", thrd, this->owner, this->recursion, this );
    280 
    281                 // Leaving a recursion level, decrement the counter
    282                 this->recursion -= 1;
    283 
    284                 // If we haven't left the last level of recursion
    285                 // it must mean there is an error
    286                 if( this->recursion != 0) { abort( "Thread internal monitor has unbalanced recursion" ); }
    287 
    288                 // Fetch the next thread, can be null
    289                 $thread * new_owner = next_thread( this );
    290 
    291                 // Release the monitor lock
    292                 unlock( this->lock );
    293 
    294                 // Unpark the next owner if needed
    295                 /* paranoid */ verifyf( !new_owner || new_owner == this->owner, "Expected owner to be %p, got %p (m: %p)", new_owner, this->owner, this );
    296                 /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
    297                 /* paranoid */ verify( ! kernelTLS.this_processor->destroyer );
    298                 /* paranoid */ verify( thrd->state == Halted );
    299 
    300                 kernelTLS.this_processor->destroyer = new_owner;
    301 
    302                 // Leave the thread
    303                 __leave_thread();
    304 
    305                 // Control flow should never reach here!
    306         }
     288
     289        this->owner = ($thread*)(1 | (uintptr_t)this->owner);
     290}
     291
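The join/destroy handshake relies on pointer tagging: $thread objects are aligned, so bit 0 of the owner pointer is always free; __dtor_leave sets it to mark the monitor as left-by-a-joiner, and __dtor_enter tests and strips it. A minimal sketch of the tag arithmetic:

	$thread * thrd   = active_thread();
	$thread * tagged = ($thread *)(1 | (uintptr_t)thrd);               // set low bit
	bool      marked = 0 != (0x1 & (uintptr_t)tagged);                 // test low bit
	$thread * plain  = ($thread *)((uintptr_t)tagged & ~(uintptr_t)1); // strip it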
     292void __thread_finish( $thread * thrd ) {
     293        $monitor * this = &thrd->self_mon;
     294
     295        // Lock the monitor now
     296        /* paranoid */ verify( 0x0D15EA5E0D15EA5Ep == thrd->canary );
     297        /* paranoid */ verify( this->lock.lock );
     298        /* paranoid */ verify( thrd->context.SP );
     299        /* paranoid */ verifyf( ((uintptr_t)thrd->context.SP) > ((uintptr_t)__get_stack(thrd->curr_cor)->limit), "ERROR : $thread %p has been corrupted.\n StackPointer too large.\n", thrd );
     300        /* paranoid */ verifyf( ((uintptr_t)thrd->context.SP) < ((uintptr_t)__get_stack(thrd->curr_cor)->base ), "ERROR : $thread %p has been corrupted.\n StackPointer too small.\n", thrd );
     301        /* paranoid */ verify( ! __preemption_enabled() );
     302
     303        /* paranoid */ verifyf( thrd == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", thrd, this->owner, this->recursion, this );
     304        /* paranoid */ verify( thrd->state == Halting );
     305        /* paranoid */ verify( this->recursion == 1 );
     306
     307        // Leaving a recursion level, decrement the counter
     308        this->recursion -= 1;
     309        this->owner = 0p;
     310
     311        // Fetch the next thread, can be null
     312        $thread * new_owner = next_thread( this );
     313
     314        // Mark the state as fully halted
     315        thrd->state = Halted;
     316
     317        // Release the monitor lock
     318        unlock( this->lock );
     319
     320        // Unpark the next owner if needed
     321        /* paranoid */ verifyf( !new_owner || new_owner == this->owner, "Expected owner to be %p, got %p (m: %p)", new_owner, this->owner, this );
     322        /* paranoid */ verify( ! __preemption_enabled() );
     323        /* paranoid */ verify( thrd->state == Halted );
     324        unpark( new_owner );
    307325}
    308326
     
    326344// Sorts monitors before entering
    327345void ?{}( monitor_guard_t & this, $monitor * m [], __lock_size_t count, fptr_t func ) {
    328         $thread * thrd = TL_GET( this_thread );
     346        $thread * thrd = active_thread();
    329347
    330348        // Store current array
     
    361379
    362380        // Restore thread context
    363         TL_GET( this_thread )->monitors = this.prev;
     381        active_thread()->monitors = this.prev;
    364382}
    365383
    366384// Ctor for monitor guard
    367385// Sorts monitors before entering
    368 void ?{}( monitor_dtor_guard_t & this, $monitor * m [], fptr_t func ) {
     386void ?{}( monitor_dtor_guard_t & this, $monitor * m [], fptr_t func, bool join ) {
    369387        // optimization
    370         $thread * thrd = TL_GET( this_thread );
     388        $thread * thrd = active_thread();
    371389
    372390        // Store current array
     
    376394        this.prev = thrd->monitors;
    377395
     396        // Save whether we are in a join or not
     397        this.join = join;
     398
    378399        // Update thread context (needed for conditions)
    379400        (thrd->monitors){m, 1, func};
    380401
    381         __dtor_enter( this.m, func );
     402        __dtor_enter( this.m, func, join );
    382403}
    383404
     
    385406void ^?{}( monitor_dtor_guard_t & this ) {
    386407        // Leave the monitors in order
    387         __dtor_leave( this.m );
     408        __dtor_leave( this.m, this.join );
    388409
    389410        // Restore thread context
    390         TL_GET( this_thread )->monitors = this.prev;
     411        active_thread()->monitors = this.prev;
    391412}
    392413
     
    428449
    429450        // Create the node specific to this wait operation
    430         wait_ctx( TL_GET( this_thread ), user_info );
     451        wait_ctx( active_thread(), user_info );
    431452
    432453        // Append the current wait operation to the ones already queued on the condition
     
    479500        //Some more checking in debug
    480501        __cfaabi_dbg_debug_do(
    481                 $thread * this_thrd = TL_GET( this_thread );
     502                $thread * this_thrd = active_thread();
    482503                if ( this.monitor_count != this_thrd->monitors.size ) {
    483504                        abort( "Signal on condition %p made with different number of monitor(s), expected %zi got %zi", &this, this.monitor_count, this_thrd->monitors.size );
     
    527548
    528549        // Create the node specific to this wait operation
    529         wait_ctx_primed( kernelTLS.this_thread, 0 )
     550        wait_ctx_primed( active_thread(), 0 )
    530551
    531552        //save contexts
     
    534555        //Find the thread to run
    535556        $thread * signallee = pop_head( this.blocked )->waiting_thread;
    536         /* paranoid */ verify( signallee->next == 0p );
    537557        __set_owner( monitors, count, signallee );
    538558
     
    627647
    628648                                // Create the node specific to this wait operation
    629                                 wait_ctx_primed( kernelTLS.this_thread, 0 );
     649                                wait_ctx_primed( active_thread(), 0 );
    630650
    631651                                // Save monitor states
     
    679699
    680700        // Create the node specific to this wait operation
    681         wait_ctx_primed( kernelTLS.this_thread, 0 );
     701        wait_ctx_primed( active_thread(), 0 );
    682702
    683703        monitor_save;
     
    685705
    686706        for( __lock_size_t i = 0; i < count; i++) {
    687                 verify( monitors[i]->owner == kernelTLS.this_thread );
     707                verify( monitors[i]->owner == active_thread() );
    688708        }
    689709
     
    721741static inline void __set_owner( $monitor * monitors [], __lock_size_t count, $thread * owner ) {
    722742        /* paranoid */ verify ( monitors[0]->lock.lock );
    723         /* paranoid */ verifyf( monitors[0]->owner == kernelTLS.this_thread, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, monitors[0]->owner, monitors[0]->recursion, monitors[0] );
     743        /* paranoid */ verifyf( monitors[0]->owner == active_thread(), "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), monitors[0]->owner, monitors[0]->recursion, monitors[0] );
    724744        monitors[0]->owner        = owner;
    725745        monitors[0]->recursion    = 1;
    726746        for( __lock_size_t i = 1; i < count; i++ ) {
    727747                /* paranoid */ verify ( monitors[i]->lock.lock );
    728                 /* paranoid */ verifyf( monitors[i]->owner == kernelTLS.this_thread, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, monitors[i]->owner, monitors[i]->recursion, monitors[i] );
     748                /* paranoid */ verifyf( monitors[i]->owner == active_thread(), "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), monitors[i]->owner, monitors[i]->recursion, monitors[i] );
    729749                monitors[i]->owner        = owner;
    730750                monitors[i]->recursion    = 0;
     
    752772                //regardless of if we are ready to baton pass,
    753773                //we need to set the monitor as in use
    754                 /* paranoid */ verifyf( !this->owner || kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
     774                /* paranoid */ verifyf( !this->owner || active_thread() == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this );
    755775                __set_owner( this,  urgent->owner->waiting_thread );
    756776
     
    761781        // Get the next thread in the entry_queue
    762782        $thread * new_owner = pop_head( this->entry_queue );
    763         /* paranoid */ verifyf( !this->owner || kernelTLS.this_thread == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", kernelTLS.this_thread, this->owner, this->recursion, this );
    764         /* paranoid */ verify( !new_owner || new_owner->next == 0p );
     783        /* paranoid */ verifyf( !this->owner || active_thread() == this->owner, "Expected owner to be %p, got %p (r: %i, m: %p)", active_thread(), this->owner, this->recursion, this );
     784        /* paranoid */ verify( !new_owner || new_owner->link.next == 0p );
    765785        __set_owner( this, new_owner );
    766786
     
    884904        }
    885905
    886         __cfaabi_dbg_print_safe( "Kernel :  Running %i (%p)\n", ready2run, ready2run ? node->waiting_thread : 0p );
      906        __cfaabi_dbg_print_safe( "Kernel :  Running %i (%p)\n", ready2run, ready2run ? (thread*)node->waiting_thread : (thread*)0p );
    887907        return ready2run ? node->waiting_thread : 0p;
    888908}
    889909
    890910static inline void brand_condition( condition & this ) {
    891         $thread * thrd = TL_GET( this_thread );
     911        $thread * thrd = active_thread();
    892912        if( !this.monitors ) {
    893913                // __cfaabi_dbg_print_safe( "Branding\n" );
     
    908928        // For each thread in the entry-queue
    909929        for(    $thread ** thrd_it = &entry_queue.head;
    910                 *thrd_it != 1p;
    911                 thrd_it = &(*thrd_it)->next
     930                (*thrd_it) != 1p;
     931                thrd_it = &(*thrd_it)->link.next
    912932        ) {
    913933                // For each acceptable check if it matches
  • libcfa/src/concurrency/monitor.hfa

    r3c64c668 r58fe85a  
    5353        $monitor *    m;
    5454        __monitor_group_t prev;
     55        bool join;
    5556};
    5657
    57 void ?{}( monitor_dtor_guard_t & this, $monitor ** m, void (*func)() );
     58void ?{}( monitor_dtor_guard_t & this, $monitor ** m, void (*func)(), bool join );
    5859void ^?{}( monitor_dtor_guard_t & this );
    5960
     
    131132
    132133              void wait        ( condition & this, uintptr_t user_info = 0 );
     134static inline bool is_empty    ( condition & this ) { return this.blocked.head == 1p; }
    133135              bool signal      ( condition & this );
    134136              bool signal_block( condition & this );
    135 static inline bool is_empty    ( condition & this ) { return this.blocked.head == 1p; }
     137static inline bool signal_all  ( condition & this ) { bool ret = false; while(!is_empty(this)) { ret = signal(this) || ret; } return ret; }
    136138         uintptr_t front       ( condition & this );
    137139
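A hedged usage sketch of the condition interface inside a monitor (the Buffer monitor and its functions are illustrative):

	monitor Buffer { condition nonEmpty; int count; };
	void take( Buffer & mutex b ) {
		while( b.count == 0 ) wait( b.nonEmpty );
		b.count -= 1;
	}
	void put_all( Buffer & mutex b, int n ) {
		b.count += n;
		signal_all( b.nonEmpty );       // wake every waiter at once
	}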
  • libcfa/src/concurrency/mutex.cfa

    r3c64c668 r58fe85a  
    3030        this.lock{};
    3131        this.blocked_threads{};
     32        this.is_locked = false;
    3233}
    3334
     
    3940        lock( lock __cfaabi_dbg_ctx2 );
    4041        if( is_locked ) {
    41                 append( blocked_threads, kernelTLS.this_thread );
     42                append( blocked_threads, active_thread() );
    4243                unlock( lock );
    4344                park();
     
    8586        lock( lock __cfaabi_dbg_ctx2 );
    8687        if( owner == 0p ) {
    87                 owner = kernelTLS.this_thread;
     88                owner = active_thread();
    8889                recursion_count = 1;
    8990                unlock( lock );
    9091        }
    91         else if( owner == kernelTLS.this_thread ) {
     92        else if( owner == active_thread() ) {
    9293                recursion_count++;
    9394                unlock( lock );
    9495        }
    9596        else {
    96                 append( blocked_threads, kernelTLS.this_thread );
     97                append( blocked_threads, active_thread() );
    9798                unlock( lock );
    9899                park();
     
    104105        lock( lock __cfaabi_dbg_ctx2 );
    105106        if( owner == 0p ) {
    106                 owner = kernelTLS.this_thread;
     107                owner = active_thread();
    107108                recursion_count = 1;
    108109                ret = true;
    109110        }
    110         else if( owner == kernelTLS.this_thread ) {
     111        else if( owner == active_thread() ) {
    111112                recursion_count++;
    112113                ret = true;
     
    158159void wait(condition_variable & this) {
    159160        lock( this.lock __cfaabi_dbg_ctx2 );
    160         append( this.blocked_threads, kernelTLS.this_thread );
     161        append( this.blocked_threads, active_thread() );
    161162        unlock( this.lock );
    162163        park();
     
    166167void wait(condition_variable & this, L & l) {
    167168        lock( this.lock __cfaabi_dbg_ctx2 );
    168         append( this.blocked_threads, kernelTLS.this_thread );
     169        append( this.blocked_threads, active_thread() );
    169170        unlock(l);
    170171        unlock(this.lock);
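A hedged sketch of the recursive-ownership behaviour implemented above (the lock's type name is assumed from this translation unit's header):

	recursive_mutex_lock m;
	lock( m );      // owner = active_thread(), recursion_count = 1
	lock( m );      // same owner: recursion_count = 2, no blocking
	unlock( m );    // recursion_count back to 1, still owned
	unlock( m );    // count reaches 0: a blocked thread, if any, is woken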
  • libcfa/src/concurrency/preemption.cfa

    r3c64c668 r58fe85a  
    1010// Created On       : Mon Jun 5 14:20:42 2017
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Thu Dec  5 16:34:05 2019
    13 // Update Count     : 43
     12// Last Modified On : Fri Nov  6 07:42:13 2020
     13// Update Count     : 54
    1414//
    1515
     
    1919#include <assert.h>
    2020
    21 extern "C" {
    2221#include <errno.h>
    2322#include <stdio.h>
     
    2524#include <unistd.h>
    2625#include <limits.h>                                                                             // PTHREAD_STACK_MIN
    27 }
    2826
    2927#include "bits/signal.hfa"
     28#include "kernel_private.hfa"
    3029
    3130#if !defined(__CFA_DEFAULT_PREEMPTION__)
     
    4342// FwdDeclarations : Signal handlers
    4443static void sigHandler_ctxSwitch( __CFA_SIGPARMS__ );
     44static void sigHandler_alarm    ( __CFA_SIGPARMS__ );
    4545static void sigHandler_segv     ( __CFA_SIGPARMS__ );
    4646static void sigHandler_ill      ( __CFA_SIGPARMS__ );
     
    5656#elif defined( __x86_64 )
    5757#define CFA_REG_IP gregs[REG_RIP]
    58 #elif defined( __ARM_ARCH )
     58#elif defined( __arm__ )
    5959#define CFA_REG_IP arm_pc
     60#elif defined( __aarch64__ )
     61#define CFA_REG_IP pc
    6062#else
    61 #error unknown hardware architecture
     63#error unsupported hardware architecture
    6264#endif
    6365
     
    8385// Get next expired node
    8486static inline alarm_node_t * get_expired( alarm_list_t * alarms, Time currtime ) {
    85         if( !alarms->head ) return 0p;                                          // If no alarms return null
    86         if( alarms->head->alarm >= currtime ) return 0p;        // If alarms head not expired return null
     87        if( ! & (*alarms)`first ) return 0p;                                            // If no alarms return null
     88        if( (*alarms)`first.alarm >= currtime ) return 0p;      // If alarms head not expired return null
    8789        return pop(alarms);                                                                     // Otherwise just pop head
    8890}
    8991
    9092// Tick one frame of the Discrete Event Simulation for alarms
    91 static void tick_preemption() {
     93static void tick_preemption(void) {
    9294        alarm_node_t * node = 0p;                                                       // Used in the while loop but cannot be declared in the while condition
    9395        alarm_list_t * alarms = &event_kernel->alarms;          // Local copy for ease of reading
     
    9799        while( node = get_expired( alarms, currtime ) ) {
    98100                // __cfaabi_dbg_print_buffer_decl( " KERNEL: preemption tick.\n" );
     101                Duration period = node->period;
     102                if( period == 0) {
     103                        node->set = false;                  // Node is one-shot, just mark it as not pending
     104                }
    99105
    100106                // Check if this is a kernel
    101                 if( node->kernel_alarm ) {
     107                if( node->type == Kernel ) {
    102108                        preempt( node->proc );
    103109                }
     110                else if( node->type == User ) {
     111                        timeout( node->thrd );
     112                }
    104113                else {
    105                         timeout( node->thrd );
     114                        node->callback(*node);
    106115                }
    107116
    108117                // Check if this is a periodic alarm
    109                 Duration period = node->period;
    110118                if( period > 0 ) {
    111119                        // __cfaabi_dbg_print_buffer_local( " KERNEL: alarm period is %lu.\n", period.tv );
     
    113121                        insert( alarms, node );             // Reinsert the node for the next time it triggers
    114122                }
    115                 else {
    116                         node->set = false;                  // Node is one-shot, just mark it as not pending
    117                 }
    118123        }
    119124
    120125        // If there are still alarms pending, reset the timer
    121         if( alarms->head ) {
    122                 __cfaabi_dbg_print_buffer_decl( " KERNEL: @%ju(%ju) resetting alarm to %ju.\n", currtime.tv, __kernel_get_time().tv, (alarms->head->alarm - currtime).tv);
    123                 Duration delta = alarms->head->alarm - currtime;
    124                 Duration caped = max(delta, 50`us);
     126        if( & (*alarms)`first ) {
     127                __cfadbg_print_buffer_decl(preemption, " KERNEL: @%ju(%ju) resetting alarm to %ju.\n", currtime.tv, __kernel_get_time().tv, (alarms->head->alarm - currtime).tv);
     128                Duration delta = (*alarms)`first.alarm - currtime;
     129                Duration capped = max(delta, 50`us);
    125130                // itimerval tim  = { caped };
    126131                // __cfaabi_dbg_print_buffer_local( "    Values are %lu, %lu, %lu %lu.\n", delta.tv, caped.tv, tim.it_value.tv_sec, tim.it_value.tv_usec);
    127132
    128                 __kernel_set_timer( caped );
     133                __kernel_set_timer( capped );
    129134        }
    130135}
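// A minimal sketch of the tick logic above in plain C (simplified stand-in types;
// the real code dispatches to preempt/timeout/callback by alarm type):
//
//     #include <stdbool.h>
//     typedef long Time;
//     struct node { struct node * next; Time deadline, period; bool set; void (*fire)(struct node *); };
//
//     static struct node * pop_expired( struct node ** head, Time now ) {
//         if( !*head || (*head)->deadline >= now ) return 0;      // head not expired: nothing to do
//         struct node * n = *head; *head = n->next; return n;     // otherwise pop the earliest deadline
//     }
//
//     static void insert_sorted( struct node ** head, struct node * n ) {
//         while( *head && (*head)->deadline <= n->deadline ) head = &(*head)->next;
//         n->next = *head; *head = n;                             // keep the list ordered by deadline
//     }
//
//     static void tick( struct node ** head, Time now ) {
//         for( struct node * n; (n = pop_expired( head, now )) != 0; ) {
//             if( n->period == 0 ) n->set = false;                // one-shot: no longer pending
//             n->fire( n );                                       // stand-in for preempt/timeout/callback
//             if( n->period > 0 ) {                               // periodic: schedule the next trigger
//                 n->deadline = now + n->period;
//                 insert_sorted( head, n );
//             }
//         }
//     }                                                           // caller re-arms the timer from the new head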
     
    158163// Kernel Signal Tools
    159164//=============================================================================================
    160 
    161 __cfaabi_dbg_debug_do( static thread_local void * last_interrupt = 0; )
      165// In a user-level threading system, there are a handful of thread-local variables for which the following problem occurs on the ARM.
     166//
     167// For each kernel thread running user-level threads, there is a flag variable to indicate if interrupts are
     168// enabled/disabled for that kernel thread. Therefore, this variable is made thread local.
     169//
      170// For example, this code fragment sets the state of the "interrupts" variable in thread-local memory.
     171//
     172// _Thread_local volatile int interrupts;
     173// int main() {
      174//     interrupts = 0; /* disable interrupts */ }
     175//
     176// which generates the following code on the ARM
     177//
     178// (gdb) disassemble main
     179// Dump of assembler code for function main:
     180//    0x0000000000000610 <+0>:  mrs     x1, tpidr_el0
     181//    0x0000000000000614 <+4>:  mov     w0, #0x0                        // #0
     182//    0x0000000000000618 <+8>:  add     x1, x1, #0x0, lsl #12
     183//    0x000000000000061c <+12>: add     x1, x1, #0x10
     184//    0x0000000000000620 <+16>: str     wzr, [x1]
     185//    0x0000000000000624 <+20>: ret
     186//
     187// The mrs moves a pointer from coprocessor register tpidr_el0 into register x1.  Register w0 is set to 0. The two adds
      188// increase the TLS pointer by the displacement (offset) 0x10, which is the location in the TLS of variable
      189// "interrupts".  Finally, 0 is stored into "interrupts" through the pointer in register x1 that points into the
      190// TLS. Now once x1 has the pointer to the location of the TLS for kernel thread N, the thread can be preempted at the
      191// user level and the user thread is put on the user-level ready-queue. When the preempted thread gets to the front of
      192// the user-level ready-queue it is run on kernel thread M. It now stores 0 into "interrupts" back on kernel thread N,
      193// turning off interrupts on the wrong kernel thread.
     194//
     195// On the x86, the following code is generated for the same code fragment.
     196//
     197// (gdb) disassemble main
     198// Dump of assembler code for function main:
     199//    0x0000000000400420 <+0>:  movl   $0x0,%fs:0xfffffffffffffffc
     200//    0x000000000040042c <+12>: xor    %eax,%eax
     201//    0x000000000040042e <+14>: retq
     202//
      203// and base-displacement addressing is used to atomically reset variable "interrupts" relative to the TLS pointer in
      204// register "fs".
     205//
      206// Hence, the ARM has base-displacement addressing for the general-purpose registers, BUT not for the coprocessor
      207// registers. As a result, generating the address for the write into variable "interrupts" is no longer atomic.
     208//
     209// Note this problem does NOT occur when just using multiple kernel threads because the preemption ALWAYS restarts the
     210// thread on the same kernel thread.
     211//
      212// The obvious question is why the ARM uses a coprocessor register to store the TLS pointer given that coprocessor
      213// registers are second-class registers with respect to the instruction set. One possible answer is that they did not
     214// want to dedicate one of the general registers to hold the TLS pointer and there was a free coprocessor register
     215// available.
     216
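// Restating the race as a sketch (hypothetical helper name, C-like pseudocode):
//
//     int * p = tls_address_of( interrupts );   // address computed while running on kernel thread N
//     /* user-level preemption here; the user thread resumes on kernel thread M */
//     *p = 0;                                   // store still targets kernel thread N's TLS
//
// Any TLS write the compiler splits into "compute address, then store" is vulnerable, which is
// why the TLS accesses below are bracketed with assembler labels so the signal handler can refuse
// to preempt anywhere inside them.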
     217//-----------------------------------------------------------------------------
     218// Some assembly required
     219#define __cfaasm_label(label, when) when: asm volatile goto(".global __cfaasm_" #label "_" #when "\n" "__cfaasm_" #label "_" #when ":":::"memory":when)
     220
     221//----------
      222// special case for the preemption flag since it is accessed often
     223bool __preemption_enabled() {
      224        // create an assembler label before
      225        // marked as clobbering memory so the compiler cannot move the access across it
     226        __cfaasm_label(check, before);
     227
     228        // access tls as normal
     229        bool enabled = __cfaabi_tls.preemption_state.enabled;
     230
      231        // create an assembler label after
      232        // marked as clobbering memory so the compiler cannot move the access across it
     233        __cfaasm_label(check, after);
     234        return enabled;
     235}
     236
     237struct asm_region {
     238        void * before;
     239        void * after;
     240};
     241
     242static inline bool __cfaasm_in( void * ip, struct asm_region & region ) {
     243        return ip >= region.before && ip <= region.after;
     244}
     245
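// Usage: a function that touches TLS brackets the access between __cfaasm_label(name, before)
// and __cfaasm_label(name, after), as __preemption_enabled does above. The signal handler later
// materializes each [before, after] pair as an asm_region and uses __cfaasm_in to decline
// preemption when the interrupted instruction pointer falls inside any such region (see
// preemption_ready below).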
     246
     247//----------
     248// Get data from the TLS block
     249// struct asm_region __cfaasm_get;
      250uintptr_t __cfatls_get( unsigned long int offset ) __attribute__((__noinline__)); // no inline: the global before/after labels must exist exactly once
     251uintptr_t __cfatls_get( unsigned long int offset ) {
      252        // create an assembler label before
      253        // marked as clobbering memory so the compiler cannot move the access across it
     254        __cfaasm_label(get, before);
     255
     256        // access tls as normal (except for pointer arithmetic)
     257        uintptr_t val = *(uintptr_t*)((uintptr_t)&__cfaabi_tls + offset);
     258
      259        // create an assembler label after
      260        // marked as clobbering memory so the compiler cannot move the access across it
     261        __cfaasm_label(get, after);
     262        return val;
     263}
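// Usage sketch (gcc builtins assumed; the field choice is illustrative):
//
//     uintptr_t raw = __cfatls_get( __builtin_offsetof( __typeof__(__cfaabi_tls), preemption_state ) );
//
// Because of the noinline attribute, every caller goes through the single labelled region above.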
    162264
    163265extern "C" {
    164266        // Disable interrupts by incrementing the counter
    165267        void disable_interrupts() {
    166                 with( kernelTLS.preemption_state ) {
      268                // create an assembler label before
      269                // marked as clobbering memory so the compiler cannot move the access across it
     270                __cfaasm_label(dsable, before);
     271
     272                with( __cfaabi_tls.preemption_state ) {
    167273                        #if GCC_VERSION > 50000
    168274                        static_assert(__atomic_always_lock_free(sizeof(enabled), &enabled), "Must be lock-free");
     
    181287                        verify( new_val < 65_000u );              // If this triggers someone is disabling interrupts without enabling them
    182288                }
     289
      290                // create an assembler label after
      291                // marked as clobbering memory so the compiler cannot move the access across it
     292                __cfaasm_label(dsable, after);
     293
    183294        }
    184295
     
    186297        // If counter reaches 0, execute any pending __cfactx_switch
    187298        void enable_interrupts( __cfaabi_dbg_ctx_param ) {
    188                 processor   * proc = kernelTLS.this_processor; // Cache the processor now since interrupts can start happening after the atomic store
    189 
    190                 with( kernelTLS.preemption_state ){
     299                // Cache the processor now since interrupts can start happening after the atomic store
     300                processor   * proc = __cfaabi_tls.this_processor;
     301                /* paranoid */ verify( proc );
     302
     303                with( __cfaabi_tls.preemption_state ){
    191304                        unsigned short prev = disable_count;
    192305                        disable_count -= 1;
    193                         verify( prev != 0u );                     // If this triggers someone is enabled already enabled interruptsverify( prev != 0u );
     306
      307                        // If this triggers, someone is enabling interrupts that were never disabled
     308                        /* paranoid */ verify( prev != 0u );
    194309
     195310                        // Check if we need to preempt the thread because an interrupt was missed
    196311                        if( prev == 1 ) {
    197312                                #if GCC_VERSION > 50000
    198                                 static_assert(__atomic_always_lock_free(sizeof(enabled), &enabled), "Must be lock-free");
     313                                        static_assert(__atomic_always_lock_free(sizeof(enabled), &enabled), "Must be lock-free");
    199314                                #endif
    200315
     
    220335        // Don't execute any pending __cfactx_switch even if counter reaches 0
    221336        void enable_interrupts_noPoll() {
    222                 unsigned short prev = kernelTLS.preemption_state.disable_count;
    223                 kernelTLS.preemption_state.disable_count -= 1;
    224                 verifyf( prev != 0u, "Incremented from %u\n", prev );                     // If this triggers someone is enabled already enabled interrupts
     337                unsigned short prev = __cfaabi_tls.preemption_state.disable_count;
     338                __cfaabi_tls.preemption_state.disable_count -= 1;
      339                // If this triggers, someone is enabling interrupts that were never disabled
     340                /* paranoid */ verifyf( prev != 0u, "Incremented from %u\n", prev );
    225341                if( prev == 1 ) {
    226342                        #if GCC_VERSION > 50000
    227                         static_assert(__atomic_always_lock_free(sizeof(kernelTLS.preemption_state.enabled), &kernelTLS.preemption_state.enabled), "Must be lock-free");
     343                                static_assert(__atomic_always_lock_free(sizeof(__cfaabi_tls.preemption_state.enabled), &__cfaabi_tls.preemption_state.enabled), "Must be lock-free");
    228344                        #endif
    229345                        // Set enabled flag to true
    230346                        // should be atomic to avoid preemption in the middle of the operation.
     231347                        // use memory order RELAXED since there are no inter-thread requirements on this variable
    232                         __atomic_store_n(&kernelTLS.preemption_state.enabled, true, __ATOMIC_RELAXED);
     348                        __atomic_store_n(&__cfaabi_tls.preemption_state.enabled, true, __ATOMIC_RELAXED);
    233349
    234350                        // Signal the compiler that a fence is needed but only for signal handlers
     
    237353        }
    238354}
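// Nesting sketch: disable/enable pairs count, so interrupts are truly re-enabled (and any
// missed preemption polled) only at the outermost enable.
//
//     disable_interrupts();                   // disable_count 0 -> 1, enabled = false
//     disable_interrupts();                   // disable_count 1 -> 2
//     enable_interrupts( __cfaabi_dbg_ctx );  // disable_count 2 -> 1, still disabled
//     enable_interrupts( __cfaabi_dbg_ctx );  // disable_count 1 -> 0, enabled = true, pending preemption handled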
     355
     356//-----------------------------------------------------------------------------
     357// Kernel Signal Debug
     358void __cfaabi_check_preemption() {
     359        bool ready = __preemption_enabled();
     360        if(!ready) { abort("Preemption should be ready"); }
     361
     362        __cfaasm_label(debug, before);
     363
     364                sigset_t oldset;
     365                int ret;
     366                ret = pthread_sigmask(0, ( const sigset_t * ) 0p, &oldset);  // workaround trac#208: cast should be unnecessary
     367                if(ret != 0) { abort("ERROR sigprocmask returned %d", ret); }
     368
     369                ret = sigismember(&oldset, SIGUSR1);
     370                if(ret <  0) { abort("ERROR sigismember returned %d", ret); }
     371                if(ret == 1) { abort("ERROR SIGUSR1 is disabled"); }
     372
     373                ret = sigismember(&oldset, SIGALRM);
     374                if(ret <  0) { abort("ERROR sigismember returned %d", ret); }
     375                if(ret == 0) { abort("ERROR SIGALRM is enabled"); }
     376
     377                ret = sigismember(&oldset, SIGTERM);
     378                if(ret <  0) { abort("ERROR sigismember returned %d", ret); }
     379                if(ret == 1) { abort("ERROR SIGTERM is disabled"); }
     380
     381        __cfaasm_label(debug, after);
     382}
     383
     384#ifdef __CFA_WITH_VERIFY__
     385bool __cfaabi_dbg_in_kernel() {
     386        return !__preemption_enabled();
     387}
     388#endif
     389
     390#undef __cfaasm_label
     391
     392//-----------------------------------------------------------------------------
     393// Signal handling
    239394
    240395// sigprocmask wrapper : unblock a single signal
     
    256411
    257412        if ( pthread_sigmask( SIG_BLOCK, &mask, 0p ) == -1 ) {
    258             abort( "internal error, pthread_sigmask" );
     413                abort( "internal error, pthread_sigmask" );
    259414        }
    260415}
     
    268423// reserved for future use
    269424static void timeout( $thread * this ) {
    270         //TODO : implement waking threads
    271 }
     425        unpark( this );
     426}
     427
     428//-----------------------------------------------------------------------------
     429// Some assembly required
     430#if defined( __i386 )
     431        #ifdef __PIC__
     432                #define RELOC_PRELUDE( label ) \
     433                        "calll   .Lcfaasm_prelude_" #label "$pb\n\t" \
     434                        ".Lcfaasm_prelude_" #label "$pb:\n\t" \
     435                        "popl    %%eax\n\t" \
     436                        ".Lcfaasm_prelude_" #label "_end:\n\t" \
     437                        "addl    $_GLOBAL_OFFSET_TABLE_+(.Lcfaasm_prelude_" #label "_end-.Lcfaasm_prelude_" #label "$pb), %%eax\n\t"
     438                #define RELOC_PREFIX ""
     439                #define RELOC_SUFFIX "@GOT(%%eax)"
     440        #else
     441                #define RELOC_PREFIX "$"
     442                #define RELOC_SUFFIX ""
     443        #endif
     444        #define __cfaasm_label( label ) struct asm_region label = \
     445                ({ \
     446                        struct asm_region region; \
     447                        asm( \
     448                                RELOC_PRELUDE( label ) \
     449                                "movl " RELOC_PREFIX "__cfaasm_" #label "_before" RELOC_SUFFIX ", %[vb]\n\t" \
     450                                "movl " RELOC_PREFIX "__cfaasm_" #label "_after"  RELOC_SUFFIX ", %[va]\n\t" \
     451                                 : [vb]"=r"(region.before), [va]"=r"(region.after) \
     452                        ); \
     453                        region; \
     454                });
     455#elif defined( __x86_64 )
     456        #ifdef __PIC__
     457                #define RELOC_PREFIX ""
     458                #define RELOC_SUFFIX "@GOTPCREL(%%rip)"
     459        #else
     460                #define RELOC_PREFIX "$"
     461                #define RELOC_SUFFIX ""
     462        #endif
     463        #define __cfaasm_label( label ) struct asm_region label = \
     464                ({ \
     465                        struct asm_region region; \
     466                        asm( \
     467                                "movq " RELOC_PREFIX "__cfaasm_" #label "_before" RELOC_SUFFIX ", %[vb]\n\t" \
     468                                "movq " RELOC_PREFIX "__cfaasm_" #label "_after"  RELOC_SUFFIX ", %[va]\n\t" \
     469                                 : [vb]"=r"(region.before), [va]"=r"(region.after) \
     470                        ); \
     471                        region; \
     472                });
     473#elif defined( __aarch64__ )
     474        #ifdef __PIC__
     475                // Note that this works only for gcc
     476                #define __cfaasm_label( label ) struct asm_region label = \
     477                ({ \
     478                        struct asm_region region; \
     479                        asm( \
     480                                "adrp %[vb], _GLOBAL_OFFSET_TABLE_"                              "\n\t" \
     481                                "ldr  %[vb], [%[vb], #:gotpage_lo15:__cfaasm_" #label "_before]" "\n\t" \
     482                                "adrp %[va], _GLOBAL_OFFSET_TABLE_"                              "\n\t" \
     483                                "ldr  %[va], [%[va], #:gotpage_lo15:__cfaasm_" #label "_after]"  "\n\t" \
     484                                 : [vb]"=r"(region.before), [va]"=r"(region.after) \
     485                        ); \
     486                        region; \
     487                });
     488        #else
     489                #error this is not the right thing to do
     490                /*
     491                #define __cfaasm_label( label ) struct asm_region label = \
     492                ({ \
     493                        struct asm_region region; \
     494                        asm( \
     495                                "adrp %[vb], __cfaasm_" #label "_before"              "\n\t" \
     496                                "add  %[vb], %[vb], :lo12:__cfaasm_" #label "_before" "\n\t" \
     497                                "adrp %[va], :got:__cfaasm_" #label "_after"          "\n\t" \
     498                                "add  %[va], %[va], :lo12:__cfaasm_" #label "_after"  "\n\t" \
     499                                 : [vb]"=r"(region.before), [va]"=r"(region.after) \
     500                        ); \
     501                        region; \
     502                });
     503                */
     504        #endif
     505#else
     506        #error unknown hardware architecture
     507#endif
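// Under PIC the label addresses are not link-time constants, so they are loaded through the GOT
// (e.g. "movq __cfaasm_get_before@GOTPCREL(%rip), %reg" on x86-64); non-PIC builds can load them
// as immediates ("movq $__cfaasm_get_before, %reg").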
    272508
    273509// KERNEL ONLY
     
    275511// If true  : preemption is safe
    276512// If false : preemption is unsafe and marked as pending
    277 static inline bool preemption_ready() {
     513static inline bool preemption_ready( void * ip ) {
      514        // Get all the regions in which it is not safe to preempt
     515        __cfaasm_label( get    );
     516        __cfaasm_label( check  );
     517        __cfaasm_label( dsable );
     518        __cfaasm_label( debug  );
     519
    278520        // Check if preemption is safe
    279         bool ready = kernelTLS.preemption_state.enabled && ! kernelTLS.preemption_state.in_progress;
    280 
     521        bool ready = true;
     522        if( __cfaasm_in( ip, get    ) ) { ready = false; goto EXIT; };
     523        if( __cfaasm_in( ip, check  ) ) { ready = false; goto EXIT; };
     524        if( __cfaasm_in( ip, dsable ) ) { ready = false; goto EXIT; };
     525        if( __cfaasm_in( ip, debug  ) ) { ready = false; goto EXIT; };
     526        if( !__cfaabi_tls.preemption_state.enabled) { ready = false; goto EXIT; };
     527        if( __cfaabi_tls.preemption_state.in_progress ) { ready = false; goto EXIT; };
     528
     529EXIT:
    281530        // Adjust the pending flag accordingly
    282         kernelTLS.this_processor->pending_preemption = !ready;
     531        __cfaabi_tls.this_processor->pending_preemption = !ready;
    283532        return ready;
    284533}
     
    290539// Startup routine to activate preemption
    291540// Called from kernel_startup
    292 void kernel_start_preemption() {
     541void __kernel_alarm_startup() {
    293542        __cfaabi_dbg_print_safe( "Kernel : Starting preemption\n" );
    294543
    295544        // Start with preemption disabled until ready
    296         kernelTLS.preemption_state.enabled = false;
    297         kernelTLS.preemption_state.disable_count = 1;
     545        __cfaabi_tls.preemption_state.enabled = false;
     546        __cfaabi_tls.preemption_state.disable_count = 1;
    298547
    299548        // Initialize the event kernel
     
    303552        // Setup proper signal handlers
    304553        __cfaabi_sigaction( SIGUSR1, sigHandler_ctxSwitch, SA_SIGINFO | SA_RESTART ); // __cfactx_switch handler
     554        __cfaabi_sigaction( SIGALRM, sigHandler_alarm    , SA_SIGINFO | SA_RESTART ); // debug handler
    305555
    306556        signal_block( SIGALRM );
     
    311561// Shutdown routine to deactivate preemption
    312562// Called from kernel_shutdown
    313 void kernel_stop_preemption() {
     563void __kernel_alarm_shutdown() {
    314564        __cfaabi_dbg_print_safe( "Kernel : Preemption stopping\n" );
    315565
     
    325575        // Wait for the preemption thread to finish
    326576
    327         pthread_join( alarm_thread, 0p );
    328         free( alarm_stack );
     577        __destroy_pthread( alarm_thread, alarm_stack, 0p );
    329578
    330579        // Preemption is now fully stopped
     
    352601// Kernel Signal Handlers
    353602//=============================================================================================
     603__cfaabi_dbg_debug_do( static thread_local void * last_interrupt = 0; )
    354604
    355605// Context switch signal handler
    356606// Receives SIGUSR1 signal and causes the current thread to yield
    357607static void sigHandler_ctxSwitch( __CFA_SIGPARMS__ ) {
    358         __cfaabi_dbg_debug_do( last_interrupt = (void *)(cxt->uc_mcontext.CFA_REG_IP); )
     608        void * ip = (void *)(cxt->uc_mcontext.CFA_REG_IP);
     609        __cfaabi_dbg_debug_do( last_interrupt = ip; )
    359610
     360611        // SKULLDUGGERY: if a thread creates a processor and then immediately deletes it,
    361612        // the interrupt that is supposed to force the kernel thread to preempt might arrive
    362         // before the kernel thread has even started running. When that happens an iterrupt
    363         // we a null 'this_processor' will be caught, just ignore it.
    364         if(! kernelTLS.this_processor ) return;
     613        // before the kernel thread has even started running. When that happens, an interrupt
     614        // with a null 'this_processor' will be caught, just ignore it.
     615        if(! __cfaabi_tls.this_processor ) return;
    365616
    366617        choose(sfp->si_value.sival_int) {
    367618                case PREEMPT_NORMAL   : ;// Normal case, nothing to do here
    368                 case PREEMPT_TERMINATE: verify( __atomic_load_n( &kernelTLS.this_processor->do_terminate, __ATOMIC_SEQ_CST ) );
     619                case PREEMPT_TERMINATE: verify( __atomic_load_n( &__cfaabi_tls.this_processor->do_terminate, __ATOMIC_SEQ_CST ) );
    369620                default:
    370621                        abort( "internal error, signal value is %d", sfp->si_value.sival_int );
     
    372623
    373624        // Check if it is safe to preempt here
    374         if( !preemption_ready() ) { return; }
    375 
    376         __cfaabi_dbg_print_buffer_decl( " KERNEL: preempting core %p (%p @ %p).\n", kernelTLS.this_processor, kernelTLS.this_thread, (void *)(cxt->uc_mcontext.CFA_REG_IP) );
     625        if( !preemption_ready( ip ) ) { return; }
     626
     627        __cfaabi_dbg_print_buffer_decl( " KERNEL: preempting core %p (%p @ %p).\n", __cfaabi_tls.this_processor, __cfaabi_tls.this_thread, (void *)(cxt->uc_mcontext.CFA_REG_IP) );
    377628
    378629        // Sync flag : prevent recursive calls to the signal handler
    379         kernelTLS.preemption_state.in_progress = true;
     630        __cfaabi_tls.preemption_state.in_progress = true;
    380631
    381632        // Clear sighandler mask before context switching.
     
    387638        }
    388639
    389         // TODO: this should go in finish action
    390640        // Clear the in progress flag
    391         kernelTLS.preemption_state.in_progress = false;
     641        __cfaabi_tls.preemption_state.in_progress = false;
    392642
    393643        // Preemption can occur here
     
    395645        force_yield( __ALARM_PREEMPTION ); // Do the actual __cfactx_switch
    396646}
     647
     648static void sigHandler_alarm( __CFA_SIGPARMS__ ) {
     649        abort("SIGALRM should never reach the signal handler");
     650}
     651
     652#if !defined(__CFA_NO_STATISTICS__)
     653        int __print_alarm_stats = 0;
     654#endif
    397655
    398656// Main of the alarm thread
     399657// Waits on SIGALRM and sends SIGUSR1 to whoever needs it
    400658static void * alarm_loop( __attribute__((unused)) void * args ) {
     659        __processor_id_t id;
     660        id.full_proc = false;
     661        id.id = doregister(&id);
     662        __cfaabi_tls.this_proc_id = &id;
     663
     664        #if !defined(__CFA_NO_STATISTICS__)
     665                struct __stats_t local_stats;
     666                __cfaabi_tls.this_stats = &local_stats;
     667                __init_stats( &local_stats );
     668        #endif
     669
     401670        // Block SIGALRM signals to control when they arrive
    402671        sigset_t mask;
     
    456725EXIT:
    457726        __cfaabi_dbg_print_safe( "Kernel : Preemption thread stopping\n" );
     727        unregister(&id);
     728
     729        #if !defined(__CFA_NO_STATISTICS__)
     730                if( 0 != __print_alarm_stats ) {
     731                        __print_stats( &local_stats, __print_alarm_stats, "Alarm", "Thread", 0p );
     732                }
     733        #endif
    458734        return 0p;
    459735}
    460 
    461 //=============================================================================================
    462 // Kernel Signal Debug
    463 //=============================================================================================
    464 
    465 void __cfaabi_check_preemption() {
    466         bool ready = kernelTLS.preemption_state.enabled;
    467         if(!ready) { abort("Preemption should be ready"); }
    468 
    469         sigset_t oldset;
    470         int ret;
    471         ret = pthread_sigmask(0, 0p, &oldset);
    472         if(ret != 0) { abort("ERROR sigprocmask returned %d", ret); }
    473 
    474         ret = sigismember(&oldset, SIGUSR1);
    475         if(ret <  0) { abort("ERROR sigismember returned %d", ret); }
    476         if(ret == 1) { abort("ERROR SIGUSR1 is disabled"); }
    477 
    478         ret = sigismember(&oldset, SIGALRM);
    479         if(ret <  0) { abort("ERROR sigismember returned %d", ret); }
    480         if(ret == 0) { abort("ERROR SIGALRM is enabled"); }
    481 
    482         ret = sigismember(&oldset, SIGTERM);
    483         if(ret <  0) { abort("ERROR sigismember returned %d", ret); }
    484         if(ret == 1) { abort("ERROR SIGTERM is disabled"); }
    485 }
    486 
    487 #ifdef __CFA_WITH_VERIFY__
    488 bool __cfaabi_dbg_in_kernel() {
    489         return !kernelTLS.preemption_state.enabled;
    490 }
    491 #endif
    492736
    493737// Local Variables: //
  • libcfa/src/concurrency/preemption.hfa

    r3c64c668 r58fe85a  
    1616#pragma once
    1717
     18#include "bits/locks.hfa"
    1819#include "alarm.hfa"
    19 #include "kernel_private.hfa"
    2020
    21 void kernel_start_preemption();
    22 void kernel_stop_preemption();
     21struct event_kernel_t {
     22        alarm_list_t alarms;
     23        __spinlock_t lock;
     24};
     25
     26extern event_kernel_t * event_kernel;
     27
    2328void update_preemption( processor * this, Duration duration );
    2429
  • libcfa/src/concurrency/thread.cfa

    r3c64c668 r58fe85a  
    1919
    2020#include "kernel_private.hfa"
     21#include "exception.hfa"
    2122
    2223#define __CFA_INVOKE_PRIVATE__
     
    2829        context{ 0p, 0p };
    2930        self_cor{ name, storage, storageSize };
     31        ticket = TICKET_RUNNING;
    3032        state = Start;
    3133        preempted = __NO_PREEMPTION;
     
    3537        self_mon_p = &self_mon;
    3638        curr_cluster = &cl;
    37         next = 0p;
     39        link.next = 0p;
     40        link.prev = 0p;
     41        link.preferred = -1;
     42        #if defined( __CFA_WITH_VERIFY__ )
     43                canary = 0x0D15EA5E0D15EA5Ep;
     44        #endif
     45
     46        seqable.next = 0p;
     47        seqable.back = 0p;
    3848
    3949        node.next = 0p;
     
    4555
    4656void ^?{}($thread& this) with( this ) {
     57        #if defined( __CFA_WITH_VERIFY__ )
     58                canary = 0xDEADDEADDEADDEADp;
     59        #endif
    4760        unregister(curr_cluster, this);
    4861        ^self_cor{};
     62}
     63
     64FORALL_DATA_INSTANCE(ThreadCancelled, (dtype thread_t), (thread_t))
     65
     66forall(dtype T)
     67void copy(ThreadCancelled(T) * dst, ThreadCancelled(T) * src) {
     68        dst->virtual_table = src->virtual_table;
     69        dst->the_thread = src->the_thread;
     70        dst->the_exception = src->the_exception;
     71}
     72
     73forall(dtype T)
     74const char * msg(ThreadCancelled(T) *) {
     75        return "ThreadCancelled";
     76}
     77
     78forall(dtype T)
     79static void default_thread_cancel_handler(ThreadCancelled(T) & ) {
     80        abort( "Unhandled thread cancellation.\n" );
     81}
     82
     83forall(dtype T | is_thread(T) | IS_EXCEPTION(ThreadCancelled, (T)))
     84void ?{}( thread_dtor_guard_t & this,
     85                T & thrd, void(*defaultResumptionHandler)(ThreadCancelled(T) &)) {
     86        $monitor * m = get_monitor(thrd);
     87        $thread * desc = get_thread(thrd);
     88
     89        // Setup the monitor guard
     90        void (*dtor)(T& mutex this) = ^?{};
     91        bool join = defaultResumptionHandler != (void(*)(ThreadCancelled(T)&))0;
     92        (this.mg){&m, (void(*)())dtor, join};
     93
     94
     95        /* paranoid */ verifyf( Halted == desc->state || Cancelled == desc->state, "Expected thread to be Halted or Cancelled, was %d\n", (int)desc->state );
     96
     97        // After the guard set-up and any wait, check for cancellation.
     98        struct _Unwind_Exception * cancellation = desc->self_cor.cancellation;
     99        if ( likely( 0p == cancellation ) ) {
     100                return;
     101        } else if ( Cancelled == desc->state ) {
     102                return;
     103        }
     104        desc->state = Cancelled;
     105        if (!join) {
     106                defaultResumptionHandler = default_thread_cancel_handler;
     107        }
     108
     109        ThreadCancelled(T) except;
      110        // TODO: Remove explicit vtable set once trac#186 is fixed.
     111        except.virtual_table = &get_exception_vtable(&except);
     112        except.the_thread = &thrd;
     113        except.the_exception = __cfaehm_cancellation_exception( cancellation );
     114        throwResume except;
     115
     116        except.the_exception->virtual_table->free( except.the_exception );
     117        free( cancellation );
     118        desc->self_cor.cancellation = 0p;
     119}
     120
     121void ^?{}( thread_dtor_guard_t & this ) {
     122        ^(this.mg){};
    49123}
    50124
     
    59133
    60134        this_thrd->context.[SP, FP] = this_thrd->self_cor.context.[SP, FP];
    61         verify( this_thrd->context.SP );
     135        /* paranoid */ verify( this_thrd->context.SP );
    62136
    63         __schedule_thread(this_thrd);
     137        __schedule_thread( this_thrd );
    64138        enable_interrupts( __cfaabi_dbg_ctx );
    65139}
     
    84158}
    85159
     160//-----------------------------------------------------------------------------
     161forall(dtype T | is_thread(T) | IS_RESUMPTION_EXCEPTION(ThreadCancelled, (T)))
     162T & join( T & this ) {
     163        thread_dtor_guard_t guard = { this, defaultResumptionHandler };
     164        return this;
     165}
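// Usage sketch (hypothetical Worker type): a thread runs from its declaration, and join blocks
// until it halts, resuming ThreadCancelled at the joiner if the thread was cancelled.
//
//     thread Worker { int result; };
//     void main( Worker & this ) { this.result = 42; }
//     int main() {
//             Worker w;            // thread starts running here
//             join( w );           // block until w's main returns
//             sout | w.result;
//     }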
     166
     167uint64_t thread_rand() {
     168        disable_interrupts();
     169        uint64_t ret = __tls_rand();
     170        enable_interrupts( __cfaabi_dbg_ctx );
     171        return ret;
     172}
     173
    86174// Local Variables: //
    87175// mode: c //
  • libcfa/src/concurrency/thread.hfa

    r3c64c668 r58fe85a  
    2222#include "kernel.hfa"
    2323#include "monitor.hfa"
     24#include "exception.hfa"
    2425
    2526//-----------------------------------------------------------------------------
    2627// thread trait
    2728trait is_thread(dtype T) {
    28       void ^?{}(T& mutex this);
    29       void main(T& this);
    30       $thread* get_thread(T& this);
     29        void ^?{}(T& mutex this);
     30        void main(T& this);
     31        $thread* get_thread(T& this);
    3132};
     33
     34FORALL_DATA_EXCEPTION(ThreadCancelled, (dtype thread_t), (thread_t)) (
     35        thread_t * the_thread;
     36        exception_t * the_exception;
     37);
     38
     39forall(dtype T)
     40void copy(ThreadCancelled(T) * dst, ThreadCancelled(T) * src);
     41
     42forall(dtype T)
     43const char * msg(ThreadCancelled(T) *);
    3244
    3345// define that satisfies the trait without using the thread keyword
     
    6678static inline void ?{}($thread & this, const char * const name, struct cluster & cl, size_t stackSize ) { this{ name, cl, 0p, stackSize }; }
    6779
     80struct thread_dtor_guard_t {
     81        monitor_dtor_guard_t mg;
     82};
     83
     84forall( dtype T | is_thread(T) | IS_EXCEPTION(ThreadCancelled, (T)) )
     85void ?{}( thread_dtor_guard_t & this, T & thrd, void(*)(ThreadCancelled(T) &) );
     86void ^?{}( thread_dtor_guard_t & this );
     87
    6888//-----------------------------------------------------------------------------
    6989// thread runner
     
    82102forall( dtype T | sized(T) | is_thread(T) )
    83103void ^?{}( scoped(T)& this );
    84 
    85 //-----------------------------------------------------------------------------
    86 // Thread getters
    87 static inline struct $thread * active_thread () { return TL_GET( this_thread ); }
    88104
    89105//-----------------------------------------------------------------------------
     
    106122bool force_yield( enum __Preemption_Reason );
    107123
    108 static inline void yield() {
    109         force_yield(__MANUAL_PREEMPTION);
    110 }
     124//----------
     125// sleep: force thread to block and be rescheduled after Duration duration
     126void sleep( Duration duration );
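// For example, sleep( 10`ms ) blocks the calling thread for at least ten milliseconds
// (same duration-literal syntax as the 50`us cap used in preemption.cfa).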
    111127
    112 // Yield: yield N times
    113 static inline void yield( unsigned times ) {
    114         for( times ) {
    115                 yield();
    116         }
    117 }
     128//----------
     129// join
     130forall( dtype T | is_thread(T) | IS_RESUMPTION_EXCEPTION(ThreadCancelled, (T)) )
     131T & join( T & this );
    118132
    119133// Local Variables: //