Changeset 7f6a7c9 for libcfa/src/concurrency

libcfa/src/concurrency/io.cfa

-              r428adbc
+              r7f6a7c9
         };
         static $io_context * __ioarbiter_allocate( $io_arbiter & this, __u32 idxs[], __u32 want );
         static void __ioarbiter_submit( $io_context * , __u32 idxs[], __u32 have, bool lazy );
         static void __ioarbiter_flush ( $io_context & );
         static inline void __ioarbiter_notify( $io_context & ctx );
+        static io_context$ * __ioarbiter_allocate( io_arbiter$ & this, __u32 idxs[], __u32 want );
+        static void __ioarbiter_submit( io_context$ * , __u32 idxs[], __u32 have, bool lazy );
+        static void __ioarbiter_flush ( io_context$ & );
+        static inline void __ioarbiter_notify( io_context$ & ctx );
 //=============================================================================================
 // I/O Polling
 //=============================================================================================
         static inline unsigned __flush( struct $io_context & );
         static inline __u32 __release_sqes( struct $io_context & );
+        static inline unsigned __flush( struct io_context$ & );
+        static inline __u32 __release_sqes( struct io_context$ & );
         extern void __kernel_unpark( thread$ * thrd, unpark_hint );
         static void ioring_syscsll( struct $io_context & ctx, unsigned int min_comp, unsigned int flags ) {
+        static void ioring_syscsll( struct io_context$ & ctx, unsigned int min_comp, unsigned int flags ) {
                 __STATS__( true, io.calls.flush++; )
                 int ret;
 …
+        }
         static bool try_acquire( $io_context * ctx ) __attribute__((nonnull(1))) {
+        static bool try_acquire( io_context$ * ctx ) __attribute__((nonnull(1))) {
                 /* paranoid */ verify( ! __preemption_enabled() );
                 /* paranoid */ verify( ready_schedule_islocked() );
 …
+        }
         static bool __cfa_do_drain( $io_context * ctx, cluster * cltr ) __attribute__((nonnull(1, 2))) {
+        static bool __cfa_do_drain( io_context$ * ctx, cluster * cltr ) __attribute__((nonnull(1, 2))) {
                 /* paranoid */ verify( ! __preemption_enabled() );
                 /* paranoid */ verify( ready_schedule_islocked() );
 …
                 cluster * const cltr = proc->cltr;
                 $io_context * const ctx = proc->io.ctx;
+                io_context$ * const ctx = proc->io.ctx;
                 /* paranoid */ verify( cltr );
                 /* paranoid */ verify( ctx );
 …
                 /* paranoid */ verify( proc->io.ctx );
                 $io_context & ctx = *proc->io.ctx;
+                io_context$ & ctx = *proc->io.ctx;
                 __ioarbiter_flush( ctx );
 …
         // Allocation
         // for user's convenience fill the sqes from the indexes
         static inline void __fill(struct io_uring_sqe * out_sqes[], __u32 want, __u32 idxs[], struct $io_context * ctx)  {
+        static inline void __fill(struct io_uring_sqe * out_sqes[], __u32 want, __u32 idxs[], struct io_context$ * ctx)  {
                 struct io_uring_sqe * sqes = ctx->sq.sqes;
                 for(i; want) {
 …
         // Try to directly allocate from a given context
         // Not thread-safe
         static inline bool __alloc(struct $io_context * ctx, __u32 idxs[], __u32 want) {
+        static inline bool __alloc(struct io_context$ * ctx, __u32 idxs[], __u32 want) {
                 __sub_ring_t & sq = ctx->sq;
                 const __u32 mask  = *sq.mask;
 …
         // for convenience, return both the index and the pointer to the sqe
         // sqe == &sqes[idx]
         struct $io_context * cfa_io_allocate(struct io_uring_sqe * sqes[], __u32 idxs[], __u32 want) libcfa_public {
+        struct io_context$ * cfa_io_allocate(struct io_uring_sqe * sqes[], __u32 idxs[], __u32 want) libcfa_public {
                 // __cfadbg_print_safe(io, "Kernel I/O : attempting to allocate %u\n", want);
                 disable_interrupts();
                 processor * proc = __cfaabi_tls.this_processor;
                 $io_context * ctx = proc->io.ctx;
+                io_context$ * ctx = proc->io.ctx;
                 /* paranoid */ verify( __cfaabi_tls.this_processor );
                 /* paranoid */ verify( ctx );
 …
                 enable_interrupts();
                 $io_arbiter * ioarb = proc->cltr->io.arbiter;
+                io_arbiter$ * ioarb = proc->cltr->io.arbiter;
                 /* paranoid */ verify( ioarb );
                 // __cfadbg_print_safe(io, "Kernel I/O : falling back on arbiter for allocation\n");
                 struct $io_context * ret = __ioarbiter_allocate(*ioarb, idxs, want);
+                struct io_context$ * ret = __ioarbiter_allocate(*ioarb, idxs, want);
                 // __cfadbg_print_safe(io, "Kernel I/O : slow allocation completed from ring %d\n", ret->fd);
 …
         //=============================================================================================
         // submission
         static inline void __submit_only( struct $io_context * ctx, __u32 idxs[], __u32 have) {
+        static inline void __submit_only( struct io_context$ * ctx, __u32 idxs[], __u32 have) {
                 // We can proceed to the fast path
                 // Get the right objects
 …
+        }
         static inline void __submit( struct $io_context * ctx, __u32 idxs[], __u32 have, bool lazy) {
+        static inline void __submit( struct io_context$ * ctx, __u32 idxs[], __u32 have, bool lazy) {
                 __sub_ring_t & sq = ctx->sq;
                 __submit_only(ctx, idxs, have);
 …
+        }
         void cfa_io_submit( struct $io_context * inctx, __u32 idxs[], __u32 have, bool lazy ) __attribute__((nonnull (1))) libcfa_public {
+        void cfa_io_submit( struct io_context$ * inctx, __u32 idxs[], __u32 have, bool lazy ) __attribute__((nonnull (1))) libcfa_public {
                 // __cfadbg_print_safe(io, "Kernel I/O : attempting to submit %u (%s)\n", have, lazy ? "lazy" : "eager");
 …
                 __STATS__( true, if(!lazy) io.submit.eagr += 1; )
                 processor * proc = __cfaabi_tls.this_processor;
                 $io_context * ctx = proc->io.ctx;
+                io_context$ * ctx = proc->io.ctx;
                 /* paranoid */ verify( __cfaabi_tls.this_processor );
                 /* paranoid */ verify( ctx );
 …
         // by io_uring
         // This cannot be done by multiple threads
         static __u32 __release_sqes( struct $io_context & ctx ) {
+        static __u32 __release_sqes( struct io_context$ & ctx ) {
                 const __u32 mask = *ctx.sq.mask;
 …
+        }
         static $io_context * __ioarbiter_allocate( $io_arbiter & this, __u32 idxs[], __u32 want ) {
+        static io_context$ * __ioarbiter_allocate( io_arbiter$ & this, __u32 idxs[], __u32 want ) {
                 // __cfadbg_print_safe(io, "Kernel I/O : arbiter allocating\n");
 …
+        }
         static void __ioarbiter_notify( $io_arbiter & this, $io_context * ctx ) {
+        static void __ioarbiter_notify( io_arbiter$ & this, io_context$ * ctx ) {
                 /* paranoid */ verify( !empty(this.pending.queue) );
 …
+        }
         static void __ioarbiter_notify( $io_context & ctx ) {
+        static void __ioarbiter_notify( io_context$ & ctx ) {
                 if(!empty( ctx.arbiter->pending )) {
                         __ioarbiter_notify( *ctx.arbiter, &ctx );
 …
         // Simply append to the pending
         static void __ioarbiter_submit( $io_context * ctx, __u32 idxs[], __u32 have, bool lazy ) {
+        static void __ioarbiter_submit( io_context$ * ctx, __u32 idxs[], __u32 have, bool lazy ) {
                 __cfadbg_print_safe(io, "Kernel I/O : submitting %u from the arbiter to context %u\n", have, ctx->fd);
 …
+        }
         static void __ioarbiter_flush( $io_context & ctx ) {
+        static void __ioarbiter_flush( io_context$ & ctx ) {
                 if(!empty( ctx.ext_sq )) {
                         __STATS__( false, io.flush.external += 1; )
 …
         #if defined(CFA_WITH_IO_URING_IDLE)
                 bool __kernel_read(processor * proc, io_future_t & future, iovec & iov, int fd) {
                         $io_context * ctx = proc->io.ctx;
+                        io_context$ * ctx = proc->io.ctx;
                         /* paranoid */ verify( ! __preemption_enabled() );
                         /* paranoid */ verify( proc == __cfaabi_tls.this_processor );

libcfa/src/concurrency/io/call.cfa.in

-              r428adbc
+              r7f6a7c9
+        ;
         extern struct $io_context * cfa_io_allocate(struct io_uring_sqe * out_sqes[], __u32 out_idxs[], __u32 want)  __attribute__((nonnull (1,2)));
         extern void cfa_io_submit( struct $io_context * in_ctx, __u32 in_idxs[], __u32 have, bool lazy ) __attribute__((nonnull (1,2)));
+        extern struct io_context$ * cfa_io_allocate(struct io_uring_sqe * out_sqes[], __u32 out_idxs[], __u32 want)  __attribute__((nonnull (1,2)));
+        extern void cfa_io_submit( struct io_context$ * in_ctx, __u32 in_idxs[], __u32 have, bool lazy ) __attribute__((nonnull (1,2)));
 #endif
 …
                 __u32 idx;
                 struct io_uring_sqe * sqe;
+                struct $io_context * ctx = cfa_io_allocate( &sqe, &idx, 1 );
+                struct io_context$ * ctx = cfa_io_allocate( &sqe, &idx, 1 );
+                memset(sqe, 0, sizeof(*sqe));
                 sqe->opcode = IORING_OP_{op};
+                sqe->flags = sflags;
                 sqe->user_data = (uintptr_t)&future;
+                sqe->flags = sflags;
+                sqe->ioprio = 0;
+                sqe->fd = 0;
+                sqe->off = 0;
+                sqe->addr = 0;
+                sqe->len = 0;
+                sqe->fsync_flags = 0;
+                sqe->__pad2[0] = 0;
+                sqe->__pad2[1] = 0;
+                sqe->__pad2[2] = 0;{body}
+                {body}
                 asm volatile("": : :"memory");

libcfa/src/concurrency/io/setup.cfa

-              r428adbc
+              r7f6a7c9
         void ?{}(io_context_params & this) libcfa_public {}
         void  ?{}($io_context & this, struct cluster & cl) {}
         void ^?{}($io_context & this) {}
+        void  ?{}(io_context$ & this, struct cluster & cl) {}
+        void ^?{}(io_context$ & this) {}
         void __cfa_io_start( processor * proc ) {}
 …
         void __cfa_io_stop ( processor * proc ) {}
         $io_arbiter * create(void) { return 0p; }
         void destroy($io_arbiter *) {}
+        io_arbiter$ * create(void) { return 0p; }
+        void destroy(io_arbiter$ *) {}
 #else
 …
         static void __io_uring_setup ( $io_context & this, const io_context_params & params_in, int procfd );
         static void __io_uring_teardown( $io_context & this );
         static void __epoll_register($io_context & ctx);
         static void __epoll_unregister($io_context & ctx);
         void __ioarbiter_register( $io_arbiter & mutex, $io_context & ctx );
         void __ioarbiter_unregister( $io_arbiter & mutex, $io_context & ctx );
         void ?{}($io_context & this, processor * proc, struct cluster & cl) {
+        static void __io_uring_setup ( io_context$ & this, const io_context_params & params_in, int procfd );
+        static void __io_uring_teardown( io_context$ & this );
+        static void __epoll_register(io_context$ & ctx);
+        static void __epoll_unregister(io_context$ & ctx);
+        void __ioarbiter_register( io_arbiter$ & mutex, io_context$ & ctx );
+        void __ioarbiter_unregister( io_arbiter$ & mutex, io_context$ & ctx );
+        void ?{}(io_context$ & this, processor * proc, struct cluster & cl) {
                 /* paranoid */ verify( cl.io.arbiter );
                 this.proc = proc;
 …
+        }
         void ^?{}($io_context & this) {
+        void ^?{}(io_context$ & this) {
                 __cfadbg_print_safe(io_core, "Kernel I/O : tearing down io_context %u\n", this.fd);
 …
+        }
         static void __io_uring_setup( $io_context & this, const io_context_params & params_in, int procfd ) {
+        static void __io_uring_setup( io_context$ & this, const io_context_params & params_in, int procfd ) {
                 // Step 1 : call to setup
                 struct io_uring_params params;
 …
                 #if !defined(CFA_WITH_IO_URING_IDLE)
+                {
                         // Step 4 : eventfd
                         __cfadbg_print_safe(io_core, "Kernel I/O : registering %d for completion with ring %d\n", procfd, fd);
 …
                         __cfadbg_print_safe(io_core, "Kernel I/O : registered %d for completion with ring %d\n", procfd, fd);
+                #endif
+                }
+                #endif
+                // TODO: implement a proper version of this.
+                // I have not found a better maximum that works in general but users should be able to configure it
+                // the same way they configure other I/O options
                 // #if defined(CFA_HAVE_IORING_REGISTER_IOWQ_MAX_WORKERS)
+                // {
                 //      // Step 5 : max worker count
                 //      __cfadbg_print_safe(io_core, "Kernel I/O : lmiting max workers for ring %d\n", fd);
 …
                 //      __cfadbg_print_safe(io_core, "Kernel I/O : lmited max workers for ring %d\n", fd);
+                // }
                 // #endif
 …
+        }
         static void __io_uring_teardown( $io_context & this ) {
+        static void __io_uring_teardown( io_context$ & this ) {
                 // Shutdown the io rings
                 struct __sub_ring_t & sq = this.sq;
 …
 // I/O Context Sleep
 //=============================================================================================
         // static inline void __epoll_ctl($io_context & ctx, int op, const char * error) {
+        // static inline void __epoll_ctl(io_context$ & ctx, int op, const char * error) {
         //      struct epoll_event ev;
         //      ev.events = EPOLLIN | EPOLLONESHOT;
 …
         // }
         // static void __epoll_register($io_context & ctx) {
+        // static void __epoll_register(io_context$ & ctx) {
         //      __epoll_ctl(ctx, EPOLL_CTL_ADD, "ADD");
         // }
         // static void __epoll_unregister($io_context & ctx) {
+        // static void __epoll_unregister(io_context$ & ctx) {
         //      // Read the current epoch so we know when to stop
         //      size_t curr = __atomic_load_n(&iopoll.epoch, __ATOMIC_SEQ_CST);
 …
         // }
         // void __ioctx_prepare_block($io_context & ctx) {
+        // void __ioctx_prepare_block(io_context$ & ctx) {
         //      __cfadbg_print_safe(io_core, "Kernel I/O - epoll : Re-arming io poller %d (%p)\n", ctx.fd, &ctx);
         //      __epoll_ctl(ctx, EPOLL_CTL_MOD, "REARM");
 …
 // I/O Context Misc Setup
 //=============================================================================================
         void ?{}( $io_arbiter & this ) {
+        void ?{}( io_arbiter$ & this ) {
                 this.pending.empty = true;
+        }
         void ^?{}( $io_arbiter & mutex this ) {}
         $io_arbiter * create(void) {
+        void ^?{}( io_arbiter$ & mutex this ) {}
+        io_arbiter$ * create(void) {
                 return new();
+        }
         void destroy($io_arbiter * arbiter) {
+        void destroy(io_arbiter$ * arbiter) {
                 delete(arbiter);
+        }

libcfa/src/concurrency/io/types.hfa

-              r428adbc
+              r7f6a7c9
         struct processor;
         monitor $io_arbiter;
+        monitor io_arbiter$;
         //-----------------------------------------------------------------------
 …
         struct __attribute__((aligned(64))) $io_context {
                 $io_arbiter * arbiter;
+        struct __attribute__((aligned(64))) io_context$ {
+                io_arbiter$ * arbiter;
                 processor * proc;
 …
         };
         static inline unsigned long long ts($io_context *& this) {
+        static inline unsigned long long ts(io_context$ *& this) {
                 const __u32 head = *this->cq.head;
                 const __u32 tail = *this->cq.tail;
 …
                 __u32 * idxs;
                 __u32 want;
                 $io_context * ctx;
+                io_context$ * ctx;
         };
         monitor __attribute__((aligned(64))) $io_arbiter {
+        monitor __attribute__((aligned(64))) io_arbiter$ {
                 __outstanding_io_queue pending;
         };
 …
         #endif
         // void __ioctx_prepare_block($io_context & ctx);
+        // void __ioctx_prepare_block(io_context$ & ctx);
 #endif

libcfa/src/concurrency/iofwd.hfa

-              r428adbc
+              r7f6a7c9
 struct cluster;
 struct $io_context;
+struct io_context$;
 struct iovec;
 …
 //----------
 // underlying calls
 extern struct $io_context * cfa_io_allocate(struct io_uring_sqe * out_sqes[], __u32 out_idxs[], __u32 want)  __attribute__((nonnull (1,2)));
 extern void cfa_io_submit( struct $io_context * in_ctx, __u32 in_idxs[], __u32 have, bool lazy ) __attribute__((nonnull (1,2)));
+extern struct io_context$ * cfa_io_allocate(struct io_uring_sqe * out_sqes[], __u32 out_idxs[], __u32 want)  __attribute__((nonnull (1,2)));
+extern void cfa_io_submit( struct io_context$ * in_ctx, __u32 in_idxs[], __u32 have, bool lazy ) __attribute__((nonnull (1,2)));
 //----------

libcfa/src/concurrency/kernel.cfa

-              r428adbc
+              r7f6a7c9
         RUNNING:  while(true) {
                 thrd_dst->preempted = __NO_PREEMPTION;
-                thrd_dst->state = Active;
                 // Update global state
                 kernelTLS().this_thread = thrd_dst;
+                // Update the state after setting this_thread
+                // so that the debugger can find all active threads
+                // in tls storage
+                thrd_dst->state = Active;
                 /* paranoid */ verify( ! __preemption_enabled() );
 …
                 /* paranoid */ verify( ! __preemption_enabled() );
-                // Reset global state
-                kernelTLS().this_thread = 0p;
                 // We just finished running a thread, there are a few things that could have happened.
                 // 1 - Regular case : the thread has blocked and no one has scheduled it yet.
 …
                 if(unlikely(thrd_dst->preempted != __NO_PREEMPTION)) {
+                        // Reset the this_thread now that we know
+                        // the state isn't active anymore
+                        kernelTLS().this_thread = 0p;
                         // The thread was preempted, reschedule it and reset the flag
                         schedule_thread$( thrd_dst, UNPARK_LOCAL );
 …
                 if(unlikely(thrd_dst->state == Halting)) {
+                        // Reset the this_thread now that we know
+                        // the state isn't active anymore
+                        kernelTLS().this_thread = 0p;
                         // The thread has halted, it should never be scheduled/run again
                         // finish the thread
 …
                 /* paranoid */ verify( thrd_dst->state == Active );
                 thrd_dst->state = Blocked;
+                // Reset the this_thread now that we know
+                // the state isn't active anymore
+                kernelTLS().this_thread = 0p;
                 // set state of processor coroutine to active and the thread to inactive

libcfa/src/concurrency/kernel.hfa

-              r428adbc
+              r7f6a7c9
 // I/O
 struct cluster;
 struct $io_context;
 struct $io_arbiter;
+struct io_context$;
+struct io_arbiter$;
 struct io_context_params {
 …
         struct {
                 $io_context * ctx;
+                io_context$ * ctx;
                 unsigned target;
                 volatile bool pending;
 …
                 struct {
                         // Array of $io_
                         $io_context ** data;
+                        io_context$ ** data;
                         // Time since subqueues were processed
 …
         struct {
                 $io_arbiter * arbiter;
+                io_arbiter$ * arbiter;
                 io_context_params params;
         } io;

libcfa/src/concurrency/kernel/cluster.cfa

-              r428adbc
+              r7f6a7c9
 //=======================================================================
 void  ?{}(__scheduler_RWLock_t & this) {
         this.max   = __max_processors();
         this.alloc = 0;
         this.ready = 0;
         this.data  = alloc(this.max);
         this.write_lock  = false;
         /*paranoid*/ verify(__atomic_is_lock_free(sizeof(this.alloc), &this.alloc));
         /*paranoid*/ verify(__atomic_is_lock_free(sizeof(this.ready), &this.ready));
+        this.lock.max   = __max_processors();
+        this.lock.alloc = 0;
+        this.lock.ready = 0;
+        this.lock.data  = alloc(this.lock.max);
+        this.lock.write_lock  = false;
+        /*paranoid*/ verify(__atomic_is_lock_free(sizeof(this.lock.alloc), &this.lock.alloc));
+        /*paranoid*/ verify(__atomic_is_lock_free(sizeof(this.lock.ready), &this.lock.ready));
+}
 void ^?{}(__scheduler_RWLock_t & this) {
         free(this.data);
+        free(this.lock.data);
+}
 …
 //=======================================================================
 // Lock-Free registering/unregistering of threads
 unsigned register_proc_id( void ) with(*__scheduler_lock) {
+unsigned register_proc_id( void ) with(__scheduler_lock.lock) {
         __kernel_rseq_register();
 …
+        }
         if(max <= alloc) abort("Trying to create more than %ud processors", __scheduler_lock->max);
+        if(max <= alloc) abort("Trying to create more than %ud processors", __scheduler_lock.lock.max);
         // Step - 2 : F&A to get a new spot in the array.
         uint_fast32_t n = __atomic_fetch_add(&alloc, 1, __ATOMIC_SEQ_CST);
         if(max <= n) abort("Trying to create more than %ud processors", __scheduler_lock->max);
+        if(max <= n) abort("Trying to create more than %ud processors", __scheduler_lock.lock.max);
         // Step - 3 : Mark space as used and then publish it.
 …
+}
 void unregister_proc_id( unsigned id ) with(*__scheduler_lock) {
+void unregister_proc_id( unsigned id ) with(__scheduler_lock.lock) {
         /* paranoid */ verify(id < ready);
         /* paranoid */ verify(id == kernelTLS().sched_id);
 …
 // Writer side : acquire when changing the ready queue, e.g. adding more
 //  queues or removing them.
 uint_fast32_t ready_mutate_lock( void ) with(*__scheduler_lock) {
+uint_fast32_t ready_mutate_lock( void ) with(__scheduler_lock.lock) {
         /* paranoid */ verify( ! __preemption_enabled() );
 …
+}
 void ready_mutate_unlock( uint_fast32_t last_s ) with(*__scheduler_lock) {
+void ready_mutate_unlock( uint_fast32_t last_s ) with(__scheduler_lock.lock) {
         /* paranoid */ verify( ! __preemption_enabled() );
 …
 #if defined(CFA_HAVE_LINUX_IO_URING_H)
         static void assign_io($io_context ** data, size_t count, dlist(processor) & list) {
+        static void assign_io(io_context$ ** data, size_t count, dlist(processor) & list) {
                 processor * it = &list`first;
                 while(it) {

libcfa/src/concurrency/kernel/cluster.hfa

r428adbc	r7f6a7c9
24	24	// Calc moving average based on existing average, before and current time.
25	25	static inline unsigned long long moving_average(unsigned long long currtsc, unsigned long long instsc, unsigned long long old_avg) {
26		~~/* paranoid */ verifyf( currtsc < 45000000000000000, "Suspiciously large current time: %'llu (%llx)\n", currtsc, currtsc );~~
27		~~/* paranoid */ verifyf( instsc < 45000000000000000, "Suspiciously large insert time: %'llu (%llx)\n", instsc, instsc );~~
28	26	/* paranoid */ verifyf( old_avg < 15000000000000, "Suspiciously large previous average: %'llu (%llx)\n", old_avg, old_avg );
29	27
…	…
65	63	}
66	64	}
67		return ~~(max + 2 * max) / 2~~;
	65	return 8 * max;
68	66	}
69	67

libcfa/src/concurrency/kernel/fwd.hfa

-              r428adbc
+              r7f6a7c9
 extern "C" {
         extern "Cforall" {
                 extern __attribute__((aligned(64))) thread_local struct KernelThreadData {
+                extern __attribute__((aligned(64))) __thread struct KernelThreadData {
                         struct thread$          * volatile this_thread;
                         struct processor        * volatile this_processor;
 …
                 // Similar to a binary semaphore with a 'one shot' semantic
                 // is expected to be discarded after each party calls its side
+                enum(struct thread$ *) { oneshot_ARMED = 0p, oneshot_FULFILLED = 1p };
                 struct oneshot {
                         // Internal state :
                         //     0p     : is initial state (wait will block)
                         //     1p     : fulfilled (wait won't block)
+                        // armed      : initial state, wait will block
+                        // fulfilled  : wait won't block
                         // any thread : a thread is currently waiting
                         struct thread$ * volatile ptr;
 …
                 static inline {
                         void  ?{}(oneshot & this) {
                                 this.ptr = 0p;
+                                this.ptr = oneshot_ARMED;
+                        }
 …
                                 for() {
                                         struct thread$ * expected = this.ptr;
                                         if(expected == 1p) return false;
+                                        if(expected == oneshot_FULFILLED) return false;
                                         if(__atomic_compare_exchange_n(&this.ptr, &expected, active_thread(), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
                                                 park();
                                                 /* paranoid */ verify( this.ptr == 1p );
+                                                /* paranoid */ verify( this.ptr == oneshot_FULFILLED );
                                                 return true;
+                                        }
 …
                         // return true if a thread was unparked
                         thread$ * post(oneshot & this, bool do_unpark = true) {
                                 struct thread$ * got = __atomic_exchange_n( &this.ptr, 1p, __ATOMIC_SEQ_CST);
                                 if( got == 0p || got == 1p ) return 0p;
+                                struct thread$ * got = __atomic_exchange_n( &this.ptr, oneshot_FULFILLED, __ATOMIC_SEQ_CST);
+                                if( got == oneshot_ARMED || got == oneshot_FULFILLED ) return 0p;
                                 if(do_unpark) unpark( got );
                                 return got;
 …
                 // thread on "any of" [a given set of] futures.
                 // does not support multiple threads waiting on the same future
+                enum(struct oneshot *) { future_ARMED = 0p, future_FULFILLED = 1p, future_PROGRESS = 2p, future_ABANDONED = 3p };
                 struct future_t {
                         // Internal state :
                         //     0p      : is initial state (wait will block)
                         //     1p      : fulfilled (wait won't block)
                         //     2p      : in progress ()
                         //     3p      : abandoned, server should delete
+                        // armed       : initial state, wait will block
+                        // fulfilled   : result is ready, wait won't block
+                        // progress    : someone else is in the process of fulfilling this
+                        // abandoned   : client no longer cares, server should delete
                         // any oneshot : a context has been setup to wait, a thread could wait on it
                         struct oneshot * volatile ptr;
 …
                 static inline {
                         void  ?{}(future_t & this) {
                                 this.ptr = 0p;
+                                this.ptr = future_ARMED;
+                        }
 …
                         void reset(future_t & this) {
                                 // needs to be in 0p or 1p
                                 __atomic_exchange_n( &this.ptr, 0p, __ATOMIC_SEQ_CST);
+                                __atomic_exchange_n( &this.ptr, future_ARMED, __ATOMIC_SEQ_CST);
+                        }
                         // check if the future is available
                         bool available( future_t & this ) {
                                 while( this.ptr == 2p ) Pause();
                                 return this.ptr == 1p;
+                                while( this.ptr == future_PROGRESS ) Pause();
+                                return this.ptr == future_FULFILLED;
+                        }
 …
                         // intended to be used by wait, wait_any, waitfor, etc. rather than used directly
                         bool setup( future_t & this, oneshot & wait_ctx ) {
                                 /* paranoid */ verify( wait_ctx.ptr == 0p || wait_ctx.ptr == 1p );
+                                /* paranoid */ verify( wait_ctx.ptr == oneshot_ARMED || wait_ctx.ptr == oneshot_FULFILLED );
                                 // The future needs to set the wait context
                                 for() {
                                         struct oneshot * expected = this.ptr;
                                         // Is the future already fulfilled?
                                         if(expected == 1p) return false; // Yes, just return false (didn't block)
+                                        if(expected == future_FULFILLED) return false; // Yes, just return false (didn't block)
                                         // The future is not fulfilled, try to setup the wait context
 …
                                 // attempt to remove the context so it doesn't get consumed.
                                 if(__atomic_compare_exchange_n( &this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+                                if(__atomic_compare_exchange_n( &this.ptr, &expected, future_ARMED, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
                                         // we still have the original context, then no one else saw it
                                         return false;
+                                }
                                 // expected == 0p: future was never actually setup, just return
                                 if( expected == 0p ) return false;
                                 // expected == 1p: the future is ready and the context was fully consumed
+                                // expected == ARMED: future was never actually setup, just return
+                                if( expected == future_ARMED ) return false;
+                                // expected == FULFILLED: the future is ready and the context was fully consumed
                                 // the server won't use the pointer again
                                 // It is safe to delete (which could happen after the return)
                                 if( expected == 1p ) return true;
                                 // expected == 2p: the future is ready but the context hasn't fully been consumed
+                                if( expected == future_FULFILLED ) return true;
+                                // expected == PROGRESS: the future is ready but the context hasn't fully been consumed
                                 // spin until it is safe to move on
                                 if( expected == 2p ) {
                                         while( this.ptr != 1p ) Pause();
                                         /* paranoid */ verify( this.ptr == 1p );
+                                if( expected == future_PROGRESS ) {
+                                        while( this.ptr != future_FULFILLED ) Pause();
+                                        /* paranoid */ verify( this.ptr == future_FULFILLED );
                                         return true;
+                                }
 …
                         // Mark the future as abandoned, meaning it will be deleted by the server
                         bool abandon( future_t & this ) {
                                 /* paranoid */ verify( this.ptr != 3p );
+                                /* paranoid */ verify( this.ptr != future_ABANDONED );
                                 // Mark the future as abandoned
                                 struct oneshot * got = __atomic_exchange_n( &this.ptr, 3p, __ATOMIC_SEQ_CST);
+                                struct oneshot * got = __atomic_exchange_n( &this.ptr, future_ABANDONED, __ATOMIC_SEQ_CST);
                                 // If the future isn't already fulfilled, let the server delete it
                                 if( got == 0p ) return false;
                                 // got == 2p: the future is ready but the context hasn't fully been consumed
+                                if( got == future_ARMED ) return false;
+                                // got == PROGRESS: the future is ready but the context hasn't fully been consumed
                                 // spin until it is safe to move on
                                 if( got == 2p ) {
                                         while( this.ptr != 1p ) Pause();
                                         got = 1p;
+                                if( got == future_PROGRESS ) {
+                                        while( this.ptr != future_FULFILLED ) Pause();
+                                        got = future_FULFILLED;
+                                }
                                 // The future is completed delete it now
                                 /* paranoid */ verify( this.ptr != 1p );
+                                /* paranoid */ verify( this.ptr != future_FULFILLED );
                                 free( &this );
                                 return true;
 …
                                                 #pragma GCC diagnostic ignored "-Wfree-nonheap-object"
                                         #endif
                                                 if( expected == 3p ) { free( &this ); return 0p; }
+                                                if( expected == future_ABANDONED ) { free( &this ); return 0p; }
                                         #if defined(__GNUC__) && __GNUC__ >= 7
                                                 #pragma GCC diagnostic pop
                                         #endif
                                         /* paranoid */ verify( expected != 1p ); // Future is already fulfilled, should not happen
                                         /* paranoid */ verify( expected != 2p ); // Future is being fulfilled by someone else, this is even less supported than the previous case.
+                                        /* paranoid */ verify( expected != future_FULFILLED ); // Future is already fulfilled, should not happen
+                                        /* paranoid */ verify( expected != future_PROGRESS ); // Future is being fulfilled by someone else, this is even less supported than the previous case.
                                         // If there is a wait context, we need to consume it and mark it as consumed after
                                         // If there is no context then we can skip the in progress phase
                                         struct oneshot * want = expected == 0p ? 1p : 2p;
+                                        struct oneshot * want = expected == future_ARMED ? future_FULFILLED : future_PROGRESS;
                                         if(__atomic_compare_exchange_n(&this.ptr, &expected, want, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
                                                 if( expected == 0p ) { return 0p; }
+                                                if( expected == future_ARMED ) { return 0p; }
                                                 thread$ * ret = post( *expected, do_unpark );
                                                 __atomic_store_n( &this.ptr, 1p, __ATOMIC_SEQ_CST);
+                                                __atomic_store_n( &this.ptr, future_FULFILLED, __ATOMIC_SEQ_CST);
                                                 return ret;
+                                        }
 …
                                 // Wait until the future is no longer in progress
                                 while( this.ptr == 2p ) Pause();
+                                while( this.ptr == future_PROGRESS ) Pause();
                                 // Make sure the state makes sense
                                 // Should be fulfilled, could be in progress but it's out of date if so
 …
                                 // and the oneshot should not be needed any more
                                 __attribute__((unused)) struct oneshot * was = this.ptr;
                                 /* paranoid */ verifyf( was == 1p, "Expected this.ptr to be 1p, was %p\n", was );
+                                /* paranoid */ verifyf( was == future_FULFILLED, "Expected this.ptr to be 1p, was %p\n", was );
                                 // Mark the future as fulfilled, to be consistent

libcfa/src/concurrency/kernel/private.hfa

-              r428adbc
+              r7f6a7c9
 #elif defined(CFA_HAVE_LINUX_RSEQ_H)
         extern "Cforall" {
                 extern __attribute__((aligned(64))) thread_local volatile struct rseq __cfaabi_rseq;
+                extern __attribute__((aligned(64))) __thread volatile struct rseq __cfaabi_rseq;
+        }
 #else
 …
 //-----------------------------------------------------------------------------
 // I/O
 $io_arbiter * create(void);
 void destroy($io_arbiter *);
+io_arbiter$ * create(void);
+void destroy(io_arbiter$ *);
 //=======================================================================
 …
 // Blocking acquire
 static inline void __atomic_acquire(volatile bool * ll) {
+        /* paranoid */ verify( ! __preemption_enabled() );
+        /* paranoid */ verify(ll);
         while( __builtin_expect(__atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST), false) ) {
                 while(__atomic_load_n(ll, (int)__ATOMIC_RELAXED))
 …
+        }
         /* paranoid */ verify(*ll);
+        /* paranoid */ verify( ! __preemption_enabled() );
+}
 // Non-blocking acquire: makes a single attempt to take the lock flag and never spins
 static inline bool __atomic_try_acquire(volatile bool * ll) {
+        /* paranoid */ verify( ! __preemption_enabled() ); // checks that preemption is disabled on entry
+        /* paranoid */ verify(ll); // checks that the lock pointer is non-null
         return !__atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST); // sets the flag; true iff it was previously false, i.e. this call took the lock
+}
 …
 // Release
 static inline void __atomic_unlock(volatile bool * ll) {
+        /* paranoid */ verify( ! __preemption_enabled() );
+        /* paranoid */ verify(ll);
         /* paranoid */ verify(*ll);
         __atomic_store_n(ll, (bool)false, __ATOMIC_RELEASE);
 …
 // have been hard-coded for the ready-queue for
 // simplicity and performance
+struct __scheduler_RWLock_t {
+        // total cachelines allocated
+        unsigned int max;
+        // cachelines currently in use
+        volatile unsigned int alloc;
+        // cachelines ready to iterate over
+        // (!= to alloc when thread is in second half of doregister)
+        volatile unsigned int ready;
+        // writer lock
+        volatile bool write_lock;
+        // data pointer
+        volatile bool * volatile * data;
+union __attribute__((aligned(64))) __scheduler_RWLock_t {
+        struct {
+                __attribute__((aligned(64))) char padding;
+                // total cachelines allocated
+                __attribute__((aligned(64))) unsigned int max;
+                // cachelines currently in use
+                volatile unsigned int alloc;
+                // cachelines ready to iterate over
+                // (!= to alloc when thread is in second half of doregister)
+                volatile unsigned int ready;
+                // writer lock
+                volatile bool write_lock;
+                // data pointer
+                volatile bool * volatile * data;
+        } lock;
+        char pad[192];
 };
 …
 void ^?{}(__scheduler_RWLock_t & this);
 extern __scheduler_RWLock_t * __scheduler_lock;
+extern __scheduler_RWLock_t __scheduler_lock;
 //-----------------------------------------------------------------------
 // Reader side : acquire when using the ready queue to schedule but not
 //  creating/destroying queues
 static inline void ready_schedule_lock(void) with(*__scheduler_lock) {
+static inline void ready_schedule_lock(void) with(__scheduler_lock.lock) {
         /* paranoid */ verify( ! __preemption_enabled() );
         /* paranoid */ verify( ! kernelTLS().in_sched_lock );
 …
+}
 static inline void ready_schedule_unlock(void) with(*__scheduler_lock) {
+static inline void ready_schedule_unlock(void) with(__scheduler_lock.lock) {
         /* paranoid */ verify( ! __preemption_enabled() );
         /* paranoid */ verify( data[kernelTLS().sched_id] == &kernelTLS().sched_lock );
 …
         static inline bool ready_mutate_islocked() {
                 return __scheduler_lock->write_lock;
+                return __scheduler_lock.lock.write_lock;
+        }
 #endif

libcfa/src/concurrency/kernel/startup.cfa

-              r428adbc
+              r7f6a7c9
 KERNEL_STORAGE(thread$,              mainThread);
 KERNEL_STORAGE(__stack_t,            mainThreadCtx);
 KERNEL_STORAGE(__scheduler_RWLock_t, __scheduler_lock);
+// KERNEL_STORAGE(__scheduler_RWLock_t, __scheduler_lock);
 KERNEL_STORAGE(eventfd_t,            mainIdleEventFd);
 KERNEL_STORAGE(io_future_t,          mainIdleFuture);
 …
 processor            * mainProcessor;
 thread$              * mainThread;
-__scheduler_RWLock_t * __scheduler_lock;
 extern "C" {
 …
 //-----------------------------------------------------------------------------
 // Global state
 thread_local struct KernelThreadData __cfaabi_tls __attribute__ ((tls_model ( "initial-exec" ))) @= {
+__thread struct KernelThreadData __cfaabi_tls __attribute__ ((tls_model ( "initial-exec" ))) @= {
         NULL,                                                                                           // cannot use 0p
         NULL,
 …
 };
+__scheduler_RWLock_t __scheduler_lock @= { 0 };
 #if   defined(CFA_HAVE_LINUX_LIBRSEQ)
         // No data needed
 #elif defined(CFA_HAVE_LINUX_RSEQ_H)
         extern "Cforall" {
                 __attribute__((aligned(64))) thread_local volatile struct rseq __cfaabi_rseq @= {
+                __attribute__((aligned(64))) __thread volatile struct rseq __cfaabi_rseq @= {
                         .cpu_id : RSEQ_CPU_ID_UNINITIALIZED,
                 };
 …
         // Initialize the global scheduler lock
         __scheduler_lock = (__scheduler_RWLock_t*)&storage___scheduler_lock;
         (*__scheduler_lock){};
+        // __scheduler_lock = (__scheduler_RWLock_t*)&storage___scheduler_lock;
+        (__scheduler_lock){};
         // Initialize the main cluster
 …
         ^(*mainCluster){};
         ^(*__scheduler_lock){};
+        ^(__scheduler_lock){};
         ^(__cfa_dbg_global_clusters.list){};

libcfa/src/concurrency/monitor.hfa

-              r428adbc
+              r7f6a7c9
 void ^?{}( monitor_dtor_guard_t & this );
+/*
 static inline forall( T & | sized(T) | { void ^?{}( T & mutex ); } )
 void delete( T * th ) {
 …
         free( th );
+}
+*/
 static inline forall( T & | sized(T) | { void ^?{}( T & mutex ); } )

libcfa/src/concurrency/preemption.cfa

-              r428adbc
+              r7f6a7c9
 //----------
 // special case for preemption since used often
 __attribute__((optimize("no-reorder-blocks"))) bool __preemption_enabled() libcfa_public {
+__attribute__((optimize("no-reorder-blocks"))) bool __preemption_enabled() libcfa_nopreempt libcfa_public {
         // create an assembler label before
         // marked as clobber all to avoid movement
 …
+}
+extern "C" {
+        __attribute__((visibility("hidden"))) extern void * const __start_cfatext_nopreempt;
+        __attribute__((visibility("hidden"))) extern void * const __stop_cfatext_nopreempt;
+        extern const __cfa_nopreempt_region __libcfa_nopreempt;
+        __attribute__((visibility("protected"))) const __cfa_nopreempt_region __libcfathrd_nopreempt @= {
+                (void * const)&__start_cfatext_nopreempt,
+                (void * const)&__stop_cfatext_nopreempt
+        };
+}
+static inline bool __cfaabi_in( void * const ip, const struct __cfa_nopreempt_region & const region ) { // tests whether address ip lies inside the given no-preempt region
+        return ip >= region.start && ip <= region.stop; // both bounds are inclusive: region.start <= ip <= region.stop
+}
 //----------
 // Get data from the TLS block
 // struct asm_region __cfaasm_get;
 uintptr_t __cfatls_get( unsigned long int offset ) __attribute__((__noinline__, visibility("default"))); //no inline to avoid problems
+uintptr_t __cfatls_get( unsigned long int offset ) libcfa_nopreempt libcfa_public; //no inline to avoid problems
 uintptr_t __cfatls_get( unsigned long int offset ) {
         // create an assembler label before
 …
 extern "C" {
         // Disable interrupts by incrementing the counter
         __attribute__((__noinline__, visibility("default"))) void disable_interrupts() libcfa_public {
+        void disable_interrupts() libcfa_nopreempt libcfa_public {
                 // create an assembler label before
                 // marked as clobber all to avoid movement
 …
         // Enable interrupts by decrementing the counter
         // If counter reaches 0, execute any pending __cfactx_switch
         void enable_interrupts( bool poll ) libcfa_public {
+        void enable_interrupts( bool poll ) libcfa_nopreempt libcfa_public {
                 // Cache the processor now since interrupts can start happening after the atomic store
                 processor   * proc = __cfaabi_tls.this_processor;
 …
+                }
+        }
+        // Check whether or not there is a pending preemption and,
+        // if so, clear the flag and force_yield( __POLL_PREEMPTION ).
+        // Returns true if the thread was in an interruptible state,
+        // i.e. on a real processor and not in the kernel
+        // (can return true even if no preemption was pending)
+        bool poll_interrupts() libcfa_public {
+                // Fetch the current processor once; without one there is nothing to poll
+                processor   * proc = publicTLS_get( this_processor );
+                if ( ! proc ) return false; // no processor: report "not interruptible"
+                if ( ! __preemption_enabled() ) return false; // preemption disabled: report "not interruptible"
+                with( __cfaabi_tls.preemption_state ){
+                        // Signal the compiler that a fence is needed but only for signal handlers
+                        __atomic_signal_fence(__ATOMIC_RELEASE);
+                        if( proc->pending_preemption ) {
+                                proc->pending_preemption = false; // clear the flag before yielding
+                                force_yield( __POLL_PREEMPTION );
+                        }
+                }
+                return true; // interruptible state, whether or not a yield happened
+        }
+}
 …
 //-----------------------------------------------------------------------------
-// Some assembly required
-#if defined( __i386 )
-        #ifdef __PIC__
-                #define RELOC_PRELUDE( label ) \
-                        "calll   .Lcfaasm_prelude_" #label "$pb\n\t" \
-                        ".Lcfaasm_prelude_" #label "$pb:\n\t" \
-                        "popl    %%eax\n\t" \
-                        ".Lcfaasm_prelude_" #label "_end:\n\t" \
-                        "addl    $_GLOBAL_OFFSET_TABLE_+(.Lcfaasm_prelude_" #label "_end-.Lcfaasm_prelude_" #label "$pb), %%eax\n\t"
-                #define RELOC_PREFIX ""
-                #define RELOC_SUFFIX "@GOT(%%eax)"
-        #else
-                #define RELOC_PREFIX "$"
-                #define RELOC_SUFFIX ""
-        #endif
-        #define __cfaasm_label( label ) struct asm_region label = \
-                ({ \
-                        struct asm_region region; \
-                        asm( \
-                                RELOC_PRELUDE( label ) \
-                                "movl " RELOC_PREFIX "__cfaasm_" #label "_before" RELOC_SUFFIX ", %[vb]\n\t" \
-                                "movl " RELOC_PREFIX "__cfaasm_" #label "_after"  RELOC_SUFFIX ", %[va]\n\t" \
-                                 : [vb]"=r"(region.before), [va]"=r"(region.after) \
-                        ); \
-                        region; \
-                });
-#elif defined( __x86_64 )
-        #ifdef __PIC__
-                #define RELOC_PREFIX ""
-                #define RELOC_SUFFIX "@GOTPCREL(%%rip)"
-        #else
-                #define RELOC_PREFIX "$"
-                #define RELOC_SUFFIX ""
-        #endif
-        #define __cfaasm_label( label ) struct asm_region label = \
-                ({ \
-                        struct asm_region region; \
-                        asm( \
-                                "movq " RELOC_PREFIX "__cfaasm_" #label "_before" RELOC_SUFFIX ", %[vb]\n\t" \
-                                "movq " RELOC_PREFIX "__cfaasm_" #label "_after"  RELOC_SUFFIX ", %[va]\n\t" \
-                                 : [vb]"=r"(region.before), [va]"=r"(region.after) \
-                        ); \
-                        region; \
-                });
-#elif defined( __aarch64__ )
-        #ifdef __PIC__
-                // Note that this works only for gcc
-                #define __cfaasm_label( label ) struct asm_region label = \
-                ({ \
-                        struct asm_region region; \
-                        asm( \
-                                "adrp %[vb], _GLOBAL_OFFSET_TABLE_"                              "\n\t" \
-                                "ldr  %[vb], [%[vb], #:gotpage_lo15:__cfaasm_" #label "_before]" "\n\t" \
-                                "adrp %[va], _GLOBAL_OFFSET_TABLE_"                              "\n\t" \
-                                "ldr  %[va], [%[va], #:gotpage_lo15:__cfaasm_" #label "_after]"  "\n\t" \
-                                 : [vb]"=r"(region.before), [va]"=r"(region.after) \
-                        ); \
-                        region; \
-                });
-        #else
-                #error this is not the right thing to do
-                /*
-                #define __cfaasm_label( label ) struct asm_region label = \
-                ({ \
-                        struct asm_region region; \
-                        asm( \
-                                "adrp %[vb], __cfaasm_" #label "_before"              "\n\t" \
-                                "add  %[vb], %[vb], :lo12:__cfaasm_" #label "_before" "\n\t" \
-                                "adrp %[va], :got:__cfaasm_" #label "_after"          "\n\t" \
-                                "add  %[va], %[va], :lo12:__cfaasm_" #label "_after"  "\n\t" \
-                                 : [vb]"=r"(region.before), [va]"=r"(region.after) \
-                        ); \
-                        region; \
-                });
-                */
-        #endif
-#else
-        #error unknown hardware architecture
-#endif
 // KERNEL ONLY
 // Check if a __cfactx_switch signal handler should defer
 …
 // If false : preemption is unsafe and marked as pending
 static inline bool preemption_ready( void * ip ) {
-        // Get all the region for which it is not safe to preempt
-        __cfaasm_label( get    );
-        __cfaasm_label( check  );
-        __cfaasm_label( dsable );
-        // __cfaasm_label( debug  );
         // Check if preemption is safe
         bool ready = true;
+        if( __cfaasm_in( ip, get    ) ) { ready = false; goto EXIT; };
+        if( __cfaasm_in( ip, check  ) ) { ready = false; goto EXIT; };
+        if( __cfaasm_in( ip, dsable ) ) { ready = false; goto EXIT; };
+        // if( __cfaasm_in( ip, debug  ) ) { ready = false; goto EXIT; };
+        if( __cfaabi_in( ip, __libcfa_nopreempt ) ) { ready = false; goto EXIT; };
+        if( __cfaabi_in( ip, __libcfathrd_nopreempt ) ) { ready = false; goto EXIT; };
         if( !__cfaabi_tls.preemption_state.enabled) { ready = false; goto EXIT; };
         if( __cfaabi_tls.preemption_state.in_progress ) { ready = false; goto EXIT; };
 …
 // Kernel Signal Handlers
 //=============================================================================================
 __cfaabi_dbg_debug_do( static thread_local void * last_interrupt = 0; )
+__cfaabi_dbg_debug_do( static __thread void * last_interrupt = 0; )
 // Context switch signal handler

Context Navigation

Legend:

libcfa/src/concurrency/io.cfa

libcfa/src/concurrency/io/call.cfa.in

libcfa/src/concurrency/io/setup.cfa

libcfa/src/concurrency/io/types.hfa

libcfa/src/concurrency/iofwd.hfa

libcfa/src/concurrency/kernel.cfa

libcfa/src/concurrency/kernel.hfa

libcfa/src/concurrency/kernel/cluster.cfa

libcfa/src/concurrency/kernel/cluster.hfa

libcfa/src/concurrency/kernel/fwd.hfa

libcfa/src/concurrency/kernel/private.hfa

libcfa/src/concurrency/kernel/startup.cfa

libcfa/src/concurrency/monitor.hfa

libcfa/src/concurrency/preemption.cfa

Download in other formats: