//
// Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo
//
// The contents of this file are covered under the licence agreement in the
// file "LICENCE" distributed with Cforall.
//
// kernel_private.hfa --
//
// Author           : Thierry Delisle
// Created On       : Mon Feb 13 12:27:26 2017
// Last Modified By : Peter A. Buhr
// Last Modified On : Wed Aug 12 08:21:33 2020
// Update Count     : 9
//

#pragma once

#include "kernel.hfa"
#include "thread.hfa"

#include "alarm.hfa"
#include "stats.hfa"

//-----------------------------------------------------------------------------
// Scheduler
struct __attribute__((aligned(128))) __scheduler_lock_id_t;

extern "C" {
	void disable_interrupts() OPTIONAL_THREAD;
	void enable_interrupts_noPoll();
	void enable_interrupts( __cfaabi_dbg_ctx_param );
}

void __schedule_thread( struct __processor_id_t *, $thread * )
#if defined(NDEBUG) || (!defined(__CFA_DEBUG__) && !defined(__CFA_VERIFY__))
	__attribute__((nonnull (2)))
#endif
;

// Block the current thread and release/wake-up the following resources
void __leave_thread() __attribute__((noreturn));

//-----------------------------------------------------------------------------
// Processor
void main(processorCtx_t *);

void * __create_pthread( pthread_t *, void * (*)(void *), void * );

extern cluster * mainCluster;

//-----------------------------------------------------------------------------
// Threads
extern "C" {
	void __cfactx_invoke_thread(void (*main)(void *), void * this);
}

__cfaabi_dbg_debug_do(
	extern void __cfaabi_dbg_thread_register  ( $thread * thrd );
	extern void __cfaabi_dbg_thread_unregister( $thread * thrd );
)

// KERNEL ONLY unpark without disabling interrupts
void __unpark( struct __processor_id_t *, $thread * thrd __cfaabi_dbg_ctx_param2 );

static inline bool __post(single_sem & this, struct __processor_id_t * id) {
	for() {
		struct $thread * expected = this.ptr;
		if(expected == 1p) return false;
		if(expected == 0p) {
			if(__atomic_compare_exchange_n(&this.ptr, &expected, 1p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
				return false;
			}
		}
		else {
			if(__atomic_compare_exchange_n(&this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
				__unpark( id, expected __cfaabi_dbg_ctx2 );
				return true;
			}
		}
	}
}
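// Illustrative sketch, not part of the original header: the state encoding
// assumed by __post above is 0p (no post, no waiter), 1p (posted, no waiter),
// and any other value (the single blocked waiter). A matching waiter-side
// routine could look roughly as follows; the name __sketch_wait and the calls
// to active_thread()/park() are assumptions for illustration only.
//
//	static inline bool __sketch_wait(single_sem & this) {
//		for() {
//			struct $thread * expected = this.ptr;
//			// a pending post is consumed without blocking
//			if(expected == 1p) {
//				if(__atomic_compare_exchange_n(&this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) return false;
//			}
//			// otherwise publish this thread as the single waiter and block
//			// (a second concurrent waiter is not supported by a single_sem)
//			else if(expected == 0p) {
//				if(__atomic_compare_exchange_n(&this.ptr, &expected, active_thread(), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
//					park( __cfaabi_dbg_ctx );
//					return true;
//				}
//			}
//		}
//	}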
//-----------------------------------------------------------------------------
// Utils
void doregister( struct cluster * cltr, struct $thread & thrd );
void unregister( struct cluster * cltr, struct $thread & thrd );

//-----------------------------------------------------------------------------
// I/O
void ^?{}(io_context & this, bool );

//=======================================================================
// Cluster lock API
//=======================================================================
// Cells used by the reader-writer lock
// while not generic, it only relies on an opaque pointer
struct __attribute__((aligned(128))) __scheduler_lock_id_t {
	// Spin lock used as the underlying lock
	volatile bool lock;

	// Handle pointing to the proc owning this cell
	// Used for allocating cells and debugging
	__processor_id_t * volatile handle;

	#ifdef __CFA_WITH_VERIFY__
		// Debug, check if this is owned for reading
		bool owned;
	#endif
};

static_assert( sizeof(struct __scheduler_lock_id_t) <= __alignof(struct __scheduler_lock_id_t));

// Lock-Free registering/unregistering of processors
// Register a processor to a given cluster and get its unique id in return
unsigned doregister( struct __processor_id_t * proc );

// Unregister a processor from a given cluster using its id, getting back the original pointer
void unregister( struct __processor_id_t * proc );

//-----------------------------------------------------------------------
// Cluster idle lock/unlock
static inline void lock(__cluster_idles & this) {
	for() {
		uint64_t l = this.lock;
		if( (0 == (l % 2)) && __atomic_compare_exchange_n(&this.lock, &l, l + 1, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST) )
			return;
		Pause();
	}
}

static inline void unlock(__cluster_idles & this) {
	/* paranoid */ verify( 1 == (this.lock % 2) );
	__atomic_fetch_add( &this.lock, 1, __ATOMIC_SEQ_CST );
}

//=======================================================================
// Reader-writer lock implementation
// Concurrent with doregister/unregister,
//    i.e., threads can be added at any point during or between the entry/exit

//-----------------------------------------------------------------------
// simple spinlock underlying the RWLock
// Blocking acquire
static inline void __atomic_acquire(volatile bool * ll) {
	while( __builtin_expect(__atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST), false) ) {
		while(__atomic_load_n(ll, (int)__ATOMIC_RELAXED))
			Pause();
	}
	/* paranoid */ verify(*ll);
}

// Non-Blocking acquire
static inline bool __atomic_try_acquire(volatile bool * ll) {
	return !__atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST);
}

// Release
static inline void __atomic_unlock(volatile bool * ll) {
	/* paranoid */ verify(*ll);
	__atomic_store_n(ll, (bool)false, __ATOMIC_RELEASE);
}
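// Illustrative sketch, not part of the original header: intended pairing of
// the three spinlock primitives above on a 'volatile bool' flag. The variable
// name 'll' is hypothetical.
//
//	static volatile bool ll = false;
//
//	__atomic_acquire( &ll );              // blocking acquire, spins until the flag is owned
//	/* ... critical section ... */
//	__atomic_unlock( &ll );               // release
//
//	if( __atomic_try_acquire( &ll ) ) {   // non-blocking attempt, returns true on success
//		/* ... critical section ... */
//		__atomic_unlock( &ll );
//	}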
//-----------------------------------------------------------------------
// Reader-Writer lock protecting the ready-queues
// while this lock is mostly generic, some aspects
// have been hard-coded for the ready-queue for
// simplicity and performance
struct __scheduler_RWLock_t {
	// total cachelines allocated
	unsigned int max;

	// cachelines currently in use
	volatile unsigned int alloc;

	// cachelines ready to iterate over
	// (!= to alloc when thread is in second half of doregister)
	volatile unsigned int ready;

	// writer lock
	volatile bool lock;

	// data pointer
	__scheduler_lock_id_t * data;
};

void  ?{}(__scheduler_RWLock_t & this);
void ^?{}(__scheduler_RWLock_t & this);

extern __scheduler_RWLock_t * __scheduler_lock;

//-----------------------------------------------------------------------
// Reader side : acquire when using the ready queue to schedule but not
//  creating/destroying queues
static inline void ready_schedule_lock( struct __processor_id_t * proc) with(*__scheduler_lock) {
	unsigned iproc = proc->id;
	/*paranoid*/ verify(data[iproc].handle == proc);
	/*paranoid*/ verify(iproc < ready);

	// Step 1 : make sure no writer is in the middle of the critical section
	while(__atomic_load_n(&lock, (int)__ATOMIC_RELAXED))
		Pause();

	// Fence needed because we don't want to start trying to acquire the lock
	// before we read a false.
	// Not needed on x86
	// std::atomic_thread_fence(std::memory_order_seq_cst);

	// Step 2 : acquire our local lock
	__atomic_acquire( &data[iproc].lock );
	/*paranoid*/ verify(data[iproc].lock);

	#ifdef __CFA_WITH_VERIFY__
		// Debug, check if this is owned for reading
		data[iproc].owned = true;
	#endif
}

static inline void ready_schedule_unlock( struct __processor_id_t * proc) with(*__scheduler_lock) {
	unsigned iproc = proc->id;
	/*paranoid*/ verify(data[iproc].handle == proc);
	/*paranoid*/ verify(iproc < ready);
	/*paranoid*/ verify(data[iproc].lock);
	/*paranoid*/ verify(data[iproc].owned);
	#ifdef __CFA_WITH_VERIFY__
		// Debug, check if this is owned for reading
		data[iproc].owned = false;
	#endif
	__atomic_unlock(&data[iproc].lock);
}

#ifdef __CFA_WITH_VERIFY__
	static inline bool ready_schedule_islocked( struct __processor_id_t * proc) {
		return __scheduler_lock->data[proc->id].owned;
	}

	static inline bool ready_mutate_islocked() {
		return __scheduler_lock->lock;
	}
#endif

//-----------------------------------------------------------------------
// Writer side : acquire when changing the ready queue, e.g. adding more
//  queues or removing them.
uint_fast32_t ready_mutate_lock( void );

void ready_mutate_unlock( uint_fast32_t /* value returned by lock */ );

//=======================================================================
// Ready-Queue API

//-----------------------------------------------------------------------
// query the ready queue of a cluster
// returns true if the queue is non-empty, false otherwise
__attribute__((hot)) bool query(struct cluster * cltr);

//-----------------------------------------------------------------------
// push thread onto a ready queue for a cluster
// returns true if the list was previously empty, false otherwise
__attribute__((hot)) bool push(struct cluster * cltr, struct $thread * thrd);

//-----------------------------------------------------------------------
// pop thread from the ready queue of a cluster
// returns 0p if empty
// May return 0p spuriously
__attribute__((hot)) struct $thread * pop(struct cluster * cltr);

//-----------------------------------------------------------------------
// pop thread from the ready queue of a cluster
// returns 0p if empty
// guaranteed to find any threads added before this call
__attribute__((hot)) struct $thread * pop_slow(struct cluster * cltr);

//-----------------------------------------------------------------------
// remove thread from the ready queue of a cluster
// returns false if the thread was not found
bool remove_head(struct cluster * cltr, struct $thread * thrd);

//-----------------------------------------------------------------------
// Increase the width of the ready queue (number of lanes) by 4
void ready_queue_grow  (struct cluster * cltr, int target);

//-----------------------------------------------------------------------
// Decrease the width of the ready queue (number of lanes) by 4
void ready_queue_shrink(struct cluster * cltr, int target);

// Local Variables: //
// mode: c //
// tab-width: 4 //
// End: //
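// Illustrative usage sketch, not part of the original header: how the reader
// and writer sides of the scheduler lock are intended to pair up with the
// ready-queue API above. The surrounding context and the names id, thrd,
// cltr, and target are hypothetical.
//
// Reader side, e.g. when scheduling a thread onto a cluster:
//	ready_schedule_lock  ( id );                       // shared access to the ready-queues
//	push( thrd->curr_cluster, thrd );
//	ready_schedule_unlock( id );
//
// Writer side, e.g. when changing the number of lanes:
//	uint_fast32_t last_size = ready_mutate_lock();     // exclusive access
//	ready_queue_grow( cltr, target );
//	ready_mutate_unlock( last_size );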