//
// Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo
//
// The contents of this file are covered under the licence agreement in the
// file "LICENCE" distributed with Cforall.
//
// kernel --
//
// Author           : Thierry Delisle
// Created On       : Tue Jan 17 12:27:26 2017
// Last Modified By : Peter A. Buhr
// Last Modified On : Tue Feb  4 12:29:26 2020
// Update Count     : 22
//

#pragma once

// NOTE(review): the original include targets were stripped during extraction;
// reconstructed from usage (bool, size_t/ssize_t, pthread_t, kernel semaphore) -- confirm against upstream.
#include <stdbool.h>
#include <stdint.h>

#include "invoke.h"
#include "time_t.hfa"
#include "coroutine.hfa"

extern "C" {
	#include <pthread.h>
	#include <semaphore.h>
}

//-----------------------------------------------------------------------------
// Locks

// Counting semaphore usable from user threads; protected by its own spinlock.
struct semaphore {
	__spinlock_t lock;
	int count;
	__queue_t($thread) waiting;
};

void  ?{}(semaphore & this, int count = 1);
void ^?{}(semaphore & this);
bool   P (semaphore & this);
bool   V (semaphore & this);
bool   V (semaphore & this, unsigned count);


//-----------------------------------------------------------------------------
// Processor
extern struct cluster * mainCluster;

// Coroutine wrapping the per-processor scheduling context
coroutine processorCtx_t {
	struct processor * proc;
};

// Wrapper around kernel threads
struct processor {
	// Main state
	// Coroutine ctx that keeps the state of the processor
	struct processorCtx_t runner;

	// Cluster from which to get threads
	struct cluster * cltr;
	unsigned int id;

	// Name of the processor
	const char * name;

	// Handle to pthreads
	pthread_t kernel_thread;

	// RunThread data
	// Action to do after a thread is ran
	$thread * destroyer;

	// Preemption data
	// Node which is added in the discrete event simulation
	struct alarm_node_t * preemption_alarm;

	// If true, a preemption was triggered in an unsafe region, the processor must preempt as soon as possible
	bool pending_preemption;

	// Idle lock (kernel semaphore)
	__bin_sem_t idle;

	// Termination
	// Set to true to notify the processor should terminate
	volatile bool do_terminate;

	// Termination synchronisation (user semaphore)
	semaphore terminated;

	// pthread Stack
	void * stack;

	// Link lists fields
	// Tag renamed from __dbg_node_cltr: that tag is also defined inside
	// struct cluster with different members, which conflicts since struct
	// tags escape to the enclosing scope.
	struct __dbg_node_proc {
		processor * next;
		processor * prev;
	} node;

#ifdef __CFA_DEBUG__
	// Last function to enable preemption on this processor
	const char * last_enable;
#endif
};

void  ?{}(processor & this, const char name[], struct cluster & cltr);
void ^?{}(processor & this);

static inline void  ?{}(processor & this)                        { this{ "Anonymous Processor", *mainCluster}; }
static inline void  ?{}(processor & this, struct cluster & cltr) { this{ "Anonymous Processor", cltr}; }
static inline void  ?{}(processor & this, const char name[])     { this{name, *mainCluster }; }

// Intrusive-list accessor: exposes the prev/next links of the debug list node.
static inline [processor *&, processor *& ] __get( processor & this ) __attribute__((const)) { return this.node.[next, prev]; }

//-----------------------------------------------------------------------------
// I/O
struct __io_data;

// Flag macros parenthesized so they compose safely with &, | and comparisons.
#define CFA_CLUSTER_IO_POLLER_USER_THREAD    (1 << 0) // 0x1
#define CFA_CLUSTER_IO_POLLER_THREAD_SUBMITS (1 << 1) // 0x2
// #define CFA_CLUSTER_IO_POLLER_KERNEL_SIDE (1 << 2) // 0x4
#define CFA_CLUSTER_IO_BUFFLEN_OFFSET        16


//-----------------------------------------------------------------------------
// Cluster Tools

// Cells used by the reader writer lock
// while not generic it only relies on an opaque pointer
struct __processor_id;

// Reader-Writer lock protecting the ready-queue
// while this lock is mostly generic some aspects
// have been hard-coded for the ready-queue for
// simplicity and performance
struct __clusterRWLock_t {
	// total cachelines allocated
	unsigned int max;

	// cachelines currently in use
	volatile unsigned int alloc;

	// cachelines ready to iterate over
	// (!= to alloc when thread is in second half of doregister)
	volatile unsigned int ready;

	// writer lock
	volatile bool lock;

	// data pointer
	__processor_id * data;
};

void  ?{}(__clusterRWLock_t & this);
void ^?{}(__clusterRWLock_t & this);

// Intrusive lanes which are used by the relaxed ready queue
struct __attribute__((aligned(128))) __intrusive_lane_t {
	// spin lock protecting the queue
	volatile bool lock;

	// anchor for the head and the tail of the queue
	struct __sentinel_t {
		// Link lists fields
		// intrusive link field for threads
		// must be exactly as in $thread
		__thread_desc_link link;
	} before, after;

#if defined(__CFA_WITH_VERIFY__)
	// id of last processor to acquire the lock
	// needed only to check for mutual exclusion violations
	unsigned int last_id;

	// number of items on this list
	// needed only to check for deadlocks
	unsigned int count;
#endif

	// Optional statistic counters
	#if !defined(__CFA_NO_SCHED_STATS__)
		struct __attribute__((aligned(64))) {
			// difference between number of push and pops
			ssize_t diff;

			// total number of pushes and pops
			size_t push;
			size_t pop ;
		} stat;
	#endif
};

void  ?{}(__intrusive_lane_t & this);
void ^?{}(__intrusive_lane_t & this);

typedef unsigned long long __cfa_readyQ_mask_t;

// enum {
// 	__cfa_ready_queue_mask_size = (64 - sizeof(size_t)) / sizeof(size_t),
// 	__cfa_max_ready_queues = __cfa_ready_queue_mask_size * 8 * sizeof(size_t)
// };

#define __cfa_lane_mask_size ((64 - sizeof(size_t)) / sizeof(__cfa_readyQ_mask_t))
#define __cfa_max_lanes (__cfa_lane_mask_size * 8 * sizeof(__cfa_readyQ_mask_t))

//TODO adjust cache size to ARCHITECTURE
// Structure holding the relaxed ready queue
struct __attribute__((aligned(128))) __ready_queue_t {
	// Data tracking how many/which lanes are used
	// Aligned to 128 for cache locality
	struct {
		// number of non-empty lanes
		volatile size_t count;

		// bit mask, set bits identify which lanes are non-empty
		volatile __cfa_readyQ_mask_t mask[ __cfa_lane_mask_size ];
	} used;

	// Data tracking the actual lanes
	// On a separate cacheline from the used struct since
	// used can change on each push/pop but this data
	// only changes on shrink/grow
	struct __attribute__((aligned(64))) {
		// Array of lanes
		__intrusive_lane_t * volatile data;

		// Number of lanes (empty or not)
		volatile size_t count;
	} lanes;

	// Statistics
	#if !defined(__CFA_NO_STATISTICS__)
		__attribute__((aligned(64))) struct {
			struct {
				// Push statistic
				struct {
					// number of attempts at pushing something
					volatile size_t attempt;

					// number of successes at pushing
					volatile size_t success;
				} push;

				// Pop statistic
				struct {
					// number of reads of the mask
					// picking an empty __cfa_readyQ_mask_t counts here
					// but not as an attempt
					volatile size_t maskrds;

					// number of attempts at popping something
					volatile size_t attempt;

					// number of successes at popping
					volatile size_t success;
				} pop;
			} pick;

			// stats on the "used" struct of the queue
			// tracks average number of queues that are not empty
			// when pushing / popping
			struct {
				volatile size_t value;
				volatile size_t count;
			} used;
		} global_stats;
	#endif
};

void  ?{}(__ready_queue_t & this);
void ^?{}(__ready_queue_t & this);

//-----------------------------------------------------------------------------
// Cluster
struct cluster {
	// Ready queue locks
	__clusterRWLock_t ready_lock;

	// Ready queue for threads
	__ready_queue_t ready_queue;

	// Name of the cluster
	const char * name;

	// Preemption rate on this cluster
	Duration preemption_rate;

	// List of processors
	__spinlock_t idle_lock;
	__dllist_t(struct processor) procs;
	__dllist_t(struct processor) idles;
	unsigned int nprocessors;

	// List of threads
	__spinlock_t thread_list_lock;
	__dllist_t(struct $thread) threads;
	unsigned int nthreads;

	// Link lists fields
	struct __dbg_node_cltr {
		cluster * next;
		cluster * prev;
	} node;

	struct __io_data * io;

	#if !defined(__CFA_NO_STATISTICS__)
		bool print_stats;
	#endif
};

extern Duration default_preemption();

void ?{} (cluster & this, const char name[], Duration preemption_rate, unsigned flags);
void ^?{}(cluster & this);

static inline void ?{} (cluster & this)                                           { this{"Anonymous Cluster", default_preemption(), 0}; }
static inline void ?{} (cluster & this, Duration preemption_rate)                 { this{"Anonymous Cluster", preemption_rate, 0}; }
static inline void ?{} (cluster & this, const char name[])                        { this{name, default_preemption(), 0}; }
static inline void ?{} (cluster & this, unsigned flags)                           { this{"Anonymous Cluster", default_preemption(), flags}; }
static inline void ?{} (cluster & this, Duration preemption_rate, unsigned flags) { this{"Anonymous Cluster", preemption_rate, flags}; }
static inline void ?{} (cluster & this, const char name[], unsigned flags)        { this{name, default_preemption(), flags}; }

// Intrusive-list accessor: exposes the prev/next links of the debug list node.
static inline [cluster *&, cluster *& ] __get( cluster & this ) __attribute__((const)) { return this.node.[next, prev]; }

static inline struct processor * active_processor() { return TL_GET( this_processor ); } // UNSAFE
static inline struct cluster   * active_cluster  () { return TL_GET( this_processor )->cltr; }

#if !defined(__CFA_NO_STATISTICS__)
	static inline void print_stats_at_exit( cluster & this ) {
		this.print_stats = true;
	}
#endif

// Local Variables: //
// mode: c //
// tab-width: 4 //
// End: //