Context Navigation

source: benchmark/readyQ/locality.cpp @ 788614c

Branches: ADT, arm-eh, ast-experimental, enum, forall-pointer-decay, jacob/cs343-translation, new-ast-unique-expr, pthread-emulation, qualifiedEnum

Last change on this file since 788614c was 56ac392, checked in by Thierry Delisle <tdelisle@…>, 4 years ago
Moved single_sem to rq_bench.hpp which was duplicated across multiple fibre benchmarks.
Property mode set to `100644`
File size: 7.4 KB

Rev	Line
[c4241b6]	1	#include "rq_bench.hpp"
	2	#pragma GCC diagnostic push
	3	#pragma GCC diagnostic ignored "-Wunused-parameter"
	4	#include <libfibre/fibre.h>
	5	#pragma GCC diagnostic pop
	6
	// Counters accumulated by one benchmark thread.
	struct Result {
		uint64_t count{0}; // iterations of the main loop that completed
		uint64_t dmigs{0}; // times the work data was seen on a different processor
		uint64_t gmigs{0}; // times the thread itself was seen on a different processor
	};
	12
	13	// ==================================================
	14	struct __attribute__((aligned(128))) MyData {
	15	uint64_t _p1[16]; // padding
	16	uint64_t * data;
	17	size_t len;
	18	BaseProcessor * ttid;
	19	size_t id;
	20	uint64_t _p2[16]; // padding
	21
	22	MyData(size_t id, size_t size)
	23	: data( (uintptr_t )aligned_alloc(128, size sizeof(uint64_t)) )
	24	, len( size )
	25	, ttid( &Context::CurrProcessor() )
	26	, id( id )
	27	{
	28	for(size_t i = 0; i < this->len; i++) {
	29	this->data[i] = 0;
	30	}
	31	}
	32
	33	uint64_t moved(BaseProcessor * ttid) {
	34	if(this->ttid == ttid) {
	35	return 0;
	36	}
	37	this->ttid = ttid;
	38	return 1;
	39	}
	40
	41	__attribute__((noinline)) void access(size_t idx) {
	42	size_t l = this->len;
	43	this->data[idx % l] += 1;
	44	}
	45	};
	46
	47	// ==================================================
	48	struct __attribute__((aligned(128))) MyCtx {
	49	struct MyData * volatile data;
	50
	51	struct {
	52	struct MySpot ** ptr;
	53	size_t len;
	54	} spots;
	55
	56	bench_sem sem;
	57
	58	Result result;
	59
	60	bool share;
	61	size_t cnt;
	62	BaseProcessor * ttid;
	63	size_t id;
	64
	65	MyCtx(MyData * d, MySpot ** spots, size_t len, size_t cnt, bool share, size_t id)
	66	: data( d )
	67	, spots{ .ptr = spots, .len = len }
	68	, share( share )
	69	, cnt( cnt )
	70	, ttid( &Context::CurrProcessor() )
	71	, id( id )
	72	{}
	73
	74	uint64_t moved(BaseProcessor * ttid) {
	75	if(this->ttid == ttid) {
	76	return 0;
	77	}
	78	this->ttid = ttid;
	79	return 1;
	80	}
	81	};
	82
	83	// ==================================================
	84	// Atomic object where a single thread can wait
	85	// May exchanges data
	86	struct __attribute__((aligned(128))) MySpot {
	87	MyCtx * volatile ptr;
	88	size_t id;
	89	uint64_t _p1[16]; // padding
	90
	91	MySpot(size_t id) : ptr( nullptr ), id( id ) {}
	92
	93
	94	static inline MyCtx * one() {
	95	return reinterpret_cast<MyCtx *>(1);
	96	}
	97
	98	// Main handshake of the code
	99	// Single seat, first thread arriving waits
	100	// Next threads unblocks current one and blocks in its place
	101	// if share == true, exchange data in the process
	102	bool put( MyCtx & ctx, MyData * data, bool share) {
	103	// Attempt to CAS our context into the seat
	104	for(;;) {
	105	MyCtx * expected = this->ptr;
	106	if (expected == one()) { // Seat is closed, return
	107	return true;
	108	}
	109
	110	if (__atomic_compare_exchange_n(&this->ptr, &expected, &ctx, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
	111	if(expected) {
	112	if(share) {
	113	expected->data = data;
	114	}
	115	expected->sem.post();
	116	}
	117	break; // We got the seat
	118	}
	119	}
	120
	121	// Block once on the seat
	122	ctx.sem.wait();
	123
	124	// Someone woke us up, get the new data
	125	return false;
	126	}
	127
	128	// Shutdown the spot
	129	// Wake current thread and mark seat as closed
	130	void release() {
	131	struct MyCtx * val = __atomic_exchange_n(&this->ptr, one(), __ATOMIC_SEQ_CST);
	132	if (!val) {
	133	return;
	134	}
	135
	136	// Someone was there, release them
	137	val->sem.post();
	138	}
	139	};
	140
	141	// ==================================================
	// Xorshift pseudo-random number generator working on caller-owned state,
	// so no global state is involved.
	// Advances 'state' in place and returns the new value.
	// A state of zero stays zero.
	uint64_t __xorshift64( uint64_t & state ) {
		state ^= state << 13;
		state ^= state >> 7;
		state ^= state << 17;
		return state;
	}
	150
	151	// ==================================================
	152	// Do some work by accessing 'cnt' cells in the array
	153	__attribute__((noinline)) void work(MyData & data, size_t cnt, uint64_t & state) {
	154	for (size_t i = 0; i < cnt; i++) {
	155	data.access(__xorshift64(state));
	156	}
	157	}
	158
	159	void thread_main( MyCtx & ctx ) {
[f03209d3]	160	uint64_t state = ctx.id;
[c4241b6]	161
	162	// Wait for start
	163	ctx.sem.wait();
	164
	165	// Main loop
	166	for(;;) {
	167	// Touch our current data, write to invalidate remote cache lines
	168	work( *ctx.data, ctx.cnt, state );
	169
	170	// Wait on a random spot
	171	uint64_t idx = __xorshift64(state) % ctx.spots.len;
	172	bool closed = ctx.spots.ptr[idx]->put(ctx, ctx.data, ctx.share);
	173
	174	// Check if the experiment is over
	175	if (closed) break;
	176	if ( clock_mode && stop) break;
	177	if (!clock_mode && ctx.result.count >= stop_count) break;
	178
	179	// Check everything is consistent
	180	assert( ctx.data );
	181
	182	// write down progress and check migrations
	183	BaseProcessor * ttid = &Context::CurrProcessor();
	184	ctx.result.count += 1;
	185	ctx.result.gmigs += ctx.moved(ttid);
	186	ctx.result.dmigs += ctx.data->moved(ttid);
	187	}
	188
	189	__atomic_fetch_add(&threads_left, -1, __ATOMIC_SEQ_CST);
	190	}
	191
	192	// ==================================================
	193	int main(int argc, char * argv[]) {
	194	unsigned wsize = 2;
	195	unsigned wcnt = 2;
[f03209d3]	196	unsigned nspots = 0;
[c4241b6]	197	bool share = false;
	198	option_t opt[] = {
	199	BENCH_OPT,
[f03209d3]	200	{ 'n', "nspots", "Number of spots where threads sleep (nthreads - nspots are active at the same time)", nspots},
[c4241b6]	201	{ 'w', "worksize", "Size of the array for each threads, in words (64bit)", wsize},
	202	{ 'c', "workcnt" , "Number of words to touch when working (random pick, cells can be picked more than once)", wcnt },
	203	{ 's', "share" , "Pass the work data to the next thread when blocking", share, parse_truefalse }
	204	};
	205	BENCH_OPT_PARSE("libfibre cycle benchmark");
	206
	207	std::cout.imbue(std::locale(""));
	208	setlocale(LC_ALL, "");
	209
	210	unsigned long long global_count = 0;
	211	unsigned long long global_gmigs = 0;
	212	unsigned long long global_dmigs = 0;
	213
[f03209d3]	214	if( nspots == 0 ) { nspots = nthreads - nprocs; }
	215
[c4241b6]	216	uint64_t start, end;
	217	{
	218	FibreInit(1, nprocs);
	219	MyData * data_arrays[nthreads];
	220	for(size_t i = 0; i < nthreads; i++) {
	221	data_arrays[i] = new MyData( i, wsize );
	222	}
	223
[f03209d3]	224	MySpot * spots[nspots];
	225	for(unsigned i = 0; i < nspots; i++) {
[c4241b6]	226	spots[i] = new MySpot{ i };
	227	}
	228
[f03209d3]	229	threads_left = nthreads - nspots;
[c4241b6]	230	Fibre * threads[nthreads];
	231	MyCtx * thddata[nthreads];
	232	{
	233	for(size_t i = 0; i < nthreads; i++) {
	234	thddata[i] = new MyCtx(
	235	data_arrays[i],
	236	spots,
[f03209d3]	237	nspots,
[c4241b6]	238	wcnt,
	239	share,
	240	i
	241	);
	242	threads[i] = new Fibre( reinterpret_cast<void ()(void )>(thread_main), thddata[i] );
	243	}
	244
	245	bool is_tty = isatty(STDOUT_FILENO);
[e54d0c3]	246	start = timeHiRes();
[c4241b6]	247
	248	for(size_t i = 0; i < nthreads; i++) {
	249	thddata[i]->sem.post();
	250	}
	251	wait<Fibre>(start, is_tty);
	252
	253	stop = true;
[e54d0c3]	254	end = timeHiRes();
[c4241b6]	255	printf("\nDone\n");
	256
	257	for(size_t i = 0; i < nthreads; i++) {
	258	thddata[i]->sem.post();
	259	fibre_join( threads[i], nullptr );
	260	global_count += thddata[i]->result.count;
	261	global_gmigs += thddata[i]->result.gmigs;
	262	global_dmigs += thddata[i]->result.dmigs;
	263	}
	264	}
	265
	266	for(size_t i = 0; i < nthreads; i++) {
	267	delete( data_arrays[i] );
	268	}
	269
[f03209d3]	270	for(size_t i = 0; i < nspots; i++) {
[c4241b6]	271	delete( spots[i] );
	272	}
	273	}
	274
	275	printf("Duration (ms) : %'ld\n", to_miliseconds(end - start));
	276	printf("Number of processors : %'d\n", nprocs);
	277	printf("Number of threads : %'d\n", nthreads);
[f03209d3]	278	printf("Number of spots : %'d\n", nspots);
[c4241b6]	279	printf("Work size (64bit words): %'15u\n", wsize);
	280	printf("Total Operations(ops) : %'15llu\n", global_count);
	281	printf("Total G Migrations : %'15llu\n", global_gmigs);
	282	printf("Total D Migrations : %'15llu\n", global_dmigs);
[f03209d3]	283	printf("Ops per second : %'18.2lf\n", ((double)global_count) / to_fseconds(end - start));
	284	printf("ns per ops : %'18.2lf\n", ((double)(end - start)) / global_count);
	285	printf("Ops per threads : %'15llu\n", global_count / nthreads);
	286	printf("Ops per procs : %'15llu\n", global_count / nprocs);
	287	printf("Ops/sec/procs : %'18.2lf\n", (((double)global_count) / nprocs) / to_fseconds(end - start));
	288	printf("ns per ops/procs : %'18.2lf\n", ((double)(end - start)) / (global_count / nprocs));
[c4241b6]	289	fflush(stdout);
[e54d0c3]	290	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: