//
// Cforall Version 1.0.0 Copyright (C) 2019 University of Waterloo
//
// The contents of this file are covered under the licence agreement in the
// file "LICENCE" distributed with Cforall.
//
// ready_queue.cfa --
//
// Author           : Thierry Delisle
// Created On       : Mon Nov dd 16:29:18 2019
// Last Modified By :
// Last Modified On :
// Update Count     :
//

#define __cforall_thread__
#define _GNU_SOURCE

// #define __CFA_DEBUG_PRINT_READY_QUEUE__


#define USE_AWARE_STEALING

#include "bits/defs.hfa"
#include "device/cpu.hfa"
#include "kernel_private.hfa"

#include "limits.hfa"

// #include <errno.h>
// #include <unistd.h>

#include "ready_subqueue.hfa"

static const size_t cache_line_size = 64;

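// Expand statistics-gathering code only when statistics are enabled.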
#if !defined(__CFA_NO_STATISTICS__)
	#define __STATS(...) __VA_ARGS__
#else
	#define __STATS(...)
#endif

static inline struct thread$ * try_pop(struct cluster * cltr, unsigned w __STATS(, __stats_readyQ_pop_t & stats));
static inline struct thread$ * try_pop(struct cluster * cltr, unsigned i, unsigned j __STATS(, __stats_readyQ_pop_t & stats));
static inline struct thread$ * search(struct cluster * cltr);

//=======================================================================
// Cforall Ready Queue used for scheduling
//=======================================================================
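// The ready queue is an array of lanes (subqueues); each processor appears to own
// the __readyq_shard_factor consecutive lanes starting at proc->rdq.id.  lanes.tscs
// tracks per-lane timestamps for the helping heuristic below, and lanes.caches
// records the last-known LLC of each shard.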
void ?{}(__ready_queue_t & this) with (this) {
	lanes.data = 0p;
	lanes.tscs = 0p;
	lanes.caches = 0p;
	lanes.help = 0p;
	lanes.count = 0;
}

void ^?{}(__ready_queue_t & this) with (this) {
	free(lanes.data);
	free(lanes.tscs);
	free(lanes.caches);
	free(lanes.help);
}

//-----------------------------------------------------------------------
__attribute__((hot)) void push(struct cluster * cltr, struct thread$ * thrd, unpark_hint hint) with (cltr->ready_queue) {
	processor * const proc = kernelTLS().this_processor;
	const bool external = (!proc) || (cltr != proc->cltr);
	const bool remote = hint == UNPARK_REMOTE;

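	// Pick a lane: external/remote pushes prefer the thread's previous shard (or a
	// random lane), while local pushes round-robin within this processor's own shard.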
	unsigned i;
	if( external || remote ) {
		// Figure out where the thread was last and make sure that location is still valid
		/* paranoid */ verify(thrd->preferred >= 0);
		if(thrd->preferred * __readyq_shard_factor < lanes.count) {
			/* paranoid */ verify(thrd->preferred * __readyq_shard_factor < lanes.count);
			unsigned start = thrd->preferred * __readyq_shard_factor;
			do {
				unsigned r = __tls_rand();
				i = start + (r % __readyq_shard_factor);
				/* paranoid */ verify( i < lanes.count );
				// If we can't lock it, retry
			} while( !__atomic_try_acquire( &lanes.data[i].lock ) );
		} else {
			do {
				i = __tls_rand() % lanes.count;
			} while( !__atomic_try_acquire( &lanes.data[i].lock ) );
		}
	} else {
		do {
			unsigned r = proc->rdq.its++;
			i = proc->rdq.id + (r % __readyq_shard_factor);
			/* paranoid */ verify( i < lanes.count );
			// If we can't lock it, retry
		} while( !__atomic_try_acquire( &lanes.data[i].lock ) );
	}

	// Actually push it
	push(lanes.data[i], thrd);

	// Unlock and return
	__atomic_unlock( &lanes.data[i].lock );

	#if !defined(__CFA_NO_STATISTICS__)
		if(unlikely(external || remote)) __atomic_fetch_add(&cltr->stats->ready.push.extrn.success, 1, __ATOMIC_RELAXED);
		else __tls_stats()->ready.push.local.success++;
	#endif
}

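// Compute the helping cutoff: scan this processor's own shard (assumed here to be
// the __readyq_shard_factor consecutive lanes starting at proc->rdq.id) and take
// 1.5x the largest moving-average age found.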
static inline unsigned long long calc_cutoff(const unsigned long long ctsc, const processor * proc, __ready_queue_t & rdq) {
	unsigned start = proc->rdq.id;
	unsigned long long max = 0;
	for(i; __readyq_shard_factor) {
		unsigned long long ptsc = ts(rdq.lanes.data[start + i]);
		if(ptsc != -1ull) {
			/* paranoid */ verify( start + i < rdq.lanes.count );
			unsigned long long tsc = moving_average(ctsc, ptsc, rdq.lanes.tscs[start + i].ma);
			if(tsc > max) max = tsc;
		}
	}
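	// (max + 2 * max) / 2 == 1.5 * max: only lanes at least 50% older than the local worst qualify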
	return (max + 2 * max) / 2;
}

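//-----------------------------------------------------------------------
// Fast-path pop: occasionally pick a random lane to help if its oldest element is
// past the cutoff, otherwise pop from this processor's own shard of lanes.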
__attribute__((hot)) struct thread$ * pop_fast(struct cluster * cltr) with (cltr->ready_queue) {
	/* paranoid */ verify( lanes.count > 0 );
	/* paranoid */ verify( kernelTLS().this_processor );
	/* paranoid */ verify( kernelTLS().this_processor->rdq.id < lanes.count );

	processor * const proc = kernelTLS().this_processor;
	unsigned this = proc->rdq.id;
	/* paranoid */ verify( this < lanes.count );
	__cfadbg_print_safe(ready_queue, "Kernel : pop from %u\n", this);

	// Figure out the current cpu and make sure it is valid
	const int cpu = __kernel_getcpu();
	/* paranoid */ verify(cpu >= 0);
	/* paranoid */ verify(cpu < cpu_info.hthrd_count);
	unsigned this_cache = cpu_info.llc_map[cpu].cache;

	// Super important: don't write the same value over and over again
	// We want to maximise our chances that this particular value stays in cache
	if(lanes.caches[this / __readyq_shard_factor].id != this_cache)
		__atomic_store_n(&lanes.caches[this / __readyq_shard_factor].id, this_cache, __ATOMIC_RELAXED);

	const unsigned long long ctsc = rdtscl();

	if(proc->rdq.target == MAX) {
		uint64_t chaos = __tls_rand();
		unsigned ext = chaos & 0xff;
		unsigned other = (chaos >> 8) % (lanes.count);

		if(ext < 3 || __atomic_load_n(&lanes.caches[other / __readyq_shard_factor].id, __ATOMIC_RELAXED) == this_cache) {
			proc->rdq.target = other;
		}
	}
	else {
		const unsigned target = proc->rdq.target;
		__cfadbg_print_safe(ready_queue, "Kernel : %u considering helping %u, tsc %llu\n", this, target, lanes.tscs[target].tv);
		/* paranoid */ verify( lanes.tscs[target].tv != MAX );
		if(target < lanes.count) {
			const unsigned long long cutoff = calc_cutoff(ctsc, proc, cltr->ready_queue);
			const unsigned long long age = moving_average(ctsc, lanes.tscs[target].tv, lanes.tscs[target].ma);
			__cfadbg_print_safe(ready_queue, "Kernel : Help attempt on %u from %u, age %'llu vs cutoff %'llu, %s\n", target, this, age, cutoff, age > cutoff ? "yes" : "no");
			if(age > cutoff) {
				thread$ * t = try_pop(cltr, target __STATS(, __tls_stats()->ready.pop.help));
				if(t) return t;
			}
		}
		proc->rdq.target = MAX;
	}

	for(__readyq_shard_factor) {
		unsigned i = this + (proc->rdq.itr++ % __readyq_shard_factor);
		if(thread$ * t = try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.local))) return t;
	}

	// All lanes were empty; return 0p
	return 0p;
}
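// Slow-path pop: try a single random lane (recorded in statistics as a steal).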
__attribute__((hot)) struct thread$ * pop_slow(struct cluster * cltr) with (cltr->ready_queue) {
	unsigned i = __tls_rand() % lanes.count;
	return try_pop(cltr, i __STATS(, __tls_stats()->ready.pop.steal));
}
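// Exhaustive fallback: search every lane (see search() below).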
__attribute__((hot)) struct thread$ * pop_search(struct cluster * cltr) {
	return search(cltr);
}

//=======================================================================
// Various Ready Queue utilities
//=======================================================================
// these functions work the same, or almost the same,
// whether they are using work-stealing or relaxed-fifo scheduling

//-----------------------------------------------------------------------
// try to pop from the lane given by index w
static inline struct thread$ * try_pop(struct cluster * cltr, unsigned w __STATS(, __stats_readyQ_pop_t & stats)) with (cltr->ready_queue) {
	/* paranoid */ verify( w < lanes.count );
	__STATS( stats.attempt++; )

	// Get relevant elements locally
	__intrusive_lane_t & lane = lanes.data[w];

	// If the list looks empty, bail out and let the caller retry
	if( is_empty(lane) ) {
		return 0p;
	}

	// If we can't get the lock, bail out and let the caller retry
	if( !__atomic_try_acquire(&lane.lock) ) {
		return 0p;
	}

	// If the list is actually empty, unlock and let the caller retry
	if( is_empty(lane) ) {
		__atomic_unlock(&lane.lock);
		return 0p;
	}

	// Actually pop the list
	struct thread$ * thrd;
	unsigned long long tsc_before = ts(lane);
	unsigned long long tsv;
	[thrd, tsv] = pop(lane);

	/* paranoid */ verify(thrd);
	/* paranoid */ verify(tsv);
	/* paranoid */ verify(lane.lock);

	// Unlock and return
	__atomic_unlock(&lane.lock);

	// Update statistics
	__STATS( stats.success++; )

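	// Publish the lane's new head timestamp and moving-average age so other
	// processors can estimate how stale this lane is.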
	if (tsv != MAX) {
		unsigned long long now = rdtscl();
		unsigned long long pma = __atomic_load_n(&lanes.tscs[w].ma, __ATOMIC_RELAXED);
		__atomic_store_n(&lanes.tscs[w].tv, tsv, __ATOMIC_RELAXED);
		__atomic_store_n(&lanes.tscs[w].ma, moving_average(now, tsc_before, pma), __ATOMIC_RELAXED);
	}

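	// Remember which shard this thread came from so external pushes can target it.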
	thrd->preferred = w / __readyq_shard_factor;

	// return the popped thread
	return thrd;
}

//-----------------------------------------------------------------------
// try to pop from any lane, making sure not to miss any thread pushed
// before the start of the function
static inline struct thread$ * search(struct cluster * cltr) with (cltr->ready_queue) {
	/* paranoid */ verify( lanes.count > 0 );
	unsigned count = __atomic_load_n( &lanes.count, __ATOMIC_RELAXED );
	unsigned offset = __tls_rand();
	for(i; count) {
		unsigned idx = (offset + i) % count;
		struct thread$ * thrd = try_pop(cltr, idx __STATS(, __tls_stats()->ready.pop.search));
		if(thrd) {
			return thrd;
		}
	}

	// All lanes were empty; return 0p
	return 0p;
}

//-----------------------------------------------------------------------
// get the preferred ready-queue shard for a new thread
unsigned ready_queue_new_preferred() {
	unsigned pref = MAX;
	if(struct thread$ * thrd = publicTLS_get( this_thread )) {
		pref = thrd->preferred;
	}

	return pref;
}

//-----------------------------------------------------------------------
// Given 2 indexes, pick the list with the oldest push and try to pop from it
static inline struct thread$ * try_pop(struct cluster * cltr, unsigned i, unsigned j __STATS(, __stats_readyQ_pop_t & stats)) with (cltr->ready_queue) {
	// Pick the better list
	int w = i;
	if( __builtin_expect(!is_empty(lanes.data[j]), true) ) {
		w = (ts(lanes.data[i]) < ts(lanes.data[j])) ? i : j;
	}

	return try_pop(cltr, w __STATS(, stats));
}