source: doc/theses/mike_brooks_MMath/plots/ListCommon.py @ e35ecd0

Last change on this file since e35ecd0 was e35ecd0, checked in by Michael Brooks <mlbrooks@…>, 2 weeks ago

save draft of list perf fx-interaction plot

import pandas as pd
import numpy as np
import math
import os
from subprocess import Popen, PIPE
from scipy.stats import gmean

def getDataset( infile ):
    # grep to remove lines that end in comma; these were error runs
    with Popen("grep '[^,]$' " + infile, shell=True, stdout=PIPE) as process:
        timings = pd.read_csv(
            process.stdout,
            names=['RunMoment', 'RunIdx', 'Args', 'Program', 'Width',
                   'expt_ops_completed', 'expt_elapsed_sec', 'mean_op_dur_ns'],
            dtype={'RunMoment': str,
                   'RunIdx': np.int64,
                   'Args': str,
                   'Program': str,
                   'Width': np.int64,
                   'expt_ops_completed': np.int64,
                   'expt_elapsed_sec': np.float64,
                   'mean_op_dur_ns': np.float64},
            parse_dates=['RunMoment']
        )
    # print(timings.head())

    ## parse executable name and args

    timings[['ExperimentDurSec',
             'CheckDonePeriod',
             'Length',
             'ExperimentDurOpCount',
             'Seed',
             'InterleaveFrac']] = timings['Args'].str.strip().str.split(expand=True)
    timings["Length"] = pd.to_numeric(timings["Length"])
    timings["InterleaveFrac"] = pd.to_numeric(timings["InterleaveFrac"]).round(3)

    timings["NumNodes"] = timings["Length"] * timings["Width"]

    timings[['__ProgramPrefix',
             'fx',
             'op']] = timings['Program'].str.split('--', expand=True)

    timings[['movement',
             'polarity',
             'accessor']] = timings['op'].str.split('-', expand=True)

    ## SizeZone as NumNodes t-shirt size
    timings['SizeZone'] = np.select(
        condlist = [
            (4 <= timings['NumNodes']) & (timings['NumNodes'] <= 16),
            (48 <= timings['NumNodes']) & (timings['NumNodes'] <= 256)
        ],
        choicelist = [
            'SM',
            'ML'
        ],
        default = 'none'
    )

    return timings

# `c` = column name
def c( baseName, marginalizeOn ):
    margSlug = str.join( "_", marginalizeOn )
    return baseName + "_" + margSlug
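# e.g. (illustrative): c('Baseline', ['fx', 'machine']) == 'Baseline_fx_machine'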

explanations = ['movement', 'polarity', 'accessor',
                'NumNodes',
                'SizeZone', # note the functional dependency: NumNodes -> SizeZone
                'fx',
                'machine',
                'InterleaveFrac', # unused and always zero
               ]

# helper for avoiding pollution from e.g. alternate cfa list versions
# when a preference-limiting factor is marginalized, make the baseline value from the preferred subset
# but still stamp the result everywhere; e.g. even cfa-strip has a canon-baseline-relative perf
# when conditioning on such a factor, peer groups are already small enough to stop such pollution
# use a nontrivial marginalizeOn when calculating baseline values, to achieve the above outside-canonical behaviour non-degenerately
# use the default full marginalizeOn when removing points from a graph, which leaves only canonical points
def getJustCanon( timings,
                  marginalizeOn = explanations, *,
                  # no c++: baseline is for comparing intrusives
                  # no lq-list: sparse
                  # no cfa-fredDisbled: baseline is for comparing prod-readies
                  fxInc = ['cfa-cfa', 'lq-tailq', 'upp-upp'],
                  szInc = ['SM', 'ML'],
                  sExcl = [1]
                ): # all explanations marginalized => maximally aggressive filter
    if 'fx' in marginalizeOn:
        fxIsCanon = timings.fx.isin(fxInc)
        timings = timings[ fxIsCanon ]
    if 'SizeZone' in marginalizeOn:
        szIsCanon = timings.SizeZone.isin(szInc)
        timings = timings[ szIsCanon ]
    if 'NumNodes' in marginalizeOn:
        sIsCanon = ~ timings.NumNodes.isin(sExcl)
        timings = timings[ sIsCanon ]
    return timings
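# Illustrative of the two usage patterns above (both appear later in this file):
#   getJustCanon(timings, marginalizeOn)   # in annotateBaseline: derive baselines from the preferred subset
#   getJustCanon(timings)                  # in printHistos (with fxInc widened): keep only canonical points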


def annotateBaseline( timings, marginalizeOn ):
    c_tgtPeers = c( 'Peers', marginalizeOn )
    c_tgtBl = c("Baseline", marginalizeOn)
    c_tgtRel = c("OpDurRel", marginalizeOn)
    if c_tgtBl in timings.columns or c_tgtRel in timings.columns:
        assert( c_tgtBl in timings.columns and c_tgtRel in timings.columns )
        return
    # size handling:
    # two ordinary baselines (sz-nn, nn) and one synthetic baseline (sz)
    # the SizeZone-only baseline has no interpretation wrt a real peer group
    # it isolates the effect of belonging to one SZ or the other
    # while conditioning away the specific-size effects within the SZ
    # notably in zone SM, opDur-v-size usually pitches upward
    # comparing to the sz-only baseline gets rid of "they all pitch up," while keeping "SM is faster than ML"
    if 'SizeZone' in marginalizeOn and 'NumNodes' not in marginalizeOn:
        # special case: sz-only synthetic baseline
        margNeither = list( set(marginalizeOn) - {'SizeZone'} )
        margBoth = list( set(marginalizeOn) | {'NumNodes'} )
        margJustNn = list( set(margNeither) | {'NumNodes'} )
        annotateBaseline( timings, margNeither )
        annotateBaseline( timings, margBoth )
        annotateBaseline( timings, margJustNn )
        c_neitherRel = c("OpDurRel", margNeither)
        c_bothBl = c("Baseline", margBoth)
        c_justNnBl = c("Baseline", margJustNn)
        timings[ c_tgtBl ] = np.nan
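        # Sketch of how the synthetic relative value composes (a reading of the line below):
        # Baseline_{+NumNodes} / Baseline_{+SizeZone,NumNodes} is the zone-membership
        # effect (the zone-level baseline over the cross-zone baseline); multiplying by
        # the fully size-conditioned OpDurRel re-applies each sample's own variation.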
        timings[ c_tgtRel ] = timings[ c_justNnBl ] / timings[ c_bothBl ] * timings[ c_neitherRel ]
    else: # general case
        # prevent non-canonical samples from polluting baseline values
        # note, depending on the presentation, the polluting points may already be removed from timings entirely
        canonSrc = getJustCanon(timings, marginalizeOn)
        # print(f"for marg on {marginalizeOn}, |canonSrc| = {len(canonSrc)}, |timings| = {len(timings)}", file=sys.stderr)
        conditionOn = list( set(explanations) - set(marginalizeOn) )
        # print( "marginalizing on", marginalizeOn, "conditioning on", conditionOn, file=sys.stderr )

        if conditionOn:
            stats = canonSrc.groupby(conditionOn)['mean_op_dur_ns'].agg(**{
                c_tgtPeers: 'count',
                c_tgtBl: gmean
            })
            group_lookup = timings.set_index(conditionOn).index
            timings[c_tgtPeers] = stats[c_tgtPeers].reindex(group_lookup).values
            timings[c_tgtBl] = stats[c_tgtBl].reindex(group_lookup).values
        else:
            stats = canonSrc.groupby((lambda _: 0))['mean_op_dur_ns'].agg(**{
                c_tgtPeers: 'count',
                c_tgtBl: gmean
            })
            # extract the single row
            row = stats.iloc[0]
            # broadcast to all rows
            timings[c_tgtPeers] = row[c_tgtPeers]
            timings[c_tgtBl] = row[c_tgtBl]

        # everywhere := itself / [preferred-subset derived]
        timings[c_tgtRel] = timings['mean_op_dur_ns'] / timings[c_tgtBl]


# a longer column name (Peers_%, Baseline_%, OpDurRel_%) gives a larger peer group and more (total) variation
def annotateCommonBaselines( timings ):
    def applyGeneralExplanations( bgMarginalizeOn ):
        def fg( marginalizeOn ):
            return bgMarginalizeOn + marginalizeOn
        annotateBaseline( timings, fg( [] ) ) # all-in baseline (all factors conditioned): only inter-run differences
        annotateBaseline( timings, fg( ['movement', 'polarity'] ) )
        annotateBaseline( timings, fg( ['accessor'] ) )
        annotateBaseline( timings, fg( ['machine'] ) )

        annotateBaseline( timings, fg( ['SizeZone', 'NumNodes'] ) ) # SizeZone is NOT redundant; conditioned on neither
        annotateBaseline( timings, fg( ['NumNodes'] ) ) # still conditioned on SizeZone
        annotateBaseline( timings, fg( ['SizeZone'] ) ) # synthetic: conditioned on NumNodes but not SizeZone
    applyGeneralExplanations( [] )
    applyGeneralExplanations( ['fx'] )

def getMachineDataset( dsname, machine ):
    infileLocal = f"results-{machine}-{dsname}.csv"
    infile = os.path.dirname(os.path.abspath(__file__)) + '/../benchmarks/list/' + infileLocal
    timings = getDataset( infile )
    timings['machine'] = machine
    return timings

allMachines = ['swift', 'java']


# general, as in: exclude the stripped-down experimental CFAs
general_fxs_full = ['cfa-cfa', 'cpp-stlref', 'upp-upp', 'lq-tailq', 'lq-list']
general_fxs_intrusive = ['cfa-cfa', 'upp-upp', 'lq-tailq', 'lq-list']

def getSingleResults(
        dsname = 'general',
        machines = allMachines,
        *,
        fxs = general_fxs_full,
        tgtMovement = 'all',
        tgtPolarity = 'all',
        tgtAccessor = 'all',
        tgtInterleave = 0.0 ):

    timings = pd.concat([
        getMachineDataset( dsname, m )
        for m in machines ])

    # print(timings, file=sys.stderr)

    movements = timings['movement'].unique()
    polarities = timings['polarity'].unique()
    accessors = timings['accessor'].unique()
    interleaves = timings['InterleaveFrac'].unique()

    if movements.size > 1:
        movements = np.append(movements, 'all')
    if polarities.size > 1:
        polarities = np.append(polarities, 'all')
    if accessors.size > 1:
        accessors = np.append(accessors, 'all')

    # print(f"trying to filter {dsname} {machines} {len(timings)}", file=sys.stderr)
    grp = timings.groupby('fx')
    # print(f"with fxs {grp.groups.keys()}", file=sys.stderr)
    timings = pd.concat([
        grp.get_group(fx)
        for fx in fxs ])

    if (tgtMovement != 'all'):
        grp = timings.groupby('movement')
        timings = grp.get_group(tgtMovement)
    if (tgtPolarity != 'all'):
        grp = timings.groupby('polarity')
        timings = grp.get_group(tgtPolarity)
    if (tgtAccessor != 'all'):
        grp = timings.groupby('accessor')
        timings = grp.get_group(tgtAccessor)
    if (tgtInterleave != 'all'):
        timings = timings[ timings['InterleaveFrac'] == float(tgtInterleave) ]

    return timings
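# Hypothetical call (a sketch; 'swift' comes from allMachines above):
#   t = getSingleResults('general', ['swift'], fxs=general_fxs_intrusive)
#   # -> swift-machine runs only, intrusive lists only, all ops, InterleaveFrac == 0.0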

def stripMachine(pyCore):
    parts = str.split(pyCore, '-')
    exceptLast = parts[ 0 : -1 ]
    return str.join('-', exceptLast)

def getSummaryMeta(metaFileCore):
    metafile = os.path.dirname(os.path.abspath(__file__)) + "/" + metaFileCore + '-meta.dat'
    metadata = pd.read_csv(
        metafile,
        names=['OpIx', 'Op'],
        delimiter='\t'
    )
    metadata[['movement',
              'polarity',
              'accessor']] = metadata['Op'].str.split('\\\\n', expand=True)
    metadata.replace('*', 'all', inplace=True)
    metadata.replace('S', 'stack', inplace=True)
    metadata.replace('Q', 'queue', inplace=True)
    metadata.replace('iF', 'insfirst', inplace=True)
    metadata.replace('iL', 'inslast', inplace=True)
    metadata.replace('H', 'allhead', inplace=True)
    metadata.replace('Ie', 'inselem', inplace=True)
    metadata.replace('Re', 'remelem', inplace=True)
    return metadata
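# Illustrative meta line (format inferred from the parsing above):
#   3<TAB>S\niF\n*
# The Op field splits on literal backslash-n separators (the '\\\\n' regex) into
# movement='S', polarity='iF', accessor='*', which the replace() calls map to
# 'stack', 'insfirst', and 'all'.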

swiftSweetspot = (lambda x: x > 16 and x < 150)
# swiftSweetspot = (lambda x: x > 4 and x < 32)
javaSweetspot = (lambda x: x >= 24 and x <= 256)

def printManySummary(*,
                     dsname = 'general',
                     machines = allMachines,
                     metafileCore,
                     fxs,
                     sizeQual,
                     tgtInterleave = 0.0,
                     marginalizeOn = ['fx'] ):

    metadata = getSummaryMeta(metafileCore)

    measure = c( 'OpDurRel', marginalizeOn )

    print("# op_num\tfx_num\tfx\tmean\tstdev\tmin\tmax\tcount\tpl95\tpl68\tp50\tph68\tph95")

    for op in metadata.itertuples():
        timings = getSingleResults(dsname, machines,
                                   fxs=fxs,
                                   tgtMovement = op.movement,
                                   tgtPolarity = op.polarity,
                                   tgtAccessor = op.accessor,
                                   tgtInterleave = tgtInterleave )
        annotateBaseline(timings, marginalizeOn)

        timings = timings[ timings['fx'].isin(fxs) ]
        timings = timings[ timings['NumNodes'].apply(sizeQual) ]

        fxnums = timings['fx'].apply(
            lambda fx: fxs.index(fx) + 1
        )
        timings.insert(loc=0, column='fx_num', value=fxnums)
        timings.insert(loc=0, column='op_num', value=op.OpIx)

        grouped = timings.groupby(['op_num', 'fx_num', 'fx'])

        aggregated = grouped[measure].agg(
            ["mean", "std", "min", "max", "count",
             lambda x: x.quantile(0.025),
             lambda x: x.quantile(0.16),
             lambda x: x.quantile(0.5),
             lambda x: x.quantile(0.84),
             lambda x: x.quantile(0.975)]
        )

        text = aggregated.to_csv(header=False, index=True, sep='\t')
        print(text, end='')

def printSingleDetail(
        dsname = 'general',
        machines = allMachines,
        *,
        fxs = general_fxs_full,
        tgtMovement = 'all',
        tgtPolarity = 'all',
        tgtAccessor = 'all',
        tgtInterleave = 0.0,
        measureBase = 'mean_op_dur_ns',
        marginalizeOn = explanations ):

    timings = getSingleResults(dsname, machines,
                               fxs = fxs,
                               tgtMovement = tgtMovement,
                               tgtPolarity = tgtPolarity,
                               tgtAccessor = tgtAccessor,
                               tgtInterleave = tgtInterleave)

    if measureBase == 'OpDurRel':
        annotateBaseline(timings, marginalizeOn)
        measure = c( measureBase, marginalizeOn )
    elif measureBase == 'mean_op_dur_ns':
        measure = measureBase
    else:
        raise RuntimeError(f"measureBase '{measureBase}' not handled")

    groupedFx = timings.groupby('fx')
    for fx, fgroup in groupedFx:
        # print(fgroup.head())
        groupedRun = fgroup.groupby(['NumNodes']) # , 'fx', 'op'
        aggregated = groupedRun[measure].agg(
            ["mean", "std", "min", "max", "count", "sum"]
        )
        aggregated['mean_no_outlr'] = (
            ( aggregated['sum'] - aggregated['min'] - aggregated['max'] )
            /
            ( aggregated['count'] - 2 )
        )

        # print(aggregated.head())

        print('"{header}"'.format(header=fx))
        text = aggregated.to_csv(header=False, index=True, sep='\t')
        print(text)
        print()
        print()

def aMeanNoOutlr(range):
    return ( range.sum() - range.min() - range.max() ) / ( range.count() - 2 )

def gMeanNoOutlr(range):
    return ( range.prod() / range.min() / range.max() ) ** ( 1 / ( range.count() - 2 ) )
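# Worked example (illustrative): for the values [1, 2, 8] (sum 11, product 16),
#   aMeanNoOutlr -> (11 - 1 - 8) / (3 - 2) = 2.0
#   gMeanNoOutlr -> (16 / 1 / 8) ** (1 / (3 - 2)) = 2.0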


def trimPer( df, criteria ):
    for field, values in criteria.items():
        areMatches = df[ field ].isin(values)
        df = df[ areMatches ]
    return df

# The range from 0.9759 to 1.0247 (which is 1.05x wide) has 1.0 at its centre.
# This is the bucket with key 0.
# Logs (base 1.05) of values in this bucket go from -0.5 to +0.5.
# Rounding a log value to the nearest integer gives the key.
# Exponentiating a key directly gives the centre of its bucket.
# Exponentiating a key less 0.5 gives the bottom of its bucket.
# Gnuplot expects the latter.

bucketMin = 0.25
bucketMax = 4.0
bucketGrain = 1.05
bktKeyLo = math.floor( math.log(bucketMin, bucketGrain) )
bktKeyHi = math.ceil( math.log(bucketMax, bucketGrain) )

def bktKeyOfVal( relDur ):
    distance = math.log(relDur, bucketGrain)
    key = round( distance )
    return key

def bktIxOfVal( relDur ):
    return bktKeyToIx( bktKeyOfVal( relDur ) )

def botValOfBucketK( key ):
    return bucketGrain ** ( key - 0.5 )

def topValOfBucketBotVal( botVal ):
    return bucketGrain * botVal

def bktKeyToIx( key ):
    return key - bktKeyLo

def bktIxToKey( ix ):
    return ix + bktKeyLo

def botOfBucketOfVal( relDur ):
    return botValOfBucketK( bktKeyOfVal( relDur ) )

buckets = [ botValOfBucketK(key) for key in range(bktKeyLo, bktKeyHi) ]
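# Sanity check of the bucket arithmetic above (illustrative):
#   bucketGrain ** -0.5 ~= 0.9759 and bucketGrain ** +0.5 ~= 1.0247
#   bktKeyOfVal(1.0) == 0; botValOfBucketK(0) == bucketGrain ** -0.5
#   topValOfBucketBotVal(botValOfBucketK(0)) == bucketGrain ** +0.5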

# like printSingleDetail, but bucketed into histograms
def printHistos(*,
                tgtMovement = 'all',
                tgtPolarity = 'all',
                tgtAccessor = 'all',
                tgtInterleave = 0.0,
                earlyFilter = {}, # exclude from benchmarking
                lateFilter = {}, # exclude from output
                drillOn = ['fx'],
                marginalizeOn = None ): # None means match drill-on

    if marginalizeOn is None:
        marginalizeOn = drillOn

    # watch out for filtering too early here; need everything sticking around until baselines are applied
    # i.e., maybe I should get rid of all the tgt parms at the pre-benchmark layers
    timings = getSingleResults(
        tgtMovement = tgtMovement,
        tgtPolarity = tgtPolarity,
        tgtAccessor = tgtAccessor,
        tgtInterleave = tgtInterleave)
    timings = getJustCanon( timings,
                            fxInc = ['cfa-cfa', 'lq-tailq', 'upp-upp', 'lq-list'],
                            szInc = ['SM', 'ML'],
                            sExcl = [1] )

    timings = trimPer( timings, earlyFilter )

    options = timings.groupby(explanations)
    aggregated = options.agg(
        mean_op_dur_ns = ('mean_op_dur_ns', gMeanNoOutlr)
    ).reset_index()
    annotateBaseline(aggregated, marginalizeOn)

    aggregated = trimPer( aggregated, lateFilter )

    # if examining "why CFA slow", need both
    # - getVariousCfa in place of getJustCanon
    # - annotate-then-filter, because the baseline needs to stay cfa-tailq-upp
    # (filter-then-annotate is fine for general cases (where all three canons are included) and good for build time)

    c_measure = c('OpDurRel', marginalizeOn)
    # options = timings.groupby(explanations)

    # aggregated = options.agg(
    #     **{measure:(measure,gMeanNoOutlr)}
    # ).reset_index()

    c_measureBkt = 'BUCKET_' + c_measure
    aggregated[ c_measureBkt ] = aggregated[c_measure].apply( botOfBucketOfVal )

    drillgrp = aggregated.groupby(drillOn)

    # print(f'measure is {c_measure}')
    # print()
    # print()

    for dkey, dgroup in drillgrp:
        # print(dgroup, file=sys.stderr)

        histo_raw = dgroup[ c_measureBkt ].value_counts()
        for b in buckets:
            if b not in histo_raw.keys():
                # print( f"{b} := 0", file=sys.stderr )
                histo_raw[b] = 0
        histo_raw = histo_raw.sort_index()

        histo = histo_raw.rename("count").reset_index()
        histo = histo.rename(columns={c_measureBkt: "y_lo"})
        y_lo_col_loc = histo.columns.get_loc("y_lo")
        histo.insert(y_lo_col_loc + 1, "y_hi", histo["y_lo"].apply(topValOfBucketBotVal))

        dkey_str = list( map( str, dkey ) )
        header = str.join(', ', dkey_str)
        print(f'"{header}"')
        text = histo.to_csv(header=False, index=False, sep='\t')
        print(text)
        print()
        print()

        # print(f'"{header}" FULL')
        # text = dgroup.to_csv(header=False, index=True, sep='\t')
        # print(text)
        # print()
        # print()

    # print(f'"RAW"')
    # text = timings.to_csv(header=False, index=True, sep='\t')
    # print(text)
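# Hypothetical driver (a sketch; the filter values are assumptions):
#   printHistos( tgtInterleave = 0.0,
#                earlyFilter = { 'machine': ['swift'] },
#                drillOn = ['fx'] )
# would print one histogram block per fx, bucketed by OpDurRel_fx.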