| 1 | import pandas as pd
|
|---|
| 2 | import numpy as np
|
|---|
| 3 | import math
|
|---|
| 4 | import os
|
|---|
| 5 | import sys
|
|---|
| 6 | from subprocess import Popen, PIPE
|
|---|
| 7 | from scipy.stats import gmean
|
|---|
| 8 |
|
|---|
def getDataset( infile ):
    """Load one raw benchmark CSV into a DataFrame and derive analysis columns.

    Returns one row per run with the raw timing fields plus:
      - Args split into ExperimentDurSec/CheckDonePeriod/Length/ExperimentDurOpCount/Seed/InterleaveFrac
      - Program split into fx and op; op split into movement/polarity/accessor
      - NumNodes = Length * Width
      - SizeZone: 'SM', 'ML', or 'none' t-shirt size of NumNodes
    """
    # grep to remove lines that end in comma; these were error runs
    # NOTE(review): shell=True with string concatenation breaks if infile contains
    # spaces/shell metacharacters -- fine for the generated paths used here, but verify
    with Popen("grep '[^,]$' " + infile, shell=True, stdout=PIPE) as process:
        timings = pd.read_csv(
            process.stdout,
            names=['RunMoment', 'RunIdx', 'Args', 'Program', 'Width',
                   'expt_ops_completed', 'expt_elapsed_sec', 'mean_op_dur_ns'],
            dtype={'RunMoment': str,
                   'RunIdx': np.int64,
                   'Args': str,
                   'Program': str,
                   'Width': np.int64,
                   'expt_ops_completed': np.int64,
                   'expt_elapsed_sec': np.float64,
                   'mean_op_dur_ns': np.float64},
            parse_dates=['RunMoment']
        )
    # print(timings.head())

    ## parse executable name and args

    # Args is a whitespace-separated record of the benchmark's command line
    timings[['ExperimentDurSec',
             'CheckDonePeriod',
             'Length',
             'ExperimentDurOpCount',
             'Seed',
             'InterleaveFrac']] = timings['Args'].str.strip().str.split(expand=True)
    timings["Length"] = pd.to_numeric(timings["Length"])
    timings["InterleaveFrac"] = pd.to_numeric(timings["InterleaveFrac"]).round(3)

    timings["NumNodes"] = timings["Length"] * timings["Width"]

    # Program is of the form <prefix>--<fx>--<op>
    timings[['__ProgramPrefix',
             'fx',
             'op']] = timings['Program'].str.split('--', expand=True)

    # op is of the form <movement>-<polarity>-<accessor>
    timings[['movement',
             'polarity',
             'accessor']] = timings['op'].str.split('-', expand=True)

    ## SizeZone as NumNodes t-shirt size
    timings['SizeZone'] = np.select(
        condlist = [
            (4 <= timings['NumNodes']) & (timings['NumNodes'] <= 16),
            (48 <= timings['NumNodes']) & (timings['NumNodes'] <= 256)
        ],
        choicelist = [
            'SM',
            'ML'
        ],
        default = 'none'  # sizes outside both bands (e.g. 1, 32) get no zone
    )

    return timings
|
|---|
| 63 |
|
|---|
# `c` = column name
def c( baseName, marginalizeOn ):
    """Build the derived-column name for a measure under one marginalization,
    e.g. c('Baseline', ['fx', 'machine']) -> 'Baseline_fx_machine'."""
    return baseName + "_" + "_".join( marginalizeOn )
|
|---|
| 68 |
|
|---|
# the explanatory factors; baselines condition on some and marginalize over the rest
explanations = ['movement', 'polarity', 'accessor',
                'NumNodes',
                'SizeZone', # note fd: NumNodes -> SizeZone
                'fx',
                'machine',
                'InterleaveFrac', # unused and always zero
                ]
|
|---|
| 76 |
|
|---|
# helper for avoiding pollution from e.g. alternate cfa list versions
# when a preference-limiting factor is marginalized, make bl value from preferred subset
# but still stamp result everywhere; e.g. even cfa-strip has canon-bl-relative perf
# when conditioning on such factor, peer groups are already small enough to stop such pollution
# use nontrivial marginalizeOn when calculating baseline values, to achieve the above outside-canonical behaviour non-degenerately
# use default full marginalizeOn when removing points from a graph, which leaves only canonical points
def getJustCanon( timings,
                  marginalizeOn = explanations, *,
                  # no c++: bl is for comparing intrusives
                  # no lq-list: sparse
                  # no cfa-fredDisbled: bl is for comparing prod-readies
                  fxInc = ['cfa-cfa', 'lq-tailq', 'upp-upp'],
                  szInc = ['SM', 'ML'],
                  sExcl = [1]
                  ): # all explanations marginalized => maximally aggressive filter
    """Drop non-canonical rows for each factor that is being marginalized.

    For every factor named in marginalizeOn, keep only the canonical values:
    fx in fxInc, SizeZone in szInc, and NumNodes outside sExcl.
    Factors being conditioned on are left unfiltered.
    """
    if 'fx' in marginalizeOn:
        timings = timings.loc[ timings['fx'].isin(fxInc) ]
    if 'SizeZone' in marginalizeOn:
        timings = timings.loc[ timings['SizeZone'].isin(szInc) ]
    if 'NumNodes' in marginalizeOn:
        timings = timings.loc[ ~timings['NumNodes'].isin(sExcl) ]
    return timings
|
|---|
| 102 |
|
|---|
| 103 |
|
|---|
def annotateBaseline( timings, marginalizeOn ):
    """Annotate `timings` in place with the columns for one marginalization.

    Adds (named via c(...), suffixed with the marginalized factors):
      Peers_%    -- size of the canonical peer group a row is compared against
      Baseline_% -- gmean of mean_op_dur_ns over that peer group
                    (NaN in the synthetic SizeZone-only case)
      OpDurRel_% -- the row's mean_op_dur_ns relative to its baseline

    Idempotent: returns early if the target columns already exist.
    Recurses (general case only) to build supporting annotations for the
    synthetic SizeZone-only baseline.
    """
    c_tgtPeers = c( 'Peers', marginalizeOn )
    c_tgtBl = c("Baseline", marginalizeOn)
    c_tgtRel = c("OpDurRel", marginalizeOn)
    # idempotence guard: the pair of target columns exists together or not at all
    if c_tgtBl in timings.columns or c_tgtRel in timings.columns:
        assert( c_tgtBl in timings.columns and c_tgtRel in timings.columns )
        return
    # size handling:
    # two ordinary baselines (sz-nn, nn) and one synthetic baseline (sz)
    # the SizeZone-only baseline has no interpretation wrt a real peer group
    # it isolates the effect of belonging to one SZ or the other
    # while conditioning away the specific-size effects within the SZ
    # notably in zone SM, opDur-v-size usually pitches upward
    # comparing to sz-only baseline gets rid of "they all pitch up," while keeping "SM is faster than ML"
    if 'SizeZone' in marginalizeOn and 'NumNodes' not in marginalizeOn:
        # special case: sz-only synthetic benchmark
        # NOTE(review): list(set(...)) ordering is arbitrary; it stays consistent here
        # because the same list object names both the recursive annotation and the lookup,
        # but equivalent sets in other calls could yield differently-ordered column names
        margNeither = list( set(marginalizeOn) - {'SizeZone'} )
        margBoth = list( set(marginalizeOn) | {'NumNodes'} )
        margJustNn = list( set(margNeither) | {'NumNodes'} )
        # ensure the three supporting annotations exist (recursion hits the general case)
        annotateBaseline( timings, margNeither )
        annotateBaseline( timings, margBoth )
        annotateBaseline( timings, margJustNn )
        c_neitherRel = c("OpDurRel", margNeither)
        c_bothBl = c("Baseline", margBoth)
        c_justNnBl = c("Baseline", margJustNn)
        # no real peer group => no meaningful baseline value
        timings[ c_tgtBl ] = np.nan
        timings[ c_tgtRel ] = timings[ c_justNnBl ] / timings[ c_bothBl ] * timings[ c_neitherRel ]
    else: # general case
        # prevent non-canonical samples from polluting baseline values
        # note, depending on the presentation, the polluting points may already be removed from timings entirely
        canonSrc = getJustCanon(timings, marginalizeOn)
        # print(f"for marg on {marginalizeOn}, |canonSrc| = {len(canonSrc)}, |timings| = {len(timings)}", file=sys.stderr)
        conditionOn = list( set(explanations) - set(marginalizeOn) )
        # print( "marginalizing on", marginalizeOn, "conditioning on", conditionOn, file=sys.stderr )

        if conditionOn:
            # per peer group: count and gmean over the canonical samples only
            stats = canonSrc.groupby(conditionOn)['mean_op_dur_ns'].agg(**{
                c_tgtPeers: 'count',
                c_tgtBl: gmean
            })
            # stamp every row (canonical or not) with its group's stats
            group_lookup = timings.set_index(conditionOn).index
            timings[c_tgtPeers] = stats[c_tgtPeers].reindex(group_lookup).values
            timings[c_tgtBl] = stats[c_tgtBl].reindex(group_lookup).values
        else:
            # nothing left to condition on: one global peer group
            stats = canonSrc.groupby((lambda _: 0))['mean_op_dur_ns'].agg(**{
                c_tgtPeers: 'count',
                c_tgtBl: gmean
            })
            # Extract the single row
            row = stats.iloc[0]
            # Broadcast to all rows
            timings[c_tgtPeers] = row[c_tgtPeers]
            timings[c_tgtBl] = row[c_tgtBl]


        # everywhere := itself / [preferred-subset derived]
        timings[c_tgtRel] = timings['mean_op_dur_ns'] / timings[c_tgtBl]
|
|---|
| 161 |
|
|---|
| 162 |
|
|---|
# longer column name (Peers_%, Baseline_%, OpDurRel_%) gives larger peer group and more (total) variation
def annotateCommonBaselines( timings ):
    """Annotate timings with every commonly used baseline/relative column set,
    each general marginalization applied both alone and jointly with fx."""
    generalMargs = [
        [],                         # all-in baseline (all factors conditioned): only inter-run differences
        ['movement', 'polarity'],
        ['accessor'],
        ['machine'],
        ['SizeZone', 'NumNodes'],   # SizeZone is NOT redundant; conditioned on neither
        ['NumNodes'],               # still conditioned on SizeZone
        ['SizeZone'],               # synthetic: conditioned on NumNodes but not SizeZone
    ]
    for background in ([], ['fx']):
        for marg in generalMargs:
            annotateBaseline( timings, background + marg )
|
|---|
| 178 |
|
|---|
def getMachineDataset( dsname, machine ):
    """Load one machine's results file for dataset `dsname` and tag each row
    with the machine name (becomes the 'machine' explanation factor)."""
    infileLocal = f"results-{machine}-{dsname}.csv"
    # results files live in the benchmark source tree, relative to this script
    infile = os.path.dirname(os.path.abspath(__file__)) + '/../benchmarks/list/' + infileLocal
    timings = getDataset( infile )
    timings['machine'] = machine
    return timings
|
|---|
| 185 |
|
|---|
# machines whose results files are loaded by default
allMachines = ['swift', 'java']
|
|---|
| 187 |
|
|---|
| 188 |
|
|---|
# general, as in exclude the stripped-down experimental CFAs
general_fxs_full = ['cfa-cfa', 'cpp-stlref', 'upp-upp', 'lq-tailq', 'lq-list']
# as above, further restricted to intrusive implementations (drops cpp-stlref)
general_fxs_intrusive = ['cfa-cfa', 'upp-upp', 'lq-tailq', 'lq-list']
|
|---|
| 192 |
|
|---|
def getSingleResults(
        dsname = 'general',
        machines = allMachines,
        *,
        fxs = general_fxs_full,
        tgtMovement = 'all',
        tgtPolarity = 'all',
        tgtAccessor = 'all',
        tgtInterleave = 0.0 ):
    """Load all machines' results for `dsname` and filter to one selection.

    Each tgtMovement/tgtPolarity/tgtAccessor/tgtInterleave may be 'all' to
    skip that filter.  Rows are regrouped in `fxs` order; a requested fx or
    target value absent from the data raises KeyError (via get_group).
    """

    timings = pd.concat([
        getMachineDataset( dsname, m )
        for m in machines ])

    # print(timings, file=sys.stderr)

    # (removed: movements/polarities/accessors/interleaves unique() scan --
    # the results were never used)

    # print(f"trying to filter {dsname} {machines} {len(timings)}", file=sys.stderr)
    # keep only the requested fxs; concat order groups rows by fx in fxs order
    grp = timings.groupby('fx')
    # print(f"with fxs {grp.groups.keys()}", file=sys.stderr)
    timings = pd.concat([
        grp.get_group(fx)
        for fx in fxs ])

    if (tgtMovement != 'all'):
        grp = timings.groupby('movement')
        timings = grp.get_group(tgtMovement)
    if (tgtPolarity != 'all'):
        grp = timings.groupby('polarity')
        timings = grp.get_group(tgtPolarity)
    if (tgtAccessor != 'all'):
        grp = timings.groupby('accessor')
        timings = grp.get_group(tgtAccessor)
    if (tgtInterleave != 'all'):
        timings = timings[ timings['InterleaveFrac'] == float(tgtInterleave) ]


    return timings
|
|---|
| 242 |
|
|---|
def stripMachine(pyCore):
    """Drop the final dash-separated component (the machine name) from a core
    name, e.g. 'cfa-cfa-swift' -> 'cfa-cfa'."""
    return '-'.join( pyCore.split('-')[:-1] )
|
|---|
| 247 |
|
|---|
def getSummaryMeta(metaFileCore):
    """Read the op-summary metadata table (script-relative `<core>-meta.dat`,
    tab-separated OpIx/Op) and expand its compact op codes.

    Op is split on a literal backslash-n into movement/polarity/accessor,
    then the shorthand codes are expanded to the names used in the timings.
    """
    metafile = os.path.dirname(os.path.abspath(__file__)) + "/" + metaFileCore + '-meta.dat'
    metadata = pd.read_csv(
        metafile,
        names=['OpIx', 'Op'],
        delimiter='\t'
    )
    metadata[['movement',
              'polarity',
              'accessor']] = metadata['Op'].str.split('\\\\n', expand=True)
    # single dict-based pass replaces the previous chain of eight whole-frame
    # replace() calls; no mapped value collides with a later key, so the
    # result is identical
    metadata.replace({
        '*': 'all',
        'S': 'stack',
        'Q': 'queue',
        'iF': 'insfirst',
        'iL': 'inslast',
        'H': 'allhead',
        'Ie': 'inselem',
        'Re': 'remelem',
    }, inplace=True)
    return metadata
|
|---|
| 267 |
|
|---|
| 268 | def printSingleSummaryFrom( measure, dfgrouped, *,
|
|---|
| 269 | file = sys.stdout,
|
|---|
| 270 | index = True,
|
|---|
| 271 | end = '\n' ) :
|
|---|
| 272 | aggregated = dfgrouped[measure].agg([
|
|---|
| 273 | ("gmean", gmean), "std", "min", "max", "count",
|
|---|
| 274 | lambda x: x.quantile(0.025),
|
|---|
| 275 | lambda x: x.quantile(0.16),
|
|---|
| 276 | lambda x: x.quantile(0.5),
|
|---|
| 277 | lambda x: x.quantile(0.84),
|
|---|
| 278 | lambda x: x.quantile(0.975)
|
|---|
| 279 | ])
|
|---|
| 280 | text = aggregated.to_csv(header=False, index=index, sep='\t')
|
|---|
| 281 | print(text, file=file, end=end)
|
|---|
| 282 |
|
|---|
| 283 |
|
|---|
# NumNodes ranges judged trustworthy per machine (named lambdas -> defs, PEP 8)
def swiftSweetspot(x):
    return 16 < x < 150
# an earlier, tighter swift range: 4 < x < 32
def javaSweetspot(x):
    return 24 <= x <= 256
|
|---|
| 287 |
|
|---|
def printManySummary(*,
                     dsname = 'general',
                     machines = allMachines,
                     metafileCore,
                     fxs,
                     sizeQual,
                     tgtInterleave = 0.0,
                     marginalizeOn = ['fx'] ) :
    """Print one TSV summary row per (op, fx) pair for gnuplot consumption.

    For each op listed in the `metafileCore` metadata file, loads the matching
    timings, annotates the OpDurRel measure under `marginalizeOn`, keeps only
    fxs in `fxs` and sizes passing the `sizeQual` predicate, and prints the
    per-group summary (no per-call return value; output goes to stdout).
    """

    metadata = getSummaryMeta(metafileCore)

    measure = c( 'OpDurRel', marginalizeOn )

    print("# op_num\tfx_num\tfx\tmean\tstdev\tmin\tmax\tcount\tpl95\tpl68\tp50\tph68\tph95")

    for op in metadata.itertuples():
        timings = getSingleResults(dsname, machines,
                                   fxs=fxs,
                                   tgtMovement = op.movement,
                                   tgtPolarity = op.polarity,
                                   tgtAccessor = op.accessor,
                                   tgtInterleave = tgtInterleave )
        # annotate before the size filter so baselines see the full data
        annotateBaseline(timings, marginalizeOn)

        timings = timings[ timings['fx'].isin(fxs) ]
        timings = timings[ timings['NumNodes'].apply(sizeQual) ]

        # fx_num is the 1-based position within fxs, used as a plot series id
        fxnums = timings['fx'].apply(
            lambda fx: fxs.index(fx) + 1
        )
        timings.insert(loc=0, column='fx_num', value=fxnums)
        timings.insert(loc=0, column='op_num', value=op.OpIx)

        grouped = timings.groupby(['op_num', 'fx_num', 'fx'])
        printSingleSummaryFrom( measure, grouped, end ='' )
|
|---|
| 323 |
|
|---|
| 324 |
|
|---|
def printSingleDetail(
        dsname = 'general',
        machines = allMachines,
        *,
        fxs = general_fxs_full,
        tgtMovement = 'all',
        tgtPolarity = 'all',
        tgtAccessor = 'all',
        tgtInterleave = 0.0,
        measureBase = 'mean_op_dur_ns',
        marginalizeOn = explanations ):
    """Print, per fx, a TSV of `measureBase` aggregated by NumNodes
    (mean/std/min/max/count/sum plus an outlier-trimmed mean).

    measureBase is either raw 'mean_op_dur_ns' or 'OpDurRel' (which triggers
    baseline annotation under `marginalizeOn`). Output goes to stdout in
    gnuplot index-block format (blank-line separated, quoted fx header).
    """


    timings = getSingleResults(dsname, machines,
                               fxs = fxs,
                               tgtMovement = tgtMovement,
                               tgtPolarity = tgtPolarity,
                               tgtAccessor = tgtAccessor,
                               tgtInterleave = tgtInterleave)

    if measureBase == 'OpDurRel':
        annotateBaseline(timings, marginalizeOn)
        measure = c( measureBase, marginalizeOn )
    elif measureBase == 'mean_op_dur_ns':
        measure = measureBase
    else:
        raise RuntimeError(f"measureBase '{measureBase}' not handled")

    groupedFx = timings.groupby('fx')
    for fx, fgroup in groupedFx:
        # print(fgroup.head())
        groupedRun = fgroup.groupby(['NumNodes']) # , 'fx', 'op'
        aggregated = groupedRun[measure].agg(
            ["mean", "std", "min", "max", "count", "sum"]
        )
        # arithmetic mean with the single min and max samples excluded
        aggregated['mean_no_outlr'] = (
            ( aggregated['sum'] - aggregated['min'] - aggregated['max'] )
            /
            ( aggregated['count'] - 2 )
        )

        #print(aggregated.head())

        print('"{header}"'.format(header=fx))
        text = aggregated.to_csv(header=False, index=True, sep='\t')
        print(text)
        print()
        print()
|
|---|
| 373 |
|
|---|
def aMeanNoOutlr(values):
    """Arithmetic mean of `values` with the single min and max samples excluded.

    `values` is a pandas Series (anything with sum/min/max/count works).
    Meaningful only for count > 2.  (Parameter renamed from `range`, which
    shadowed the builtin.)
    """
    return ( values.sum() - values.min() - values.max() ) / ( values.count() - 2 )
|
|---|
| 376 |
|
|---|
def gMeanNoOutlr(values):
    """Geometric mean of `values` with the single min and max samples excluded.

    `values` is a pandas Series of positive numbers; meaningful only for
    count > 2.  (Parameter renamed from `range`, which shadowed the builtin.)
    """
    return ( values.prod() / values.min() / values.max() ) ** ( 1 / ( values.count() - 2 ) )
|
|---|
| 379 |
|
|---|
| 380 |
|
|---|
def trimPer( df, criteria ):
    """Filter `df` to rows matching every criterion.

    `criteria` maps a column name to the collection of values allowed in it;
    an empty dict returns `df` unchanged.
    """
    for column, allowed in criteria.items():
        df = df.loc[ df[column].isin(allowed) ]
    return df
|
|---|
| 386 |
|
|---|
| 387 | # The range from 0.9759 to 1.0247 (which is 1.05 x wide) has 1.0 in its centre.
|
|---|
| 388 | # This is the bucket with key 0.
|
|---|
| 389 | # Logs of values in this bucket go from -0.5 to +0.5.
|
|---|
| 390 | # Rounding a log value to the nearest integer gives the key.
|
|---|
| 391 | # Exponentiating a key directly gives the centre of its bucket.
|
|---|
| 392 | # Exponentiating a key less 0.5 gives the bottom of its bucket.
|
|---|
| 393 | # Gnuplot expects the latter.
|
|---|
| 394 |
|
|---|
# histogram geometry: log-scale buckets, each bucketGrain (1.05x) wide,
# with bucket key 0 centred on 1.0; keys span [bktKeyLo, bktKeyHi]
bucketMin = 0.25
bucketMax = 4.0
bucketGrain = 1.05
bktKeyLo = math.floor( math.log(bucketMin) / math.log(bucketGrain) )
bktKeyHi = math.ceil( math.log(bucketMax) / math.log(bucketGrain) )
|
|---|
| 400 |
|
|---|
def bktKeyOfVal( relDur ):
    """Integer bucket key for a relative duration; key 0's bucket is centred on 1.0."""
    return round( math.log(relDur, bucketGrain) )
|
|---|
| 405 |
|
|---|
def bktIxOfVal( relDur ):
    """Zero-based bucket index of a relative duration."""
    key = bktKeyOfVal( relDur )
    return bktKeyToIx( key )
|
|---|
| 408 |
|
|---|
def botValOfBucketK( key ):
    """Value at the bottom edge of bucket `key` (the form gnuplot expects)."""
    return math.pow( bucketGrain, key - 0.5 )
|
|---|
| 411 |
|
|---|
def topValOfBucketBotVal( botVal ):
    """Top edge of a bucket, given its bottom edge (one grain higher)."""
    return botVal * bucketGrain
|
|---|
| 414 |
|
|---|
def bktKeyToIx( key ):
    """Shift a (possibly negative) bucket key into a zero-based index."""
    ix = key - bktKeyLo
    return ix
|
|---|
| 417 |
|
|---|
def bktIxToKey( ix ):
    """Inverse of bktKeyToIx: recover the bucket key from a zero-based index."""
    return bktKeyLo + ix
|
|---|
| 420 |
|
|---|
def botOfBucketOfVal( relDur ):
    """Bottom edge of the bucket that contains `relDur`."""
    key = bktKeyOfVal( relDur )
    return botValOfBucketK( key )
|
|---|
| 423 |
|
|---|
# bottom edges of all histogram buckets
# NOTE(review): range() excludes bktKeyHi, so the topmost bucket is omitted -- confirm intended
buckets = [ botValOfBucketK(key) for key in range(bktKeyLo, bktKeyHi) ]
|
|---|
| 425 |
|
|---|
| 426 | # printSingleDetail
|
|---|
def printHistos(*,
                tgtMovement = 'all',
                tgtPolarity = 'all',
                tgtAccessor = 'all',
                tgtInterleave = 0.0,
                earlyFilter = {}, # exclude from benchmarking
                lateFilter = {}, # exclude from output
                drillOn = ['fx'],
                marginalizeOn = None, # None means match drill-on
                sumFile = sys.stdout,
                detFile = sys.stdout ):
    """Print, per drillOn group, a summary block (to sumFile) and a
    log-bucketed histogram of OpDurRel (to detFile), gnuplot index-block style.

    earlyFilter removes rows before aggregation/baselines; lateFilter removes
    aggregated points only from the output. marginalizeOn defaults to drillOn.
    """

    if marginalizeOn == None:  # NOTE(review): `is None` would be more idiomatic
        marginalizeOn = drillOn

    # watch out for filtering too early here; need everything sticking around until baselines are applied
    # ie, maybe I should get rid of all the tgt parms at the pre-benchmark layers
    timings = getSingleResults(
        tgtMovement = tgtMovement,
        tgtPolarity = tgtPolarity,
        tgtAccessor = tgtAccessor,
        tgtInterleave = tgtInterleave)
    timings = getJustCanon( timings,
                            fxInc = ['cfa-cfa', 'lq-tailq', 'upp-upp', 'lq-list'],
                            szInc = ['SM', 'ML'],
                            sExcl = [1] )

    timings = trimPer( timings, earlyFilter )

    # collapse repeated runs of each configuration to one outlier-trimmed gmean
    options = timings.groupby(explanations)
    aggregated = options.agg(
        mean_op_dur_ns = ('mean_op_dur_ns', gMeanNoOutlr)
    ).reset_index()
    annotateBaseline(aggregated, marginalizeOn)

    aggregated = trimPer( aggregated, lateFilter )

    # if examining "why CFA slow" need both
    # - getVariousCfa in place of getJustCanon
    # - do annotate-then-filter because baseline needs to stay cfa-tailq-upp
    # (filter-then-annotate is fine for general cases (where all three canons are included) and good for build time)


    c_measure = c('OpDurRel', marginalizeOn)
    # options = timings.groupby(explanations)

    # aggregated = options.agg(
    #     **{measure:(measure,gMeanNoOutlr)}
    # ).reset_index()

    # bucket each point by the bottom edge of its log-scale bucket
    c_measureBkt = 'BUCKET_' + c_measure
    aggregated[ c_measureBkt ] = aggregated[c_measure].apply( botOfBucketOfVal )

    drillgrp = aggregated.groupby(drillOn)

    # print(f'measure is {measure}')
    # print()
    # print()

    for dkey, dgroup in drillgrp:
        # print(mgroup, file=sys.stderr)

        dkey_str = list( map( str, dkey ) )
        header = str.join(', ', dkey_str)

        # constant-key groupby: one group holding the whole drill group
        dgroup_sole = dgroup.groupby((lambda _: 0))
        print(f'"{header}"', file=sumFile)
        printSingleSummaryFrom(
            c_measure, dgroup_sole, file=sumFile, index=False )
        print(file=sumFile)
        print(file=sumFile)

        # histogram of bucket counts; force-fill empty buckets with zero
        histo_raw = dgroup[ c_measureBkt ].value_counts()
        for b in buckets:
            if b not in histo_raw.keys():
                # print( f"{b} := 0", file=sys.stderr )
                histo_raw[b] = 0
        histo_raw = histo_raw.sort_index()

        # reshape to (y_lo, y_hi, count) columns for gnuplot boxes
        histo = histo_raw.rename("count").reset_index()
        histo = histo.rename(columns={c_measureBkt: "y_lo"})
        y_lo_col_loc = histo.columns.get_loc("y_lo")
        histo.insert(y_lo_col_loc + 1, "y_hi", histo["y_lo"].apply(topValOfBucketBotVal))

        print(f'"{header}"', file=detFile)
        text = histo.to_csv(header=False, index=False, sep='\t')
        print(text, file=detFile)
        print(file=detFile)
        print(file=detFile)

        # print(f'"{header}" FULL')
        # text = group.to_csv(header=False, index=True, sep='\t')
        # print(text)
        # print()
        # print()

    # print(f'"RAW"')
    # text = timings.to_csv(header=False, index=True, sep='\t')
    # print(text)