source: doc/theses/mike_brooks_MMath/plots/ListCommon.py@ bf73608

Last change on this file since bf73608 was bf73608, checked in by Michael Brooks <mlbrooks@…>, 4 days ago

revisions to ll perf intro and graph formatting

  • Property mode set to 100644
File size: 18.6 KB
Line 
import io
import math
import os
import sys
from subprocess import Popen, PIPE

import numpy as np
import pandas as pd
from scipy.stats import gmean
8
def getDataset( infile ):
    """Load one benchmark-results CSV into a tidy DataFrame.

    Drops error runs (lines ending in a comma, plus blank lines), parses the
    benchmark invocation (Args) and executable name (Program) into factor
    columns, and adds the derived columns NumNodes and SizeZone.
    """
    # Drop error runs: same filter as `grep '[^,]$'` (lines whose last
    # character is a comma, and empty lines), but done in-process so no
    # shell is involved and infile needs no quoting.
    with open(infile) as f:
        goodLines = [ line for line in f
                      if line.rstrip('\n') and not line.rstrip('\n').endswith(',') ]
    timings = pd.read_csv(
        io.StringIO(''.join(goodLines)),
        names=['RunMoment', 'RunIdx', 'Args', 'Program', 'Width',
               'expt_ops_completed', 'expt_elapsed_sec', 'mean_op_dur_ns'],
        dtype={'RunMoment': str,
               'RunIdx': np.int64,
               'Args': str,
               'Program': str,
               'Width': np.int64,
               'expt_ops_completed': np.int64,
               'expt_elapsed_sec': np.float64,
               'mean_op_dur_ns': np.float64},
        parse_dates=['RunMoment']
    )
    # print(timings.head())

    ## parse executable name and args

    # Args is the space-separated benchmark command line
    timings[['ExperimentDurSec',
             'CheckDonePeriod',
             'Length',
             'ExperimentDurOpCount',
             'Seed',
             'InterleaveFrac']] = timings['Args'].str.strip().str.split(expand=True)
    timings["Length"] = pd.to_numeric(timings["Length"])
    timings["InterleaveFrac"] = pd.to_numeric(timings["InterleaveFrac"]).round(3)

    # total elements managed: Length elements in each of Width lists
    timings["NumNodes"] = timings["Length"] * timings["Width"]

    # Program looks like <prefix>--<fx>--<movement>-<polarity>-<accessor>
    timings[['__ProgramPrefix',
             'fx',
             'op']] = timings['Program'].str.split('--', expand=True)

    timings[['movement',
             'polarity',
             'accessor']] = timings['op'].str.split('-', expand=True)

    ## SizeZone as NumNodes t-shirt size; sizes outside both bands get 'none'
    timings['SizeZone'] = np.select(
        condlist = [
            ( 6 <= timings['NumNodes']) & (timings['NumNodes'] <= 20),
            (50 <= timings['NumNodes']) & (timings['NumNodes'] <= 200)
        ],
        choicelist = [
            'SM',
            'ML'
        ],
        default = 'none'
    )

    return timings
63
# `c` = column name
def c( baseName, marginalizeOn ):
    """Column name for statistic `baseName` marginalized over the given factors."""
    return f"{baseName}_{'_'.join(marginalizeOn)}"
68
# Factor columns that explain timing variation.  Baselines marginalize over a
# subset of these and condition on the rest (see annotateBaseline).
explanations = ['movement', 'polarity', 'accessor',
                'NumNodes', 'Width', 'Length',
                'SizeZone', # note fd: NumNodes -> SizeZone
                'fx',
                'machine',
                'InterleaveFrac', # unused and always zero
                ]
76
# helper for avoiding pollution from e.g. alternate cfa list versions
# when a preference-limiting factor is marginalized, make bl value from preferred subset
# but still stamp result everywhere; e.g. even cfa-strip has canon-bl-relative perf
# when conditioning on such factor, peer groups are already small enough to stop such pollution
# use nontrivial marginalizeOn when calculating baseline values, to achieve the above outside-canonical behaviour non-degenerately
# use default full marginalizeOn when removing points from a graph, which leaves only canonical points
def getJustCanon( timings,
                  marginalizeOn = None, *,
                  # no c++: bl is for comparing intrusives
                  # no lq-list: sparse
                  # no cfa-fredDisbled: bl is for comparing prod-readies
                  fxInc = ('cfa-cfa', 'lq-tailq', 'upp-upp'),
                  szInc = ('SM', 'ML'),
                  sExcl = (1,)
                  ):
    """Filter `timings` down to its canonical subset.

    A filter applies only when its factor appears in marginalizeOn:
    fx must be in fxInc, SizeZone in szInc, NumNodes not in sExcl.
    marginalizeOn defaults to `explanations` (all factors marginalized
    => maximally aggressive filter).  Defaults are immutable tuples
    resolved at call time, so callers cannot accidentally share state.
    """
    if marginalizeOn is None:
        marginalizeOn = explanations
    if 'fx' in marginalizeOn:
        fxIsCanon = timings['fx'].isin(fxInc)
        timings = timings[ fxIsCanon ]
    if 'SizeZone' in marginalizeOn:
        szIsCanon = timings['SizeZone'].isin(szInc)
        timings = timings[ szIsCanon ]
    if 'NumNodes' in marginalizeOn:
        sIsCanon = ~ timings['NumNodes'].isin(sExcl)
        timings = timings[ sIsCanon ]
    return timings
102
103
def annotateBaseline( timings, marginalizeOn ):
    """Stamp baseline columns onto `timings` in place, for one marginalization.

    Adds (with <slug> = marginalizeOn factor names joined by '_'):
      Peers_<slug>    -- size of the canonical peer group each row is compared to
      Baseline_<slug> -- geometric mean of mean_op_dur_ns over that peer group
      OpDurRel_<slug> -- the row's mean_op_dur_ns relative to its baseline
    Peer groups condition on every factor in `explanations` that is not in
    marginalizeOn.  Idempotent: returns immediately if the columns already
    exist.  In the synthetic SizeZone-only case the Baseline column is NaN
    and no Peers column is added.
    """
    c_tgtPeers = c( 'Peers', marginalizeOn )
    c_tgtBl = c("Baseline", marginalizeOn)
    c_tgtRel = c("OpDurRel", marginalizeOn)
    # already annotated (possibly by a recursive call below) => nothing to do
    if c_tgtBl in timings.columns or c_tgtRel in timings.columns:
        assert( c_tgtBl in timings.columns and c_tgtRel in timings.columns )
        return
    # size handling:
    # two ordinary baselines (sz-nn, nn) and one synthetic baseline (sz)
    # the SizeZone-only baseline has no interpretation wrt a real peer group
    # it isolates the effect of belonging to one SZ or the other
    # while conditioning away the specific-size effects within the SZ
    # notably in zone SM, opDur-v-size usually pitches upward
    # comparing to sz-only baseline gets rid of "they all pitch up," while keeping "SM is faster than ML"
    if 'SizeZone' in marginalizeOn and 'NumNodes' not in marginalizeOn:
        # special case: sz-only synthetic benchmark
        # compose the synthetic relative value from three real annotations
        margNeither = list( set(marginalizeOn) - {'SizeZone'} )
        margBoth = list( set(marginalizeOn) | {'NumNodes'} )
        margJustNn = list( set(margNeither) | {'NumNodes'} )
        annotateBaseline( timings, margNeither )
        annotateBaseline( timings, margBoth )
        annotateBaseline( timings, margJustNn )
        c_neitherRel = c("OpDurRel", margNeither)
        c_bothBl = c("Baseline", margBoth)
        c_justNnBl = c("Baseline", margJustNn)
        # no real peer group exists for this case, so no meaningful baseline value
        timings[ c_tgtBl ] = np.nan
        timings[ c_tgtRel ] = timings[ c_justNnBl ] / timings[ c_bothBl ] * timings[ c_neitherRel ]
    else: # general case
        # prevent non-canonical samples from polluting baseline values
        # note, depending on the presentation, the polluting points may already be removed from timings entirely
        canonSrc = getJustCanon(timings, marginalizeOn)
        # print(f"for marg on {marginalizeOn}, |canonSrc| = {len(canonSrc)}, |timings| = {len(timings)}", file=sys.stderr)
        conditionOn = list( set(explanations) - set(marginalizeOn) )
        # print( "marginalizing on", marginalizeOn, "conditioning on", conditionOn, file=sys.stderr )

        if conditionOn:
            # per-group count and geometric mean, computed from canonical rows only
            stats = canonSrc.groupby(conditionOn)['mean_op_dur_ns'].agg(**{
                c_tgtPeers: 'count',
                c_tgtBl: gmean
            })
            # align group stats back onto every row of timings (canonical or not)
            group_lookup = timings.set_index(conditionOn).index
            timings[c_tgtPeers] = stats[c_tgtPeers].reindex(group_lookup).values
            timings[c_tgtBl] = stats[c_tgtBl].reindex(group_lookup).values
        else:
            # nothing conditioned on: one global peer group (constant-key groupby)
            stats = canonSrc.groupby((lambda _: 0))['mean_op_dur_ns'].agg(**{
                c_tgtPeers: 'count',
                c_tgtBl: gmean
            })
            # Extract the single row
            row = stats.iloc[0]
            # Broadcast to all rows
            timings[c_tgtPeers] = row[c_tgtPeers]
            timings[c_tgtBl] = row[c_tgtBl]


        # everywhere := itself / [preferred-subset derived]
        timings[c_tgtRel] = timings['mean_op_dur_ns'] / timings[c_tgtBl]
161
162
# longer column name (Peers_%, Baseline_%, OpDurRel_%) gives larger peer group and more (total) variation
def annotateCommonBaselines( timings ):
    """Annotate `timings` with every baseline/relative column the plots use."""
    generalMargs = [
        [],                          # all-in baseline (all factors conditioned): only inter-run differences
        ['movement', 'polarity'],
        ['accessor'],
        ['machine'],
        ['SizeZone', 'NumNodes'],    # SizeZone is NOT redundant; conditioned on neither
        ['NumNodes'],                # still conditioned on SizeZone
        ['SizeZone'],                # synthetic: conditioned on NumNodes but not SizeZone
    ]
    # once with no background factors, once also marginalizing fx
    for background in ( [], ['fx'] ):
        for marg in generalMargs:
            annotateBaseline( timings, background + marg )
178
def getMachineDataset( dsname, machine ):
    """Load one machine's results CSV from ../benchmarks/list, tagging rows with the machine name."""
    scriptDir = os.path.dirname(os.path.abspath(__file__))
    infile = scriptDir + '/../benchmarks/list/' + f"results-{machine}-{dsname}.csv"
    timings = getDataset( infile )
    timings['machine'] = machine
    return timings
185
# machines whose result CSVs are combined by default
allMachines = ['swift', 'java']
187
188
# general, as in exclude the stripped-down experimental CFAs
general_fxs_full = ['cfa-cfa', 'cpp-stlref', 'upp-upp', 'lq-tailq', 'lq-list']
# as above, minus cpp-stlref (the non-intrusive implementation)
general_fxs_intrusive = ['cfa-cfa', 'upp-upp', 'lq-tailq', 'lq-list']
192
def getSingleResults(
    dsnames = ['general'],
    machines = allMachines,
    *,
    fxs = general_fxs_full,
    tgtMovement = 'all',
    tgtPolarity = 'all',
    tgtAccessor = 'all',
    tgtInterleave = 0.0 ):
    """Load and combine the requested datasets, then filter to the requested
    implementations and operation.

    A tgt* value of 'all' leaves that factor unfiltered; tgtInterleave is
    otherwise matched numerically against InterleaveFrac.
    Returns the filtered DataFrame.
    """

    # pool every requested dataset/machine combination
    timings = pd.concat([
        getMachineDataset( d, m )
        for d in dsnames
        for m in machines ])

    # keep only requested implementations, in fxs order;
    # get_group raises KeyError if an fx has no rows
    grp = timings.groupby('fx')
    timings = pd.concat([
        grp.get_group(fx)
        for fx in fxs ])

    # (the former movements/polarities/accessors/interleaves locals were
    # computed here but never used; removed as dead code)

    if (tgtMovement != 'all'):
        grp = timings.groupby('movement')
        timings = grp.get_group(tgtMovement)
    if (tgtPolarity != 'all'):
        grp = timings.groupby('polarity')
        timings = grp.get_group(tgtPolarity)
    if (tgtAccessor != 'all'):
        grp = timings.groupby('accessor')
        timings = grp.get_group(tgtAccessor)
    if (tgtInterleave != 'all'):
        timings = timings[ timings['InterleaveFrac'] == float(tgtInterleave) ]

    return timings
243
def stripMachine(pyCore):
    """Drop the trailing '-<machine>' component from a dash-joined core name."""
    return '-'.join( pyCore.split('-')[:-1] )
248
def getSummaryMeta(metaFileCore):
    """Load the operation legend (<core>-meta.dat, tab-separated, beside this
    script) and expand its abbreviated labels into the long factor names."""
    metafile = os.path.dirname(os.path.abspath(__file__)) + "/" + metaFileCore + '-meta.dat'
    metadata = pd.read_csv(
        metafile,
        names=['OpIx', 'Op'],
        delimiter='\t'
    )
    # Op is three abbreviated labels separated by a literal backslash-n pair
    metadata[['movement',
              'polarity',
              'accessor']] = metadata['Op'].str.split('\\\\n', expand=True)
    # expand plot-label abbreviations to the names used in the timing data
    longNames = {
        '*': 'all',
        'S': 'stack',
        'Q': 'queue',
        'iF': 'insfirst',
        'iL': 'inslast',
        'H': 'allhead',
        'Ie': 'inselem',
        'Re': 'remelem',
    }
    for short, long in longNames.items():
        metadata.replace(short, long, inplace=True)
    return metadata
268
def printSingleSummaryFrom( measure, dfgrouped, *,
                            file = sys.stdout,
                            index = True,
                            end = '\n' ) :
    """Print per-group summary statistics of `measure` as header-less TSV:
    gmean, std, min, max, count, then the 2.5/16/50/84/97.5 percentiles."""
    quantilePoints = ( 0.025, 0.16, 0.5, 0.84, 0.975 )
    statSpec = [ ("gmean", gmean), "std", "min", "max", "count" ]
    statSpec += [ (lambda x, q=q: x.quantile(q)) for q in quantilePoints ]
    summary = dfgrouped[measure].agg(statSpec)
    text = summary.to_csv(header=False, index=index, sep='\t')
    print(text, file=file, end=end)
283
284
# NumNodes windows within which each machine's results are used for summaries
# (defs rather than assigned lambdas, per PEP 8 E731; same callable interface)
def swiftSweetspot(x):
    """True when x is in swift's trusted size window (exclusive bounds)."""
    return 16 < x < 150
# narrower window previously used:
# swiftSweetspot = (lambda x: x > 4 and x < 32)

def javaSweetspot(x):
    """True when x is in java's trusted size window (inclusive bounds)."""
    return 24 <= x <= 256
288
def printManySummary(*,
                     dsnames = ['general'],
                     machines = allMachines,
                     metafileCore,
                     fxs,
                     sizeQual,
                     tgtInterleave = 0.0,
                     marginalizeOn = ['fx'] ) :
    """Print one summary row per (operation, implementation) pair.

    metafileCore: stem of the <stem>-meta.dat legend file beside this script.
    fxs: implementations to include; fx_num is the 1-based position in fxs.
    sizeQual: predicate on NumNodes selecting sizes to keep
              (e.g. swiftSweetspot / javaSweetspot).
    Output columns follow the '#' header line printed below.
    """

    metadata = getSummaryMeta(metafileCore)

    # relative-duration column stamped by annotateBaseline for this marginalization
    measure = c( 'OpDurRel', marginalizeOn )

    print("# op_num\tfx_num\tfx\tmean\tstdev\tmin\tmax\tcount\tpl95\tpl68\tp50\tph68\tph95")

    for op in metadata.itertuples():
        # reload and refilter per legend row (one operation at a time)
        timings = getSingleResults(dsnames, machines,
                                   fxs=fxs,
                                   tgtMovement = op.movement,
                                   tgtPolarity = op.polarity,
                                   tgtAccessor = op.accessor,
                                   tgtInterleave = tgtInterleave )
        annotateBaseline(timings, marginalizeOn)

        timings = timings[ timings['fx'].isin(fxs) ]
        timings = timings[ timings['NumNodes'].apply(sizeQual) ]

        # numeric plot keys: legend index for the op, 1-based list position for fx
        fxnums = timings['fx'].apply(
            lambda fx: fxs.index(fx) + 1
        )
        timings.insert(loc=0, column='fx_num', value=fxnums)
        timings.insert(loc=0, column='op_num', value=op.OpIx)

        grouped = timings.groupby(['op_num', 'fx_num', 'fx'])
        printSingleSummaryFrom( measure, grouped, end ='' )
324
325
def printSingleDetail(
    dsnames = ['general'],
    machines = allMachines,
    *,
    fxs = general_fxs_full,
    tgtMovement = 'all',
    tgtPolarity = 'all',
    tgtAccessor = 'all',
    tgtInterleave = 0.0,
    measureBase = 'mean_op_dur_ns',
    marginalizeOn = explanations ):
    """Print, per implementation, a per-NumNodes aggregate table of the chosen
    measure (tab-separated, gnuplot-dataset style with quoted fx headers).

    measureBase selects raw nanoseconds ('mean_op_dur_ns') or
    baseline-relative values ('OpDurRel', annotated on demand).
    Raises RuntimeError for any other measureBase.
    """

    timings = getSingleResults(dsnames, machines,
                               fxs = fxs,
                               tgtMovement = tgtMovement,
                               tgtPolarity = tgtPolarity,
                               tgtAccessor = tgtAccessor,
                               tgtInterleave = tgtInterleave)

    if measureBase == 'OpDurRel':
        # relative measures need their baseline columns stamped first
        annotateBaseline(timings, marginalizeOn)
        measure = c( measureBase, marginalizeOn )
    elif measureBase == 'mean_op_dur_ns':
        measure = measureBase
    else:
        raise RuntimeError(f"measureBase '{measureBase}' not handled")

    groupedFx = timings.groupby('fx')
    for fx, fgroup in groupedFx:
        # print(fgroup.head())
        groupedRun = fgroup.groupby(['NumNodes']) # , 'fx', 'op'
        aggregated = groupedRun[measure].agg(
            ["mean", "std", "min", "max", "count", "sum"]
        )
        # arithmetic mean with one min and one max excluded
        aggregated['mean_no_outlr'] = (
            ( aggregated['sum'] - aggregated['min'] - aggregated['max'] )
            /
            ( aggregated['count'] - 2 )
        )

        #print(aggregated.head())

        # one dataset block per fx: quoted header, rows, two blank separators
        print('"{header}"'.format(header=fx))
        text = aggregated.to_csv(header=False, index=True, sep='\t')
        print(text)
        print()
        print()
374
def aMeanNoOutlr(values):
    """Arithmetic mean of a Series with one min and one max value excluded.

    Requires at least 3 values (divides by count - 2).
    (Parameter renamed from `range`, which shadowed the builtin.)
    """
    return ( values.sum() - values.min() - values.max() ) / ( values.count() - 2 )
377
def gMeanNoOutlr(values):
    """Geometric mean of a Series with one min and one max value excluded.

    Requires at least 3 positive values (divides by count - 2).
    NOTE: uses a plain product, which can overflow/lose precision for long
    series of large values — acceptable for the short groups used here.
    (Parameter renamed from `range`, which shadowed the builtin.)
    """
    return ( values.prod() / values.min() / values.max() ) ** ( 1 / ( values.count() - 2 ) )
380
381
def trimPer( df, criteria ):
    """Keep only rows whose value in each criteria column is among that
    column's allowed values; criteria maps column name -> allowed values."""
    for column, allowed in criteria.items():
        keep = df[column].isin(allowed)
        df = df.loc[keep]
    return df
387
# Logarithmic histogram buckets with ratio `bucketGrain`.
# Bucket key 0 is centred on 1.0 and spans grain**-0.5 .. grain**+0.5
# (0.9759 to 1.0247 for grain 1.05 -- a span 1.05x wide); logs of values
# in that bucket run from -0.5 to +0.5, so rounding a value's log
# (base grain) to the nearest integer yields its key.
# grain**key is a bucket's centre; grain**(key - 0.5) is its bottom edge,
# which is what gnuplot expects.

bucketMin = 0.25
bucketMax = 4.0
bucketGrain = 1.05
bktKeyLo = math.floor( math.log(bucketMin, bucketGrain) )
bktKeyHi = math.ceil( math.log(bucketMax, bucketGrain) )

def bktKeyOfVal( relDur ):
    """Signed bucket key of a relative duration."""
    return round( math.log(relDur, bucketGrain) )

def bktIxOfVal( relDur ):
    """Zero-based bucket index of a relative duration."""
    return bktKeyToIx( bktKeyOfVal( relDur ) )

def botValOfBucketK( key ):
    """Bottom edge of the bucket with the given signed key."""
    return bucketGrain ** ( key - 0.5 )

def topValOfBucketBotVal( botVal ):
    """Top edge of a bucket, given its bottom edge."""
    return bucketGrain * botVal

def bktKeyToIx( key ):
    """Signed key -> zero-based index."""
    return key - bktKeyLo

def bktIxToKey( ix ):
    """Zero-based index -> signed key."""
    return ix + bktKeyLo

def botOfBucketOfVal( relDur ):
    """Bottom edge of the bucket containing relDur."""
    return botValOfBucketK( bktKeyOfVal( relDur ) )

# bottom edges for every bucket key in [bktKeyLo, bktKeyHi)
buckets = [ botValOfBucketK( k ) for k in range(bktKeyLo, bktKeyHi) ]
426
def printHistos(*,
                tgtMovement = 'all',
                tgtPolarity = 'all',
                tgtAccessor = 'all',
                tgtInterleave = 0.0,
                earlyFilter = {}, # exclude from benchmarking
                lateFilter = {}, # exclude from output
                drillOn = ['fx'],
                marginalizeOn = None, # None means match drill-on
                sumFile = sys.stdout,
                detFile = sys.stdout ):
    """Print, per drillOn group, a summary line (to sumFile) and a bucketed
    histogram of baseline-relative op durations (to detFile), both as
    gnuplot-style datasets with quoted headers.
    """

    if marginalizeOn is None:  # identity test, not ==, per PEP 8
        marginalizeOn = drillOn

    # watch out for filtering too early here; need everything sticking around until baselines are applied
    # ie, maybe I should get rid of all the tgt parms at the pre-benchmark layers
    timings = getSingleResults(
        tgtMovement = tgtMovement,
        tgtPolarity = tgtPolarity,
        tgtAccessor = tgtAccessor,
        tgtInterleave = tgtInterleave)
    timings = getJustCanon( timings,
                            fxInc = ['cfa-cfa', 'lq-tailq', 'upp-upp', 'lq-list'],
                            szInc = ['SM', 'ML'],
                            sExcl = [1] )

    timings = trimPer( timings, earlyFilter )

    # collapse repeated runs of each configuration to one robust duration
    options = timings.groupby(explanations)
    aggregated = options.agg(
        mean_op_dur_ns = ('mean_op_dur_ns', gMeanNoOutlr)
    ).reset_index()
    annotateBaseline(aggregated, marginalizeOn)

    aggregated = trimPer( aggregated, lateFilter )

    # if examining "why CFA slow" need both
    # - getVariousCfa in place of getJustCanon
    # - do annotate-then-filter because baseline needs to stay cfa-tailq-upp
    # (filter-then-annotate is fine for general cases (where all three canons are included) and good for build time)

    c_measure = c('OpDurRel', marginalizeOn)

    # bucketize each relative duration to its bucket's bottom edge
    c_measureBkt = 'BUCKET_' + c_measure
    aggregated[ c_measureBkt ] = aggregated[c_measure].apply( botOfBucketOfVal )

    drillgrp = aggregated.groupby(drillOn)

    for dkey, dgroup in drillgrp:

        dkey_str = list( map( str, dkey ) )
        header = str.join(', ', dkey_str)

        # constant-key groupby: treat the whole drill group as one summary row
        dgroup_sole = dgroup.groupby((lambda _: 0))
        print(f'"{header}"', file=sumFile)
        printSingleSummaryFrom(
            c_measure, dgroup_sole, file=sumFile, index=False )
        print(file=sumFile)
        print(file=sumFile)

        # histogram over the fixed bucket set; absent buckets get a zero count
        histo_raw = dgroup[ c_measureBkt ].value_counts()
        for b in buckets:
            if b not in histo_raw.keys():
                histo_raw[b] = 0
        histo_raw = histo_raw.sort_index()

        # emit rows of (y_lo, y_hi, count) for gnuplot boxes
        histo = histo_raw.rename("count").reset_index()
        histo = histo.rename(columns={c_measureBkt: "y_lo"})
        y_lo_col_loc = histo.columns.get_loc("y_lo")
        histo.insert(y_lo_col_loc + 1, "y_hi", histo["y_lo"].apply(topValOfBucketBotVal))

        print(f'"{header}"', file=detFile)
        text = histo.to_csv(header=False, index=False, sep='\t')
        print(text, file=detFile)
        print(file=detFile)
        print(file=detFile)
Note: See TracBrowser for help on using the repository browser.