source: doc/theses/mike_brooks_MMath/plots/ListCommon.py@741d004

Last change on this file since 741d004 was 408f954, checked in by Michael Brooks <mlbrooks@…>, 9 days ago

fix list perf plotting issues with physical factors

import pandas as pd
import numpy as np
import math
import os
import sys
from subprocess import Popen, PIPE
from scipy.stats import gmean

def getDataset( infile ):
    # grep to remove lines that end in comma; these were error runs
    with Popen("grep '[^,]$' " + infile, shell=True, stdout=PIPE) as process:
        timings = pd.read_csv(
            process.stdout,
            names=['RunMoment', 'RunIdx', 'Args', 'Program', 'Width',
                   'expt_ops_completed', 'expt_elapsed_sec', 'mean_op_dur_ns'],
            dtype={'RunMoment': str,
                   'RunIdx': np.int64,
                   'Args': str,
                   'Program': str,
                   'Width': np.int64,
                   'expt_ops_completed': np.int64,
                   'expt_elapsed_sec': np.float64,
                   'mean_op_dur_ns': np.float64},
            parse_dates=['RunMoment']
        )
    # print(timings.head())

    ## parse executable name and args

    timings[['ExperimentDurSec',
             'CheckDonePeriod',
             'Length',
             'ExperimentDurOpCount',
             'Seed',
             'InterleaveFrac']] = timings['Args'].str.strip().str.split(expand=True)
    timings["Length"] = pd.to_numeric(timings["Length"])
    timings["InterleaveFrac"] = pd.to_numeric(timings["InterleaveFrac"]).round(3)

    timings["NumNodes"] = timings["Length"] * timings["Width"]

    timings[['__ProgramPrefix',
             'fx',
             'op']] = timings['Program'].str.split('--', expand=True)

    timings[['movement',
             'polarity',
             'accessor']] = timings['op'].str.split('-', expand=True)

    ## SizeZone as NumNodes t-shirt size
    timings['SizeZone'] = np.select(
        condlist = [
            ( 6 <= timings['NumNodes']) & (timings['NumNodes'] <= 20),
            (50 <= timings['NumNodes']) & (timings['NumNodes'] <= 200)
        ],
        choicelist = [
            'SM',
            'ML'
        ],
        default = 'none'
    )

    return timings

# `c` = column name
def c( baseName, marginalizeOn ):
    margSlug = str.join( "_", marginalizeOn )
    return baseName + "_" + margSlug
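
# Illustration (added, not from the original source): c('Baseline', ['fx'])
# yields 'Baseline_fx', and c('Baseline', ['SizeZone', 'NumNodes']) yields
# 'Baseline_SizeZone_NumNodes'. These are the derived-column names that
# annotateBaseline reads and writes below.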

explanations = ['movement', 'polarity', 'accessor',
                'NumNodes', 'Width', 'Length',
                'SizeZone', # note fd: NumNodes -> SizeZone
                'fx',
                'machine',
                'InterleaveFrac', # unused and always zero
               ]

# helper for avoiding pollution from, e.g., alternate cfa list versions
# when a preference-limiting factor is marginalized, build the bl value from the preferred subset
# but still stamp the result everywhere; e.g. even cfa-strip has canon-bl-relative perf
# when conditioning on such a factor, peer groups are already small enough to stop such pollution
# use a nontrivial marginalizeOn when calculating baseline values, to achieve the above outside-canonical behaviour non-degenerately
# use the default full marginalizeOn when removing points from a graph, which leaves only canonical points
def getJustCanon( timings,
                  marginalizeOn = explanations, *,
                  # no c++: bl is for comparing intrusives
                  # no lq-list: sparse
                  # no cfa-fredDisbled: bl is for comparing prod-readies
                  fxInc = ['cfa-cfa', 'lq-tailq', 'upp-upp'],
                  szInc = ['SM', 'ML'],
                  sExcl = [1]
                  ): # all explanations marginalized => maximally aggressive filter
    if 'fx' in marginalizeOn:
        fxIsCanon = timings.fx.isin(fxInc)
        timings = timings[ fxIsCanon ]
    if 'SizeZone' in marginalizeOn:
        szIsCanon = timings.SizeZone.isin(szInc)
        timings = timings[ szIsCanon ]
    if 'NumNodes' in marginalizeOn:
        sIsCanon = ~ timings.NumNodes.isin(sExcl)
        timings = timings[ sIsCanon ]
    return timings
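
# Illustration (added; `t` is a hypothetical timings frame): getJustCanon(t)
# applies every filter, leaving only canonical points for plotting, while
# getJustCanon(t, ['fx']) filters on fx alone, keeping all sizes so baselines
# can still be formed per size group.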

# calls = 0

def annotateBaseline( timings, marginalizeOn ):
# global calls
# print( "XXX", calls, marginalizeOn, file=sys.stderr )
    c_tgtPeers = c( 'Peers', marginalizeOn )
    c_tgtBl = c("Baseline", marginalizeOn)
    c_tgtRel = c("OpDurRel", marginalizeOn)
    if c_tgtBl in timings.columns or c_tgtRel in timings.columns:
        assert( c_tgtBl in timings.columns and c_tgtRel in timings.columns )
        return
    # size handling:
    # two ordinary baselines (sz-nn, nn) and one synthetic baseline (sz)
    # the SizeZone-only baseline has no interpretation wrt a real peer group
    # it isolates the effect of belonging to one SZ or the other
    # while conditioning away the specific-size effects within the SZ
    # notably in zone SM, opDur-v-size usually pitches upward
    # comparing to the sz-only baseline gets rid of "they all pitch up," while keeping "SM is faster than ML"
    if 'SizeZone' in marginalizeOn and 'NumNodes' not in marginalizeOn:
        assert( 'Length' not in marginalizeOn ) # need to treat them in lockstep because of fd
        # calls += 1
        # special case: sz-only synthetic benchmark
        margNeither = list( set(marginalizeOn) - {'SizeZone'} )
        margBoth = list( set(marginalizeOn) | {'NumNodes', 'Length'} )
        margJustNn = list( set(margNeither) | {'NumNodes', 'Length'} )
        annotateBaseline( timings, margNeither )
        annotateBaseline( timings, margBoth )
        annotateBaseline( timings, margJustNn )
        c_neitherRel = c("OpDurRel", margNeither)
        c_bothBl = c("Baseline", margBoth)
        c_justNnBl = c("Baseline", margJustNn)
        timings[ c_tgtBl ] = np.nan
        timings[ c_tgtRel ] = timings[ c_justNnBl ] / timings[ c_bothBl ] * timings[ c_neitherRel ]
    else: # general case
        # prevent non-canonical samples from polluting baseline values
        # note, depending on the presentation, the polluting points may already be removed from timings entirely
        canonSrc = getJustCanon(timings, marginalizeOn)
        # print(f"for marg on {marginalizeOn}, |canonSrc| = {len(canonSrc)}, |timings| = {len(timings)}", file=sys.stderr)
        conditionOn = list( set(explanations) - set(marginalizeOn) )
        # print( "marginalizing on", marginalizeOn, "conditioning on", conditionOn, file=sys.stderr )
        # calls += 1

        if conditionOn:
            stats = canonSrc.groupby(conditionOn)['mean_op_dur_ns'].agg(**{
                c_tgtPeers: 'count',
                c_tgtBl: gmean
            })
            group_lookup = timings.set_index(conditionOn).index
            timings[c_tgtPeers] = stats[c_tgtPeers].reindex(group_lookup).values
            timings[c_tgtBl] = stats[c_tgtBl].reindex(group_lookup).values
        else:
            stats = canonSrc.groupby((lambda _: 0))['mean_op_dur_ns'].agg(**{
                c_tgtPeers: 'count',
                c_tgtBl: gmean
            })
            # Extract the single row
            row = stats.iloc[0]
            # Broadcast to all rows
            timings[c_tgtPeers] = row[c_tgtPeers]
            timings[c_tgtBl] = row[c_tgtBl]


        # everywhere := itself / [preferred-subset derived]
        timings[c_tgtRel] = timings['mean_op_dur_ns'] / timings[c_tgtBl]

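# Usage sketch (added; `t` is a hypothetical frame from getDataset): after
# annotateBaseline(t, ['fx']), each row carries Peers_fx (the size of its
# canonical peer group), Baseline_fx (the gmean of mean_op_dur_ns over peers
# that differ only in fx), and OpDurRel_fx (this row's duration relative to
# that baseline).
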
# longer column name (Peers_%, Baseline_%, OpDurRel_%) gives larger peer group and more (total) variation
def annotateCommonBaselines( timings ):
    def applyGeneralExplanations( bgMarginalizeOn ):
        def fg( marginalizeOn ):
            return bgMarginalizeOn + marginalizeOn
        annotateBaseline( timings, fg( [] ) ) # all-in baseline (all factors conditioned): only inter-run differences
        annotateBaseline( timings, fg( ['movement', 'polarity'] ) )
        annotateBaseline( timings, fg( ['accessor'] ) )
        annotateBaseline( timings, fg( ['machine'] ) )

        annotateBaseline( timings, fg( ['SizeZone', 'NumNodes'] ) ) # SizeZone is NOT redundant; conditioned on neither
        annotateBaseline( timings, fg( ['NumNodes'] ) ) # still conditioned on SizeZone
        annotateBaseline( timings, fg( ['SizeZone'] ) ) # synthetic: conditioned on NumNodes but not SizeZone
    applyGeneralExplanations( [] )
    applyGeneralExplanations( ['fx'] )

def getMachineDataset( dsname, machine ):
    infileLocal = f"results-{machine}-{dsname}.csv"
    infile = os.path.dirname(os.path.abspath(__file__)) + '/../benchmarks/list/' + infileLocal
    timings = getDataset( infile )
    timings['machine'] = machine
    return timings

allMachines = ['swift', 'java']


# general, as in exclude the stripped-down experimental CFAs
general_fxs_full = ['cfa-cfa', 'cpp-stlref', 'upp-upp', 'lq-tailq', 'lq-list']
general_fxs_intrusive = ['cfa-cfa', 'upp-upp', 'lq-tailq', 'lq-list']

def getSingleResults(
        dsnames = ['general'],
        machines = allMachines,
        *,
        fxs = general_fxs_full,
        tgtMovement = 'all',
        tgtPolarity = 'all',
        tgtAccessor = 'all',
        tgtInterleave = 0.0 ):

    timings = pd.concat([
        getMachineDataset( d, m )
        for d in dsnames
        for m in machines ])

# print(timings, file=sys.stderr)

    movements = timings['movement'].unique()
    polarities = timings['polarity'].unique()
    accessors = timings['accessor'].unique()
    interleaves = timings['InterleaveFrac'].unique()

    if movements.size > 1:
        movements = np.append(movements, 'all')
    if polarities.size > 1:
        polarities = np.append(polarities, 'all')
    if accessors.size > 1:
        accessors = np.append(accessors, 'all')

# print(f"trying to filter {dsname} {machines} {len(timings)}", file=sys.stderr)
    grp = timings.groupby('fx')
# print(f"with fxs {grp.groups.keys()}", file=sys.stderr)
    timings = pd.concat([
        grp.get_group(fx)
        for fx in fxs ])

    if (tgtMovement != 'all'):
        grp = timings.groupby('movement')
        timings = grp.get_group(tgtMovement)
    if (tgtPolarity != 'all'):
        grp = timings.groupby('polarity')
        timings = grp.get_group(tgtPolarity)
    if (tgtAccessor != 'all'):
        grp = timings.groupby('accessor')
        timings = grp.get_group(tgtAccessor)
    if (tgtInterleave != 'all'):
        timings = timings[ timings['InterleaveFrac'] == float(tgtInterleave) ]


    return timings

def stripMachine(pyCore):
    parts = str.split(pyCore, '-')
    exceptLast = parts[ 0 : -1 ]
    return str.join('-', exceptLast)
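
# For example (illustrative input): stripMachine('ops-swift') returns 'ops';
# the helper drops the final '-'-separated component, which by convention here
# names the machine.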

def getSummaryMeta(metaFileCore):
    metafile = os.path.dirname(os.path.abspath(__file__)) + "/" + metaFileCore + '-meta.dat'
    metadata = pd.read_csv(
        metafile,
        names=['OpIx', 'Op'],
        delimiter='\t'
    )
    metadata[['movement',
              'polarity',
              'accessor']] = metadata['Op'].str.split('\\\\n', expand=True)
    metadata.replace('*', 'all', inplace=True)
    metadata.replace('S', 'stack', inplace=True)
    metadata.replace('Q', 'queue', inplace=True)
    metadata.replace('iF', 'insfirst', inplace=True)
    metadata.replace('iL', 'inslast', inplace=True)
    metadata.replace('H', 'allhead', inplace=True)
    metadata.replace('Ie', 'inselem', inplace=True)
    metadata.replace('Re', 'remelem', inplace=True)
    return metadata

def printSingleSummaryFrom( measure, dfgrouped, *,
                            file = sys.stdout,
                            index = True,
                            end = '\n' ):
    aggregated = dfgrouped[measure].agg([
        ("gmean", gmean), "std", "min", "max", "count",
        lambda x: x.quantile(0.025),
        lambda x: x.quantile(0.16),
        lambda x: x.quantile(0.5),
        lambda x: x.quantile(0.84),
        lambda x: x.quantile(0.975)
    ])
    text = aggregated.to_csv(header=False, index=index, sep='\t')
    print(text, file=file, end=end)
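
# Note (added for clarity): the column order produced here appears to line up
# with the header that printManySummary emits below: gmean -> mean, std ->
# stdev, then min, max, count, and the five quantiles -> pl95, pl68, p50,
# ph68, ph95 (the 2.5th/16th/50th/84th/97.5th percentiles, i.e. ~95% and ~68%
# intervals around the median).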


swiftSweetspot = (lambda x: x > 16 and x < 150)
# swiftSweetspot = (lambda x: x > 4 and x < 32)
javaSweetspot = (lambda x: x >= 24 and x <= 256)

def printManySummary(*,
        dsnames = ['general'],
        machines = allMachines,
        metafileCore,
        fxs,
        sizeQual,
        tgtInterleave = 0.0,
        marginalizeOn = ['fx'] ):

    metadata = getSummaryMeta(metafileCore)

    measure = c( 'OpDurRel', marginalizeOn )

    print("# op_num\tfx_num\tfx\tmean\tstdev\tmin\tmax\tcount\tpl95\tpl68\tp50\tph68\tph95")

    for op in metadata.itertuples():
        timings = getSingleResults(dsnames, machines,
                                   fxs=fxs,
                                   tgtMovement = op.movement,
                                   tgtPolarity = op.polarity,
                                   tgtAccessor = op.accessor,
                                   tgtInterleave = tgtInterleave )
        annotateBaseline(timings, marginalizeOn)

        timings = timings[ timings['fx'].isin(fxs) ]
        timings = timings[ timings['NumNodes'].apply(sizeQual) ]

        fxnums = timings['fx'].apply(
            lambda fx: fxs.index(fx) + 1
        )
        timings.insert(loc=0, column='fx_num', value=fxnums)
        timings.insert(loc=0, column='op_num', value=op.OpIx)

        grouped = timings.groupby(['op_num', 'fx_num', 'fx'])
        printSingleSummaryFrom( measure, grouped, end ='' )


def printSingleDetail(
        dsnames = ['general'],
        machines = allMachines,
        *,
        fxs = general_fxs_full,
        tgtMovement = 'all',
        tgtPolarity = 'all',
        tgtAccessor = 'all',
        tgtInterleave = 0.0,
        measureBase = 'mean_op_dur_ns',
        marginalizeOn = explanations ):


    timings = getSingleResults(dsnames, machines,
                               fxs = fxs,
                               tgtMovement = tgtMovement,
                               tgtPolarity = tgtPolarity,
                               tgtAccessor = tgtAccessor,
                               tgtInterleave = tgtInterleave)

    if measureBase == 'OpDurRel':
        annotateBaseline(timings, marginalizeOn)
        measure = c( measureBase, marginalizeOn )
    elif measureBase == 'mean_op_dur_ns':
        measure = measureBase
    else:
        raise RuntimeError(f"measureBase '{measureBase}' not handled")

    groupedFx = timings.groupby('fx')
    for fx, fgroup in groupedFx:
        # print(fgroup.head())
        groupedRun = fgroup.groupby(['NumNodes']) # , 'fx', 'op'
        aggregated = groupedRun[measure].agg(
            ["mean", "std", "min", "max", "count", "sum"]
        )
        aggregated['mean_no_outlr'] = (
            ( aggregated['sum'] - aggregated['min'] - aggregated['max'] )
            /
            ( aggregated['count'] - 2 )
        )

        #print(aggregated.head())

        print('"{header}"'.format(header=fx))
        text = aggregated.to_csv(header=False, index=True, sep='\t')
        print(text)
        print()
        print()

def aMeanNoOutlr(vals):
    return ( vals.sum() - vals.min() - vals.max() ) / ( vals.count() - 2 )

def gMeanNoOutlr(vals):
    return ( vals.prod() / vals.min() / vals.max() ) ** ( 1 / ( vals.count() - 2 ) )
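
# Both helpers drop exactly one min and one max sample before averaging.
# Illustrative check: for the series [1, 2, 4, 8], aMeanNoOutlr gives
# (15 - 1 - 8) / 2 = 3.0 and gMeanNoOutlr gives (64 / 1 / 8) ** (1/2) ~= 2.83,
# the arithmetic and geometric means of the surviving {2, 4}.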


def trimPer( df, criteria ):
    for field, values in criteria.items():
        areMatches = df[ field ].isin(values)
        df = df[ areMatches ]
    return df
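
# Illustrative call (hypothetical criteria): trimPer(df, {'fx': ['cfa-cfa'],
# 'machine': ['swift']}) keeps only rows whose fx and machine both match.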

# The range from 0.9759 to 1.0247 (which is 1.05x wide) has 1.0 at its centre.
# This is the bucket with key 0.
# Logs (base 1.05) of values in this bucket go from -0.5 to +0.5.
# Rounding a log value to the nearest integer gives the key.
# Exponentiating a key directly gives the centre of its bucket.
# Exponentiating a key less 0.5 gives the bottom of its bucket.
# Gnuplot expects the latter.

bucketMin = 0.25
bucketMax = 4.0
bucketGrain = 1.05
bktKeyLo = math.floor( math.log(bucketMin, bucketGrain) )
bktKeyHi = math.ceil( math.log(bucketMax, bucketGrain) )

def bktKeyOfVal( relDur ):
    distance = math.log(relDur, bucketGrain)
    key = round( distance )
    return key

def bktIxOfVal( relDur ):
    return bktKeyToIx( bktKeyOfVal( relDur ) )

def botValOfBucketK( key ):
    return bucketGrain ** ( key - 0.5 )

def topValOfBucketBotVal( botVal ):
    return bucketGrain * botVal

def bktKeyToIx( key ):
    return key - bktKeyLo

def bktIxToKey( ix ):
    return ix + bktKeyLo

def botOfBucketOfVal( relDur ):
    return botValOfBucketK( bktKeyOfVal( relDur ) )

buckets = [ botValOfBucketK(key) for key in range(bktKeyLo, bktKeyHi) ]
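
# Worked example (added illustration): relDur = 1.10 has log_1.05(1.10) ~= 1.95,
# so bktKeyOfVal rounds it to key 2; botValOfBucketK(2) = 1.05 ** 1.5 ~= 1.076
# and topValOfBucketBotVal of that ~= 1.130, so the bucket [1.076, 1.130)
# indeed contains 1.10.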

# printSingleDetail
def printHistos(*,
        tgtMovement = 'all',
        tgtPolarity = 'all',
        tgtAccessor = 'all',
        tgtInterleave = 0.0,
        earlyFilter = {}, # exclude from benchmarking
        lateFilter = {}, # exclude from output
        drillOn = ['fx'],
        marginalizeOn = None, # None means match drill-on
        sumFile = sys.stdout,
        detFile = sys.stdout ):

    if marginalizeOn is None:
        marginalizeOn = drillOn

    # watch out for filtering too early here; need everything sticking around until baselines are applied
    # i.e., maybe I should get rid of all the tgt parms at the pre-benchmark layers
    timings = getSingleResults(
        tgtMovement = tgtMovement,
        tgtPolarity = tgtPolarity,
        tgtAccessor = tgtAccessor,
        tgtInterleave = tgtInterleave)
    timings = getJustCanon( timings,
                            fxInc = ['cfa-cfa', 'lq-tailq', 'upp-upp', 'lq-list'],
                            szInc = ['SM', 'ML'],
                            sExcl = [1] )

    timings = trimPer( timings, earlyFilter )

    options = timings.groupby(explanations)
    aggregated = options.agg(
        mean_op_dur_ns = ('mean_op_dur_ns', gMeanNoOutlr)
    ).reset_index()
    annotateBaseline(aggregated, marginalizeOn)

    aggregated = trimPer( aggregated, lateFilter )

    # if examining "why CFA slow" need both
    # - getVariousCfa in place of getJustCanon
    # - do annotate-then-filter because the baseline needs to stay cfa-tailq-upp
    #   (filter-then-annotate is fine for general cases (where all three canons are included) and good for build time)


    c_measure = c('OpDurRel', marginalizeOn)
    # options = timings.groupby(explanations)

    # aggregated = options.agg(
    #     **{measure:(measure,gMeanNoOutlr)}
    # ).reset_index()

    c_measureBkt = 'BUCKET_' + c_measure
    aggregated[ c_measureBkt ] = aggregated[c_measure].apply( botOfBucketOfVal )

    drillgrp = aggregated.groupby(drillOn)

    # print(f'measure is {measure}')
    # print()
    # print()

    for dkey, dgroup in drillgrp:
# print(mgroup, file=sys.stderr)

        dkey_str = list( map( str, dkey ) )
        header = str.join(', ', dkey_str)

        dgroup_sole = dgroup.groupby((lambda _: 0))
        print(f'"{header}"', file=sumFile)
        printSingleSummaryFrom(
            c_measure, dgroup_sole, file=sumFile, index=False )
        print(file=sumFile)
        print(file=sumFile)

        histo_raw = dgroup[ c_measureBkt ].value_counts()
        for b in buckets:
            if b not in histo_raw.keys():
# print( f"{b} := 0", file=sys.stderr )
                histo_raw[b] = 0
        histo_raw = histo_raw.sort_index()

        histo = histo_raw.rename("count").reset_index()
        histo = histo.rename(columns={c_measureBkt: "y_lo"})
        y_lo_col_loc = histo.columns.get_loc("y_lo")
        histo.insert(y_lo_col_loc + 1, "y_hi", histo["y_lo"].apply(topValOfBucketBotVal))

        print(f'"{header}"', file=detFile)
        text = histo.to_csv(header=False, index=False, sep='\t')
        print(text, file=detFile)
        print(file=detFile)
        print(file=detFile)

        # print(f'"{header}" FULL')
        # text = group.to_csv(header=False, index=True, sep='\t')
        # print(text)
        # print()
        # print()

    # print(f'"RAW"')
    # text = timings.to_csv(header=False, index=True, sep='\t')
    # print(text)
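

# Hedged smoke test (an illustrative addition, not part of the original
# plotting pipeline): it exercises only the pure helpers above, so it runs
# without any benchmark CSV files present.
if __name__ == '__main__':
    assert c('Baseline', ['fx', 'machine']) == 'Baseline_fx_machine'
    assert bktKeyOfVal(1.0) == 0              # 1.0 sits in the key-0 bucket
    assert bktKeyToIx(bktIxToKey(7)) == 7     # key/index conversions invert
    lo = botValOfBucketK(0)
    hi = topValOfBucketBotVal(lo)
    assert lo < 1.0 < hi                      # bucket 0 straddles 1.0
    s = pd.Series([1.0, 2.0, 4.0, 8.0])
    assert aMeanNoOutlr(s) == 3.0             # (2 + 4) / 2 after dropping 1 and 8
    assert abs(gMeanNoOutlr(s) - math.sqrt(8.0)) < 1e-12  # sqrt(2 * 4)
    print('ListCommon self-checks passed')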