source: doc/theses/mike_brooks_MMath/plots/ListCommon.py@ bf73608

Last change on this file since bf73608 was bf73608, checked in by Michael Brooks <mlbrooks@…>, 4 days ago

revisions to ll perf intro and graph formatting

  • Property mode set to 100644
File size: 18.6 KB
Line 
import io
import math
import os
import sys
from subprocess import Popen, PIPE

import numpy as np
import pandas as pd
from scipy.stats import gmean
8
def getDataset( infile ):
    """Load one benchmark-results CSV into a tidy DataFrame.

    Drops error runs (lines ending in a comma, plus blank lines), parses the
    benchmark invocation (Args) and executable name (Program) into factor
    columns, and adds the derived columns NumNodes and SizeZone.
    """
    # Drop error runs: same filter as `grep '[^,]$'` (lines whose last
    # character is a comma, and empty lines), but done in-process so no
    # shell is involved and infile needs no quoting.
    with open(infile) as f:
        goodLines = [ line for line in f
                      if line.rstrip('\n') and not line.rstrip('\n').endswith(',') ]
    timings = pd.read_csv(
        io.StringIO(''.join(goodLines)),
        names=['RunMoment', 'RunIdx', 'Args', 'Program', 'Width',
               'expt_ops_completed', 'expt_elapsed_sec', 'mean_op_dur_ns'],
        dtype={'RunMoment': str,
               'RunIdx': np.int64,
               'Args': str,
               'Program': str,
               'Width': np.int64,
               'expt_ops_completed': np.int64,
               'expt_elapsed_sec': np.float64,
               'mean_op_dur_ns': np.float64},
        parse_dates=['RunMoment']
    )
    # print(timings.head())

    ## parse executable name and args

    # Args is the space-separated benchmark command line
    timings[['ExperimentDurSec',
             'CheckDonePeriod',
             'Length',
             'ExperimentDurOpCount',
             'Seed',
             'InterleaveFrac']] = timings['Args'].str.strip().str.split(expand=True)
    timings["Length"] = pd.to_numeric(timings["Length"])
    timings["InterleaveFrac"] = pd.to_numeric(timings["InterleaveFrac"]).round(3)

    # total elements managed: Length elements in each of Width lists
    timings["NumNodes"] = timings["Length"] * timings["Width"]

    # Program looks like <prefix>--<fx>--<movement>-<polarity>-<accessor>
    timings[['__ProgramPrefix',
             'fx',
             'op']] = timings['Program'].str.split('--', expand=True)

    timings[['movement',
             'polarity',
             'accessor']] = timings['op'].str.split('-', expand=True)

    ## SizeZone as NumNodes t-shirt size; sizes outside both bands get 'none'
    timings['SizeZone'] = np.select(
        condlist = [
            ( 6 <= timings['NumNodes']) & (timings['NumNodes'] <= 20),
            (50 <= timings['NumNodes']) & (timings['NumNodes'] <= 200)
        ],
        choicelist = [
            'SM',
            'ML'
        ],
        default = 'none'
    )

    return timings
63
# `c` = column name
def c( baseName, marginalizeOn ):
    """Column name for statistic `baseName` marginalized over the given factors."""
    return f"{baseName}_{'_'.join(marginalizeOn)}"
68
# Factor columns that explain timing variation.  Baselines marginalize over a
# subset of these and condition on the rest (see annotateBaseline).
explanations = ['movement', 'polarity', 'accessor',
                'NumNodes', 'Width', 'Length',
                'SizeZone', # note fd: NumNodes -> SizeZone
                'fx',
                'machine',
                'InterleaveFrac', # unused and always zero
                ]
76
# helper for avoiding pollution from e.g. alternate cfa list versions
# when a preference-limiting factor is marginalized, make bl value from preferred subset
# but still stamp result everywhere; e.g. even cfa-strip has canon-bl-relative perf
# when conditioning on such factor, peer groups are already small enough to stop such pollution
# use nontrivial marginalizeOn when calculating baseline values, to achieve the above outside-canonical behaviour non-degenerately
# use default full marginalizeOn when removing points from a graph, which leaves only canonical points
def getJustCanon( timings,
                  marginalizeOn = None, *,
                  # no c++: bl is for comparing intrusives
                  # no lq-list: sparse
                  # no cfa-fredDisbled: bl is for comparing prod-readies
                  fxInc = ('cfa-cfa', 'lq-tailq', 'upp-upp'),
                  szInc = ('SM', 'ML'),
                  sExcl = (1,)
                  ):
    """Filter `timings` down to its canonical subset.

    A filter applies only when its factor appears in marginalizeOn:
    fx must be in fxInc, SizeZone in szInc, NumNodes not in sExcl.
    marginalizeOn defaults to `explanations` (all factors marginalized
    => maximally aggressive filter).  Defaults are immutable tuples
    resolved at call time, so callers cannot accidentally share state.
    """
    if marginalizeOn is None:
        marginalizeOn = explanations
    if 'fx' in marginalizeOn:
        fxIsCanon = timings['fx'].isin(fxInc)
        timings = timings[ fxIsCanon ]
    if 'SizeZone' in marginalizeOn:
        szIsCanon = timings['SizeZone'].isin(szInc)
        timings = timings[ szIsCanon ]
    if 'NumNodes' in marginalizeOn:
        sIsCanon = ~ timings['NumNodes'].isin(sExcl)
        timings = timings[ sIsCanon ]
    return timings
102
103
def annotateBaseline( timings, marginalizeOn ):
    """Stamp baseline columns onto `timings` in place, for one marginalization.

    Adds (with <slug> = marginalizeOn factor names joined by '_'):
      Peers_<slug>    -- size of the canonical peer group each row is compared to
      Baseline_<slug> -- geometric mean of mean_op_dur_ns over that peer group
      OpDurRel_<slug> -- the row's mean_op_dur_ns relative to its baseline
    Peer groups condition on every factor in `explanations` that is not in
    marginalizeOn.  Idempotent: returns immediately if the columns already
    exist.  In the synthetic SizeZone-only case the Baseline column is NaN
    and no Peers column is added.
    """
    c_tgtPeers = c( 'Peers', marginalizeOn )
    c_tgtBl = c("Baseline", marginalizeOn)
    c_tgtRel = c("OpDurRel", marginalizeOn)
    # already annotated (possibly by a recursive call below) => nothing to do
    if c_tgtBl in timings.columns or c_tgtRel in timings.columns:
        assert( c_tgtBl in timings.columns and c_tgtRel in timings.columns )
        return
    # size handling:
    # two ordinary baselines (sz-nn, nn) and one synthetic baseline (sz)
    # the SizeZone-only baseline has no interpretation wrt a real peer group
    # it isolates the effect of belonging to one SZ or the other
    # while conditioning away the specific-size effects within the SZ
    # notably in zone SM, opDur-v-size usually pitches upward
    # comparing to sz-only baseline gets rid of "they all pitch up," while keeping "SM is faster than ML"
    if 'SizeZone' in marginalizeOn and 'NumNodes' not in marginalizeOn:
        # special case: sz-only synthetic benchmark
        # compose the synthetic relative value from three real annotations
        margNeither = list( set(marginalizeOn) - {'SizeZone'} )
        margBoth = list( set(marginalizeOn) | {'NumNodes'} )
        margJustNn = list( set(margNeither) | {'NumNodes'} )
        annotateBaseline( timings, margNeither )
        annotateBaseline( timings, margBoth )
        annotateBaseline( timings, margJustNn )
        c_neitherRel = c("OpDurRel", margNeither)
        c_bothBl = c("Baseline", margBoth)
        c_justNnBl = c("Baseline", margJustNn)
        # no real peer group exists for this case, so no meaningful baseline value
        timings[ c_tgtBl ] = np.nan
        timings[ c_tgtRel ] = timings[ c_justNnBl ] / timings[ c_bothBl ] * timings[ c_neitherRel ]
    else: # general case
        # prevent non-canonical samples from polluting baseline values
        # note, depending on the presentation, the polluting points may already be removed from timings entirely
        canonSrc = getJustCanon(timings, marginalizeOn)
        # print(f"for marg on {marginalizeOn}, |canonSrc| = {len(canonSrc)}, |timings| = {len(timings)}", file=sys.stderr)
        conditionOn = list( set(explanations) - set(marginalizeOn) )
        # print( "marginalizing on", marginalizeOn, "conditioning on", conditionOn, file=sys.stderr )

        if conditionOn:
            # per-group count and geometric mean, computed from canonical rows only
            stats = canonSrc.groupby(conditionOn)['mean_op_dur_ns'].agg(**{
                c_tgtPeers: 'count',
                c_tgtBl: gmean
            })
            # align group stats back onto every row of timings (canonical or not)
            group_lookup = timings.set_index(conditionOn).index
            timings[c_tgtPeers] = stats[c_tgtPeers].reindex(group_lookup).values
            timings[c_tgtBl] = stats[c_tgtBl].reindex(group_lookup).values
        else:
            # nothing conditioned on: one global peer group (constant-key groupby)
            stats = canonSrc.groupby((lambda _: 0))['mean_op_dur_ns'].agg(**{
                c_tgtPeers: 'count',
                c_tgtBl: gmean
            })
            # Extract the single row
            row = stats.iloc[0]
            # Broadcast to all rows
            timings[c_tgtPeers] = row[c_tgtPeers]
            timings[c_tgtBl] = row[c_tgtBl]


        # everywhere := itself / [preferred-subset derived]
        timings[c_tgtRel] = timings['mean_op_dur_ns'] / timings[c_tgtBl]
161
162
# longer column name (Peers_%, Baseline_%, OpDurRel_%) gives larger peer group and more (total) variation
def annotateCommonBaselines( timings ):
    """Annotate `timings` with every baseline/relative column the plots use."""
    generalMargs = [
        [],                          # all-in baseline (all factors conditioned): only inter-run differences
        ['movement', 'polarity'],
        ['accessor'],
        ['machine'],
        ['SizeZone', 'NumNodes'],    # SizeZone is NOT redundant; conditioned on neither
        ['NumNodes'],                # still conditioned on SizeZone
        ['SizeZone'],                # synthetic: conditioned on NumNodes but not SizeZone
    ]
    # once with no background factors, once also marginalizing fx
    for background in ( [], ['fx'] ):
        for marg in generalMargs:
            annotateBaseline( timings, background + marg )
178
def getMachineDataset( dsname, machine ):
    """Load one machine's results CSV from ../benchmarks/list, tagging rows with the machine name."""
    scriptDir = os.path.dirname(os.path.abspath(__file__))
    infile = scriptDir + '/../benchmarks/list/' + f"results-{machine}-{dsname}.csv"
    timings = getDataset( infile )
    timings['machine'] = machine
    return timings
185
# machines whose result CSVs are combined by default
allMachines = ['swift', 'java']
187
188
# general, as in exclude the stripped-down experimental CFAs
general_fxs_full = ['cfa-cfa', 'cpp-stlref', 'upp-upp', 'lq-tailq', 'lq-list']
# as above, minus cpp-stlref (the non-intrusive implementation)
general_fxs_intrusive = ['cfa-cfa', 'upp-upp', 'lq-tailq', 'lq-list']
192
def getSingleResults(
    dsnames = ['general'],
    machines = allMachines,
    *,
    fxs = general_fxs_full,
    tgtMovement = 'all',
    tgtPolarity = 'all',
    tgtAccessor = 'all',
    tgtInterleave = 0.0 ):
    """Load and combine the requested datasets, then filter to the requested
    implementations and operation.

    A tgt* value of 'all' leaves that factor unfiltered; tgtInterleave is
    otherwise matched numerically against InterleaveFrac.
    Returns the filtered DataFrame.
    """

    # pool every requested dataset/machine combination
    timings = pd.concat([
        getMachineDataset( d, m )
        for d in dsnames
        for m in machines ])

    # keep only requested implementations, in fxs order;
    # get_group raises KeyError if an fx has no rows
    grp = timings.groupby('fx')
    timings = pd.concat([
        grp.get_group(fx)
        for fx in fxs ])

    # (the former movements/polarities/accessors/interleaves locals were
    # computed here but never used; removed as dead code)

    if (tgtMovement != 'all'):
        grp = timings.groupby('movement')
        timings = grp.get_group(tgtMovement)
    if (tgtPolarity != 'all'):
        grp = timings.groupby('polarity')
        timings = grp.get_group(tgtPolarity)
    if (tgtAccessor != 'all'):
        grp = timings.groupby('accessor')
        timings = grp.get_group(tgtAccessor)
    if (tgtInterleave != 'all'):
        timings = timings[ timings['InterleaveFrac'] == float(tgtInterleave) ]

    return timings
243
def stripMachine(pyCore):
    """Drop the trailing '-<machine>' component from a dash-joined core name."""
    return '-'.join( pyCore.split('-')[:-1] )
248
def getSummaryMeta(metaFileCore):
    """Load the operation legend (<core>-meta.dat, tab-separated, beside this
    script) and expand its abbreviated labels into the long factor names."""
    metafile = os.path.dirname(os.path.abspath(__file__)) + "/" + metaFileCore + '-meta.dat'
    metadata = pd.read_csv(
        metafile,
        names=['OpIx', 'Op'],
        delimiter='\t'
    )
    # Op is three abbreviated labels separated by a literal backslash-n pair
    metadata[['movement',
              'polarity',
              'accessor']] = metadata['Op'].str.split('\\\\n', expand=True)
    # expand plot-label abbreviations to the names used in the timing data
    longNames = {
        '*': 'all',
        'S': 'stack',
        'Q': 'queue',
        'iF': 'insfirst',
        'iL': 'inslast',
        'H': 'allhead',
        'Ie': 'inselem',
        'Re': 'remelem',
    }
    for short, long in longNames.items():
        metadata.replace(short, long, inplace=True)
    return metadata
268
def printSingleSummaryFrom( measure, dfgrouped, *,
                            file = sys.stdout,
                            index = True,
                            end = '\n' ) :
    """Print per-group summary statistics of `measure` as header-less TSV:
    gmean, std, min, max, count, then the 2.5/16/50/84/97.5 percentiles."""
    quantilePoints = ( 0.025, 0.16, 0.5, 0.84, 0.975 )
    statSpec = [ ("gmean", gmean), "std", "min", "max", "count" ]
    statSpec += [ (lambda x, q=q: x.quantile(q)) for q in quantilePoints ]
    summary = dfgrouped[measure].agg(statSpec)
    text = summary.to_csv(header=False, index=index, sep='\t')
    print(text, file=file, end=end)
283
284
# NumNodes windows within which each machine's results are used for summaries
# (defs rather than assigned lambdas, per PEP 8 E731; same callable interface)
def swiftSweetspot(x):
    """True when x is in swift's trusted size window (exclusive bounds)."""
    return 16 < x < 150
# narrower window previously used:
# swiftSweetspot = (lambda x: x > 4 and x < 32)

def javaSweetspot(x):
    """True when x is in java's trusted size window (inclusive bounds)."""
    return 24 <= x <= 256
288
def printManySummary(*,
                     dsnames = ['general'],
                     machines = allMachines,
                     metafileCore,
                     fxs,
                     sizeQual,
                     tgtInterleave = 0.0,
                     marginalizeOn = ['fx'] ) :
    """Print one summary row per (operation, implementation) pair.

    metafileCore: stem of the <stem>-meta.dat legend file beside this script.
    fxs: implementations to include; fx_num is the 1-based position in fxs.
    sizeQual: predicate on NumNodes selecting sizes to keep
              (e.g. swiftSweetspot / javaSweetspot).
    Output columns follow the '#' header line printed below.
    """

    metadata = getSummaryMeta(metafileCore)

    # relative-duration column stamped by annotateBaseline for this marginalization
    measure = c( 'OpDurRel', marginalizeOn )

    print("# op_num\tfx_num\tfx\tmean\tstdev\tmin\tmax\tcount\tpl95\tpl68\tp50\tph68\tph95")

    for op in metadata.itertuples():
        # reload and refilter per legend row (one operation at a time)
        timings = getSingleResults(dsnames, machines,
                                   fxs=fxs,
                                   tgtMovement = op.movement,
                                   tgtPolarity = op.polarity,
                                   tgtAccessor = op.accessor,
                                   tgtInterleave = tgtInterleave )
        annotateBaseline(timings, marginalizeOn)

        timings = timings[ timings['fx'].isin(fxs) ]
        timings = timings[ timings['NumNodes'].apply(sizeQual) ]

        # numeric plot keys: legend index for the op, 1-based list position for fx
        fxnums = timings['fx'].apply(
            lambda fx: fxs.index(fx) + 1
        )
        timings.insert(loc=0, column='fx_num', value=fxnums)
        timings.insert(loc=0, column='op_num', value=op.OpIx)

        grouped = timings.groupby(['op_num', 'fx_num', 'fx'])
        printSingleSummaryFrom( measure, grouped, end ='' )
324
325
def printSingleDetail(
    dsnames = ['general'],
    machines = allMachines,
    *,
    fxs = general_fxs_full,
    tgtMovement = 'all',
    tgtPolarity = 'all',
    tgtAccessor = 'all',
    tgtInterleave = 0.0,
    measureBase = 'mean_op_dur_ns',
    marginalizeOn = explanations ):
    """Print, per implementation, a per-NumNodes aggregate table of the chosen
    measure (tab-separated, gnuplot-dataset style with quoted fx headers).

    measureBase selects raw nanoseconds ('mean_op_dur_ns') or
    baseline-relative values ('OpDurRel', annotated on demand).
    Raises RuntimeError for any other measureBase.
    """

    timings = getSingleResults(dsnames, machines,
                               fxs = fxs,
                               tgtMovement = tgtMovement,
                               tgtPolarity = tgtPolarity,
                               tgtAccessor = tgtAccessor,
                               tgtInterleave = tgtInterleave)

    if measureBase == 'OpDurRel':
        # relative measures need their baseline columns stamped first
        annotateBaseline(timings, marginalizeOn)
        measure = c( measureBase, marginalizeOn )
    elif measureBase == 'mean_op_dur_ns':
        measure = measureBase
    else:
        raise RuntimeError(f"measureBase '{measureBase}' not handled")

    groupedFx = timings.groupby('fx')
    for fx, fgroup in groupedFx:
        # print(fgroup.head())
        groupedRun = fgroup.groupby(['NumNodes']) # , 'fx', 'op'
        aggregated = groupedRun[measure].agg(
            ["mean", "std", "min", "max", "count", "sum"]
        )
        # arithmetic mean with one min and one max excluded
        aggregated['mean_no_outlr'] = (
            ( aggregated['sum'] - aggregated['min'] - aggregated['max'] )
            /
            ( aggregated['count'] - 2 )
        )

        #print(aggregated.head())

        # one dataset block per fx: quoted header, rows, two blank separators
        print('"{header}"'.format(header=fx))
        text = aggregated.to_csv(header=False, index=True, sep='\t')
        print(text)
        print()
        print()
374
def aMeanNoOutlr(values):
    """Arithmetic mean of a Series with one min and one max value excluded.

    Requires at least 3 values (divides by count - 2).
    (Parameter renamed from `range`, which shadowed the builtin.)
    """
    return ( values.sum() - values.min() - values.max() ) / ( values.count() - 2 )
377
def gMeanNoOutlr(values):
    """Geometric mean of a Series with one min and one max value excluded.

    Requires at least 3 positive values (divides by count - 2).
    NOTE: uses a plain product, which can overflow/lose precision for long
    series of large values — acceptable for the short groups used here.
    (Parameter renamed from `range`, which shadowed the builtin.)
    """
    return ( values.prod() / values.min() / values.max() ) ** ( 1 / ( values.count() - 2 ) )
380
381
def trimPer( df, criteria ):
    """Keep only rows whose value in each criteria column is among that
    column's allowed values; criteria maps column name -> allowed values."""
    for column, allowed in criteria.items():
        keep = df[column].isin(allowed)
        df = df.loc[keep]
    return df
387
# Logarithmic histogram buckets with ratio `bucketGrain`.
# Bucket key 0 is centred on 1.0 and spans grain**-0.5 .. grain**+0.5
# (0.9759 to 1.0247 for grain 1.05 -- a span 1.05x wide); logs of values
# in that bucket run from -0.5 to +0.5, so rounding a value's log
# (base grain) to the nearest integer yields its key.
# grain**key is a bucket's centre; grain**(key - 0.5) is its bottom edge,
# which is what gnuplot expects.

bucketMin = 0.25
bucketMax = 4.0
bucketGrain = 1.05
bktKeyLo = math.floor( math.log(bucketMin, bucketGrain) )
bktKeyHi = math.ceil( math.log(bucketMax, bucketGrain) )

def bktKeyOfVal( relDur ):
    """Signed bucket key of a relative duration."""
    return round( math.log(relDur, bucketGrain) )

def bktIxOfVal( relDur ):
    """Zero-based bucket index of a relative duration."""
    return bktKeyToIx( bktKeyOfVal( relDur ) )

def botValOfBucketK( key ):
    """Bottom edge of the bucket with the given signed key."""
    return bucketGrain ** ( key - 0.5 )

def topValOfBucketBotVal( botVal ):
    """Top edge of a bucket, given its bottom edge."""
    return bucketGrain * botVal

def bktKeyToIx( key ):
    """Signed key -> zero-based index."""
    return key - bktKeyLo

def bktIxToKey( ix ):
    """Zero-based index -> signed key."""
    return ix + bktKeyLo

def botOfBucketOfVal( relDur ):
    """Bottom edge of the bucket containing relDur."""
    return botValOfBucketK( bktKeyOfVal( relDur ) )

# bottom edges for every bucket key in [bktKeyLo, bktKeyHi)
buckets = [ botValOfBucketK( k ) for k in range(bktKeyLo, bktKeyHi) ]
426
def printHistos(*,
                tgtMovement = 'all',
                tgtPolarity = 'all',
                tgtAccessor = 'all',
                tgtInterleave = 0.0,
                earlyFilter = {}, # exclude from benchmarking
                lateFilter = {}, # exclude from output
                drillOn = ['fx'],
                marginalizeOn = None, # None means match drill-on
                sumFile = sys.stdout,
                detFile = sys.stdout ):
    """Print, per drillOn group, a summary line (to sumFile) and a bucketed
    histogram of baseline-relative op durations (to detFile), both as
    gnuplot-style datasets with quoted headers.
    """

    if marginalizeOn is None:  # identity test, not ==, per PEP 8
        marginalizeOn = drillOn

    # watch out for filtering too early here; need everything sticking around until baselines are applied
    # ie, maybe I should get rid of all the tgt parms at the pre-benchmark layers
    timings = getSingleResults(
        tgtMovement = tgtMovement,
        tgtPolarity = tgtPolarity,
        tgtAccessor = tgtAccessor,
        tgtInterleave = tgtInterleave)
    timings = getJustCanon( timings,
                            fxInc = ['cfa-cfa', 'lq-tailq', 'upp-upp', 'lq-list'],
                            szInc = ['SM', 'ML'],
                            sExcl = [1] )

    timings = trimPer( timings, earlyFilter )

    # collapse repeated runs of each configuration to one robust duration
    options = timings.groupby(explanations)
    aggregated = options.agg(
        mean_op_dur_ns = ('mean_op_dur_ns', gMeanNoOutlr)
    ).reset_index()
    annotateBaseline(aggregated, marginalizeOn)

    aggregated = trimPer( aggregated, lateFilter )

    # if examining "why CFA slow" need both
    # - getVariousCfa in place of getJustCanon
    # - do annotate-then-filter because baseline needs to stay cfa-tailq-upp
    # (filter-then-annotate is fine for general cases (where all three canons are included) and good for build time)

    c_measure = c('OpDurRel', marginalizeOn)

    # bucketize each relative duration to its bucket's bottom edge
    c_measureBkt = 'BUCKET_' + c_measure
    aggregated[ c_measureBkt ] = aggregated[c_measure].apply( botOfBucketOfVal )

    drillgrp = aggregated.groupby(drillOn)

    for dkey, dgroup in drillgrp:

        dkey_str = list( map( str, dkey ) )
        header = str.join(', ', dkey_str)

        # constant-key groupby: treat the whole drill group as one summary row
        dgroup_sole = dgroup.groupby((lambda _: 0))
        print(f'"{header}"', file=sumFile)
        printSingleSummaryFrom(
            c_measure, dgroup_sole, file=sumFile, index=False )
        print(file=sumFile)
        print(file=sumFile)

        # histogram over the fixed bucket set; absent buckets get a zero count
        histo_raw = dgroup[ c_measureBkt ].value_counts()
        for b in buckets:
            if b not in histo_raw.keys():
                histo_raw[b] = 0
        histo_raw = histo_raw.sort_index()

        # emit rows of (y_lo, y_hi, count) for gnuplot boxes
        histo = histo_raw.rename("count").reset_index()
        histo = histo.rename(columns={c_measureBkt: "y_lo"})
        y_lo_col_loc = histo.columns.get_loc("y_lo")
        histo.insert(y_lo_col_loc + 1, "y_hi", histo["y_lo"].apply(topValOfBucketBotVal))

        print(f'"{header}"', file=detFile)
        text = histo.to_csv(header=False, index=False, sep='\t')
        print(text, file=detFile)
        print(file=detFile)
        print(file=detFile)
Note: See TracBrowser for help on using the repository browser.