source: doc/theses/mike_brooks_MMath/plots/ListCommon.py@741d004

Last change on this file since 741d004 was 408f954, checked in by Michael Brooks <mlbrooks@…>, 9 days ago

fix list perf plotting issues with physical factors

import pandas as pd
import numpy as np
import math
import os
import sys
from subprocess import Popen, PIPE
from scipy.stats import gmean

def getDataset( infile ):
    # grep to remove lines that end in comma; these were error runs
    with Popen("grep '[^,]$' " + infile, shell=True, stdout=PIPE) as process:
        timings = pd.read_csv(
            process.stdout,
            names=['RunMoment', 'RunIdx', 'Args', 'Program', 'Width',
                   'expt_ops_completed', 'expt_elapsed_sec', 'mean_op_dur_ns'],
            dtype={'RunMoment': str,
                   'RunIdx': np.int64,
                   'Args': str,
                   'Program': str,
                   'Width': np.int64,
                   'expt_ops_completed': np.int64,
                   'expt_elapsed_sec': np.float64,
                   'mean_op_dur_ns': np.float64},
            parse_dates=['RunMoment']
        )
    # print(timings.head())

    ## parse executable name and args

    timings[['ExperimentDurSec',
             'CheckDonePeriod',
             'Length',
             'ExperimentDurOpCount',
             'Seed',
             'InterleaveFrac']] = timings['Args'].str.strip().str.split(expand=True)
    timings["Length"] = pd.to_numeric(timings["Length"])
    timings["InterleaveFrac"] = pd.to_numeric(timings["InterleaveFrac"]).round(3)

    timings["NumNodes"] = timings["Length"] * timings["Width"]

    timings[['__ProgramPrefix',
             'fx',
             'op']] = timings['Program'].str.split('--', expand=True)

    timings[['movement',
             'polarity',
             'accessor']] = timings['op'].str.split('-', expand=True)

    ## SizeZone as NumNodes t-shirt size
    timings['SizeZone'] = np.select(
        condlist = [
            ( 6 <= timings['NumNodes']) & (timings['NumNodes'] <= 20),
            (50 <= timings['NumNodes']) & (timings['NumNodes'] <= 200)
        ],
        choicelist = [
            'SM',
            'ML'
        ],
        default = 'none'
    )

    return timings

# `c` = column name
def c( baseName, marginalizeOn ):
    margSlug = str.join( "_", marginalizeOn )
    return baseName + "_" + margSlug
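
# Illustration (added, not from the original source): c('Baseline', ['fx'])
# yields 'Baseline_fx', and c('Baseline', ['SizeZone', 'NumNodes']) yields
# 'Baseline_SizeZone_NumNodes'. These are the derived-column names that
# annotateBaseline reads and writes below.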

explanations = ['movement', 'polarity', 'accessor',
                'NumNodes', 'Width', 'Length',
                'SizeZone', # note fd: NumNodes -> SizeZone
                'fx',
                'machine',
                'InterleaveFrac', # unused and always zero
               ]

# helper for avoiding pollution from, e.g., alternate cfa list versions
# when a preference-limiting factor is marginalized, build the bl value from the preferred subset
# but still stamp the result everywhere; e.g. even cfa-strip has canon-bl-relative perf
# when conditioning on such a factor, peer groups are already small enough to stop such pollution
# use a nontrivial marginalizeOn when calculating baseline values, to achieve the above outside-canonical behaviour non-degenerately
# use the default full marginalizeOn when removing points from a graph, which leaves only canonical points
def getJustCanon( timings,
                  marginalizeOn = explanations, *,
                  # no c++: bl is for comparing intrusives
                  # no lq-list: sparse
                  # no cfa-fredDisbled: bl is for comparing prod-readies
                  fxInc = ['cfa-cfa', 'lq-tailq', 'upp-upp'],
                  szInc = ['SM', 'ML'],
                  sExcl = [1]
                  ): # all explanations marginalized => maximally aggressive filter
    if 'fx' in marginalizeOn:
        fxIsCanon = timings.fx.isin(fxInc)
        timings = timings[ fxIsCanon ]
    if 'SizeZone' in marginalizeOn:
        szIsCanon = timings.SizeZone.isin(szInc)
        timings = timings[ szIsCanon ]
    if 'NumNodes' in marginalizeOn:
        sIsCanon = ~ timings.NumNodes.isin(sExcl)
        timings = timings[ sIsCanon ]
    return timings
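
# Illustration (added; `t` is a hypothetical timings frame): getJustCanon(t)
# applies every filter, leaving only canonical points for plotting, while
# getJustCanon(t, ['fx']) filters on fx alone, keeping all sizes so baselines
# can still be formed per size group.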

# calls = 0

def annotateBaseline( timings, marginalizeOn ):
# global calls
# print( "XXX", calls, marginalizeOn, file=sys.stderr )
    c_tgtPeers = c( 'Peers', marginalizeOn )
    c_tgtBl = c("Baseline", marginalizeOn)
    c_tgtRel = c("OpDurRel", marginalizeOn)
    if c_tgtBl in timings.columns or c_tgtRel in timings.columns:
        assert( c_tgtBl in timings.columns and c_tgtRel in timings.columns )
        return
    # size handling:
    # two ordinary baselines (sz-nn, nn) and one synthetic baseline (sz)
    # the SizeZone-only baseline has no interpretation wrt a real peer group
    # it isolates the effect of belonging to one SZ or the other
    # while conditioning away the specific-size effects within the SZ
    # notably in zone SM, opDur-v-size usually pitches upward
    # comparing to the sz-only baseline gets rid of "they all pitch up," while keeping "SM is faster than ML"
    if 'SizeZone' in marginalizeOn and 'NumNodes' not in marginalizeOn:
        assert( 'Length' not in marginalizeOn ) # need to treat them in lockstep because of fd
        # calls += 1
        # special case: sz-only synthetic benchmark
        margNeither = list( set(marginalizeOn) - {'SizeZone'} )
        margBoth = list( set(marginalizeOn) | {'NumNodes', 'Length'} )
        margJustNn = list( set(margNeither) | {'NumNodes', 'Length'} )
        annotateBaseline( timings, margNeither )
        annotateBaseline( timings, margBoth )
        annotateBaseline( timings, margJustNn )
        c_neitherRel = c("OpDurRel", margNeither)
        c_bothBl = c("Baseline", margBoth)
        c_justNnBl = c("Baseline", margJustNn)
        timings[ c_tgtBl ] = np.nan
        timings[ c_tgtRel ] = timings[ c_justNnBl ] / timings[ c_bothBl ] * timings[ c_neitherRel ]
    else: # general case
        # prevent non-canonical samples from polluting baseline values
        # note, depending on the presentation, the polluting points may already be removed from timings entirely
        canonSrc = getJustCanon(timings, marginalizeOn)
        # print(f"for marg on {marginalizeOn}, |canonSrc| = {len(canonSrc)}, |timings| = {len(timings)}", file=sys.stderr)
        conditionOn = list( set(explanations) - set(marginalizeOn) )
        # print( "marginalizing on", marginalizeOn, "conditioning on", conditionOn, file=sys.stderr )
        # calls += 1

        if conditionOn:
            stats = canonSrc.groupby(conditionOn)['mean_op_dur_ns'].agg(**{
                c_tgtPeers: 'count',
                c_tgtBl: gmean
            })
            group_lookup = timings.set_index(conditionOn).index
            timings[c_tgtPeers] = stats[c_tgtPeers].reindex(group_lookup).values
            timings[c_tgtBl] = stats[c_tgtBl].reindex(group_lookup).values
        else:
            stats = canonSrc.groupby((lambda _: 0))['mean_op_dur_ns'].agg(**{
                c_tgtPeers: 'count',
                c_tgtBl: gmean
            })
            # Extract the single row
            row = stats.iloc[0]
            # Broadcast to all rows
            timings[c_tgtPeers] = row[c_tgtPeers]
            timings[c_tgtBl] = row[c_tgtBl]


        # everywhere := itself / [preferred-subset derived]
        timings[c_tgtRel] = timings['mean_op_dur_ns'] / timings[c_tgtBl]

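# Usage sketch (added; `t` is a hypothetical frame from getDataset): after
# annotateBaseline(t, ['fx']), each row carries Peers_fx (the size of its
# canonical peer group), Baseline_fx (the gmean of mean_op_dur_ns over peers
# that differ only in fx), and OpDurRel_fx (this row's duration relative to
# that baseline).
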
# longer column name (Peers_%, Baseline_%, OpDurRel_%) gives larger peer group and more (total) variation
def annotateCommonBaselines( timings ):
    def applyGeneralExplanations( bgMarginalizeOn ):
        def fg( marginalizeOn ):
            return bgMarginalizeOn + marginalizeOn
        annotateBaseline( timings, fg( [] ) ) # all-in baseline (all factors conditioned): only inter-run differences
        annotateBaseline( timings, fg( ['movement', 'polarity'] ) )
        annotateBaseline( timings, fg( ['accessor'] ) )
        annotateBaseline( timings, fg( ['machine'] ) )

        annotateBaseline( timings, fg( ['SizeZone', 'NumNodes'] ) ) # SizeZone is NOT redundant; conditioned on neither
        annotateBaseline( timings, fg( ['NumNodes'] ) ) # still conditioned on SizeZone
        annotateBaseline( timings, fg( ['SizeZone'] ) ) # synthetic: conditioned on NumNodes but not SizeZone
    applyGeneralExplanations( [] )
    applyGeneralExplanations( ['fx'] )

def getMachineDataset( dsname, machine ):
    infileLocal = f"results-{machine}-{dsname}.csv"
    infile = os.path.dirname(os.path.abspath(__file__)) + '/../benchmarks/list/' + infileLocal
    timings = getDataset( infile )
    timings['machine'] = machine
    return timings

allMachines = ['swift', 'java']


# general, as in exclude the stripped-down experimental CFAs
general_fxs_full = ['cfa-cfa', 'cpp-stlref', 'upp-upp', 'lq-tailq', 'lq-list']
general_fxs_intrusive = ['cfa-cfa', 'upp-upp', 'lq-tailq', 'lq-list']

def getSingleResults(
        dsnames = ['general'],
        machines = allMachines,
        *,
        fxs = general_fxs_full,
        tgtMovement = 'all',
        tgtPolarity = 'all',
        tgtAccessor = 'all',
        tgtInterleave = 0.0 ):

    timings = pd.concat([
        getMachineDataset( d, m )
        for d in dsnames
        for m in machines ])

# print(timings, file=sys.stderr)

    movements = timings['movement'].unique()
    polarities = timings['polarity'].unique()
    accessors = timings['accessor'].unique()
    interleaves = timings['InterleaveFrac'].unique()

    if movements.size > 1:
        movements = np.append(movements, 'all')
    if polarities.size > 1:
        polarities = np.append(polarities, 'all')
    if accessors.size > 1:
        accessors = np.append(accessors, 'all')

# print(f"trying to filter {dsname} {machines} {len(timings)}", file=sys.stderr)
    grp = timings.groupby('fx')
# print(f"with fxs {grp.groups.keys()}", file=sys.stderr)
    timings = pd.concat([
        grp.get_group(fx)
        for fx in fxs ])

    if (tgtMovement != 'all'):
        grp = timings.groupby('movement')
        timings = grp.get_group(tgtMovement)
    if (tgtPolarity != 'all'):
        grp = timings.groupby('polarity')
        timings = grp.get_group(tgtPolarity)
    if (tgtAccessor != 'all'):
        grp = timings.groupby('accessor')
        timings = grp.get_group(tgtAccessor)
    if (tgtInterleave != 'all'):
        timings = timings[ timings['InterleaveFrac'] == float(tgtInterleave) ]


    return timings

def stripMachine(pyCore):
    parts = str.split(pyCore, '-')
    exceptLast = parts[ 0 : -1 ]
    return str.join('-', exceptLast)
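
# For example (illustrative input): stripMachine('ops-swift') returns 'ops';
# the helper drops the final '-'-separated component, which by convention here
# names the machine.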

def getSummaryMeta(metaFileCore):
    metafile = os.path.dirname(os.path.abspath(__file__)) + "/" + metaFileCore + '-meta.dat'
    metadata = pd.read_csv(
        metafile,
        names=['OpIx', 'Op'],
        delimiter='\t'
    )
    metadata[['movement',
              'polarity',
              'accessor']] = metadata['Op'].str.split('\\\\n', expand=True)
    metadata.replace('*', 'all', inplace=True)
    metadata.replace('S', 'stack', inplace=True)
    metadata.replace('Q', 'queue', inplace=True)
    metadata.replace('iF', 'insfirst', inplace=True)
    metadata.replace('iL', 'inslast', inplace=True)
    metadata.replace('H', 'allhead', inplace=True)
    metadata.replace('Ie', 'inselem', inplace=True)
    metadata.replace('Re', 'remelem', inplace=True)
    return metadata

def printSingleSummaryFrom( measure, dfgrouped, *,
                            file = sys.stdout,
                            index = True,
                            end = '\n' ):
    aggregated = dfgrouped[measure].agg([
        ("gmean", gmean), "std", "min", "max", "count",
        lambda x: x.quantile(0.025),
        lambda x: x.quantile(0.16),
        lambda x: x.quantile(0.5),
        lambda x: x.quantile(0.84),
        lambda x: x.quantile(0.975)
    ])
    text = aggregated.to_csv(header=False, index=index, sep='\t')
    print(text, file=file, end=end)
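
# Note (added for clarity): the column order produced here appears to line up
# with the header that printManySummary emits below: gmean -> mean, std ->
# stdev, then min, max, count, and the five quantiles -> pl95, pl68, p50,
# ph68, ph95 (the 2.5th/16th/50th/84th/97.5th percentiles, i.e. ~95% and ~68%
# intervals around the median).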


swiftSweetspot = (lambda x: x > 16 and x < 150)
# swiftSweetspot = (lambda x: x > 4 and x < 32)
javaSweetspot = (lambda x: x >= 24 and x <= 256)

def printManySummary(*,
        dsnames = ['general'],
        machines = allMachines,
        metafileCore,
        fxs,
        sizeQual,
        tgtInterleave = 0.0,
        marginalizeOn = ['fx'] ):

    metadata = getSummaryMeta(metafileCore)

    measure = c( 'OpDurRel', marginalizeOn )

    print("# op_num\tfx_num\tfx\tmean\tstdev\tmin\tmax\tcount\tpl95\tpl68\tp50\tph68\tph95")

    for op in metadata.itertuples():
        timings = getSingleResults(dsnames, machines,
                                   fxs=fxs,
                                   tgtMovement = op.movement,
                                   tgtPolarity = op.polarity,
                                   tgtAccessor = op.accessor,
                                   tgtInterleave = tgtInterleave )
        annotateBaseline(timings, marginalizeOn)

        timings = timings[ timings['fx'].isin(fxs) ]
        timings = timings[ timings['NumNodes'].apply(sizeQual) ]

        fxnums = timings['fx'].apply(
            lambda fx: fxs.index(fx) + 1
        )
        timings.insert(loc=0, column='fx_num', value=fxnums)
        timings.insert(loc=0, column='op_num', value=op.OpIx)

        grouped = timings.groupby(['op_num', 'fx_num', 'fx'])
        printSingleSummaryFrom( measure, grouped, end ='' )


def printSingleDetail(
        dsnames = ['general'],
        machines = allMachines,
        *,
        fxs = general_fxs_full,
        tgtMovement = 'all',
        tgtPolarity = 'all',
        tgtAccessor = 'all',
        tgtInterleave = 0.0,
        measureBase = 'mean_op_dur_ns',
        marginalizeOn = explanations ):


    timings = getSingleResults(dsnames, machines,
                               fxs = fxs,
                               tgtMovement = tgtMovement,
                               tgtPolarity = tgtPolarity,
                               tgtAccessor = tgtAccessor,
                               tgtInterleave = tgtInterleave)

    if measureBase == 'OpDurRel':
        annotateBaseline(timings, marginalizeOn)
        measure = c( measureBase, marginalizeOn )
    elif measureBase == 'mean_op_dur_ns':
        measure = measureBase
    else:
        raise RuntimeError(f"measureBase '{measureBase}' not handled")

    groupedFx = timings.groupby('fx')
    for fx, fgroup in groupedFx:
        # print(fgroup.head())
        groupedRun = fgroup.groupby(['NumNodes']) # , 'fx', 'op'
        aggregated = groupedRun[measure].agg(
            ["mean", "std", "min", "max", "count", "sum"]
        )
        aggregated['mean_no_outlr'] = (
            ( aggregated['sum'] - aggregated['min'] - aggregated['max'] )
            /
            ( aggregated['count'] - 2 )
        )

        #print(aggregated.head())

        print('"{header}"'.format(header=fx))
        text = aggregated.to_csv(header=False, index=True, sep='\t')
        print(text)
        print()
        print()

def aMeanNoOutlr(vals):
    return ( vals.sum() - vals.min() - vals.max() ) / ( vals.count() - 2 )

def gMeanNoOutlr(vals):
    return ( vals.prod() / vals.min() / vals.max() ) ** ( 1 / ( vals.count() - 2 ) )
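
# Both helpers drop exactly one min and one max sample before averaging.
# Illustrative check: for the series [1, 2, 4, 8], aMeanNoOutlr gives
# (15 - 1 - 8) / 2 = 3.0 and gMeanNoOutlr gives (64 / 1 / 8) ** (1/2) ~= 2.83,
# the arithmetic and geometric means of the surviving {2, 4}.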


def trimPer( df, criteria ):
    for field, values in criteria.items():
        areMatches = df[ field ].isin(values)
        df = df[ areMatches ]
    return df
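
# Illustrative call (hypothetical criteria): trimPer(df, {'fx': ['cfa-cfa'],
# 'machine': ['swift']}) keeps only rows whose fx and machine both match.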

# The range from 0.9759 to 1.0247 (which is 1.05x wide) has 1.0 at its centre.
# This is the bucket with key 0.
# Logs (base 1.05) of values in this bucket go from -0.5 to +0.5.
# Rounding a log value to the nearest integer gives the key.
# Exponentiating a key directly gives the centre of its bucket.
# Exponentiating a key less 0.5 gives the bottom of its bucket.
# Gnuplot expects the latter.

bucketMin = 0.25
bucketMax = 4.0
bucketGrain = 1.05
bktKeyLo = math.floor( math.log(bucketMin, bucketGrain) )
bktKeyHi = math.ceil( math.log(bucketMax, bucketGrain) )

def bktKeyOfVal( relDur ):
    distance = math.log(relDur, bucketGrain)
    key = round( distance )
    return key

def bktIxOfVal( relDur ):
    return bktKeyToIx( bktKeyOfVal( relDur ) )

def botValOfBucketK( key ):
    return bucketGrain ** ( key - 0.5 )

def topValOfBucketBotVal( botVal ):
    return bucketGrain * botVal

def bktKeyToIx( key ):
    return key - bktKeyLo

def bktIxToKey( ix ):
    return ix + bktKeyLo

def botOfBucketOfVal( relDur ):
    return botValOfBucketK( bktKeyOfVal( relDur ) )

buckets = [ botValOfBucketK(key) for key in range(bktKeyLo, bktKeyHi) ]
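
# Worked example (added illustration): relDur = 1.10 has log_1.05(1.10) ~= 1.95,
# so bktKeyOfVal rounds it to key 2; botValOfBucketK(2) = 1.05 ** 1.5 ~= 1.076
# and topValOfBucketBotVal of that ~= 1.130, so the bucket [1.076, 1.130)
# indeed contains 1.10.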

# printSingleDetail
def printHistos(*,
        tgtMovement = 'all',
        tgtPolarity = 'all',
        tgtAccessor = 'all',
        tgtInterleave = 0.0,
        earlyFilter = {}, # exclude from benchmarking
        lateFilter = {}, # exclude from output
        drillOn = ['fx'],
        marginalizeOn = None, # None means match drill-on
        sumFile = sys.stdout,
        detFile = sys.stdout ):

    if marginalizeOn is None:
        marginalizeOn = drillOn

    # watch out for filtering too early here; need everything sticking around until baselines are applied
    # i.e., maybe I should get rid of all the tgt parms at the pre-benchmark layers
    timings = getSingleResults(
        tgtMovement = tgtMovement,
        tgtPolarity = tgtPolarity,
        tgtAccessor = tgtAccessor,
        tgtInterleave = tgtInterleave)
    timings = getJustCanon( timings,
                            fxInc = ['cfa-cfa', 'lq-tailq', 'upp-upp', 'lq-list'],
                            szInc = ['SM', 'ML'],
                            sExcl = [1] )

    timings = trimPer( timings, earlyFilter )

    options = timings.groupby(explanations)
    aggregated = options.agg(
        mean_op_dur_ns = ('mean_op_dur_ns', gMeanNoOutlr)
    ).reset_index()
    annotateBaseline(aggregated, marginalizeOn)

    aggregated = trimPer( aggregated, lateFilter )

    # if examining "why CFA slow" need both
    # - getVariousCfa in place of getJustCanon
    # - do annotate-then-filter because the baseline needs to stay cfa-tailq-upp
    #   (filter-then-annotate is fine for general cases (where all three canons are included) and good for build time)


    c_measure = c('OpDurRel', marginalizeOn)
    # options = timings.groupby(explanations)

    # aggregated = options.agg(
    #     **{measure:(measure,gMeanNoOutlr)}
    # ).reset_index()

    c_measureBkt = 'BUCKET_' + c_measure
    aggregated[ c_measureBkt ] = aggregated[c_measure].apply( botOfBucketOfVal )

    drillgrp = aggregated.groupby(drillOn)

    # print(f'measure is {measure}')
    # print()
    # print()

    for dkey, dgroup in drillgrp:
# print(mgroup, file=sys.stderr)

        dkey_str = list( map( str, dkey ) )
        header = str.join(', ', dkey_str)

        dgroup_sole = dgroup.groupby((lambda _: 0))
        print(f'"{header}"', file=sumFile)
        printSingleSummaryFrom(
            c_measure, dgroup_sole, file=sumFile, index=False )
        print(file=sumFile)
        print(file=sumFile)

        histo_raw = dgroup[ c_measureBkt ].value_counts()
        for b in buckets:
            if b not in histo_raw.keys():
# print( f"{b} := 0", file=sys.stderr )
                histo_raw[b] = 0
        histo_raw = histo_raw.sort_index()

        histo = histo_raw.rename("count").reset_index()
        histo = histo.rename(columns={c_measureBkt: "y_lo"})
        y_lo_col_loc = histo.columns.get_loc("y_lo")
        histo.insert(y_lo_col_loc + 1, "y_hi", histo["y_lo"].apply(topValOfBucketBotVal))

        print(f'"{header}"', file=detFile)
        text = histo.to_csv(header=False, index=False, sep='\t')
        print(text, file=detFile)
        print(file=detFile)
        print(file=detFile)

        # print(f'"{header}" FULL')
        # text = group.to_csv(header=False, index=True, sep='\t')
        # print(text)
        # print()
        # print()

    # print(f'"RAW"')
    # text = timings.to_csv(header=False, index=True, sep='\t')
    # print(text)
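

# Hedged smoke test (an illustrative addition, not part of the original
# plotting pipeline): it exercises only the pure helpers above, so it runs
# without any benchmark CSV files present.
if __name__ == '__main__':
    assert c('Baseline', ['fx', 'machine']) == 'Baseline_fx_machine'
    assert bktKeyOfVal(1.0) == 0              # 1.0 sits in the key-0 bucket
    assert bktKeyToIx(bktIxToKey(7)) == 7     # key/index conversions invert
    lo = botValOfBucketK(0)
    hi = topValOfBucketBotVal(lo)
    assert lo < 1.0 < hi                      # bucket 0 straddles 1.0
    s = pd.Series([1.0, 2.0, 4.0, 8.0])
    assert aMeanNoOutlr(s) == 3.0             # (2 + 4) / 2 after dropping 1 and 8
    assert abs(gMeanNoOutlr(s) - math.sqrt(8.0)) < 1e-12  # sqrt(2 * 4)
    print('ListCommon self-checks passed')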