source: doc/theses/mike_brooks_MMath/plots/ListCommon.py @ e35ecd0

Last change on this file since e35ecd0 was e35ecd0, checked in by Michael Brooks <mlbrooks@…>, 2 weeks ago

save draft of list perf fx-interaction plot

import pandas as pd
import numpy as np
import math
import os
from subprocess import Popen, PIPE
from scipy.stats import gmean

def getDataset( infile ):
    # grep to remove lines that end in comma; these were error runs
    with Popen("grep '[^,]$' " + infile, shell=True, stdout=PIPE) as process:
        timings = pd.read_csv(
            process.stdout,
            names=['RunMoment', 'RunIdx', 'Args', 'Program', 'Width',
                   'expt_ops_completed', 'expt_elapsed_sec', 'mean_op_dur_ns'],
            dtype={'RunMoment': str,
                   'RunIdx': np.int64,
                   'Args': str,
                   'Program': str,
                   'Width': np.int64,
                   'expt_ops_completed': np.int64,
                   'expt_elapsed_sec': np.float64,
                   'mean_op_dur_ns': np.float64},
            parse_dates=['RunMoment']
        )
    # print(timings.head())

    ## parse executable name and args

    timings[['ExperimentDurSec',
             'CheckDonePeriod',
             'Length',
             'ExperimentDurOpCount',
             'Seed',
             'InterleaveFrac']] = timings['Args'].str.strip().str.split(expand=True)
    timings["Length"] = pd.to_numeric(timings["Length"])
    timings["InterleaveFrac"] = pd.to_numeric(timings["InterleaveFrac"]).round(3)

    timings["NumNodes"] = timings["Length"] * timings["Width"]

    timings[['__ProgramPrefix',
             'fx',
             'op']] = timings['Program'].str.split('--', expand=True)

    timings[['movement',
             'polarity',
             'accessor']] = timings['op'].str.split('-', expand=True)

    ## SizeZone as NumNodes t-shirt size
    timings['SizeZone'] = np.select(
        condlist = [
            (4 <= timings['NumNodes']) & (timings['NumNodes'] <= 16),
            (48 <= timings['NumNodes']) & (timings['NumNodes'] <= 256)
        ],
        choicelist = [
            'SM',
            'ML'
        ],
        default = 'none'
    )

    return timings

# `c` = column name
def c( baseName, marginalizeOn ):
    margSlug = str.join( "_", marginalizeOn )
    return baseName + "_" + margSlug
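# e.g. (illustrative): c('Baseline', ['fx', 'machine']) == 'Baseline_fx_machine'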

explanations = ['movement', 'polarity', 'accessor',
                'NumNodes',
                'SizeZone', # note the functional dependency: NumNodes -> SizeZone
                'fx',
                'machine',
                'InterleaveFrac', # unused and always zero
               ]

# helper for avoiding pollution from e.g. alternate cfa list versions
# when a preference-limiting factor is marginalized, make the baseline value from the preferred subset
# but still stamp the result everywhere; e.g. even cfa-strip has a canon-baseline-relative perf
# when conditioning on such a factor, peer groups are already small enough to stop such pollution
# use a nontrivial marginalizeOn when calculating baseline values, to achieve the above outside-canonical behaviour non-degenerately
# use the default full marginalizeOn when removing points from a graph, which leaves only canonical points
def getJustCanon( timings,
                  marginalizeOn = explanations, *,
                  # no c++: baseline is for comparing intrusives
                  # no lq-list: sparse
                  # no cfa-fredDisbled: baseline is for comparing prod-readies
                  fxInc = ['cfa-cfa', 'lq-tailq', 'upp-upp'],
                  szInc = ['SM', 'ML'],
                  sExcl = [1]
                ): # all explanations marginalized => maximally aggressive filter
    if 'fx' in marginalizeOn:
        fxIsCanon = timings.fx.isin(fxInc)
        timings = timings[ fxIsCanon ]
    if 'SizeZone' in marginalizeOn:
        szIsCanon = timings.SizeZone.isin(szInc)
        timings = timings[ szIsCanon ]
    if 'NumNodes' in marginalizeOn:
        sIsCanon = ~ timings.NumNodes.isin(sExcl)
        timings = timings[ sIsCanon ]
    return timings
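# Illustrative of the two usage patterns above (both appear later in this file):
#   getJustCanon(timings, marginalizeOn)   # in annotateBaseline: derive baselines from the preferred subset
#   getJustCanon(timings)                  # in printHistos (with fxInc widened): keep only canonical points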


def annotateBaseline( timings, marginalizeOn ):
    c_tgtPeers = c( 'Peers', marginalizeOn )
    c_tgtBl = c("Baseline", marginalizeOn)
    c_tgtRel = c("OpDurRel", marginalizeOn)
    if c_tgtBl in timings.columns or c_tgtRel in timings.columns:
        assert( c_tgtBl in timings.columns and c_tgtRel in timings.columns )
        return
    # size handling:
    # two ordinary baselines (sz-nn, nn) and one synthetic baseline (sz)
    # the SizeZone-only baseline has no interpretation wrt a real peer group
    # it isolates the effect of belonging to one SZ or the other
    # while conditioning away the specific-size effects within the SZ
    # notably in zone SM, opDur-v-size usually pitches upward
    # comparing to the sz-only baseline gets rid of "they all pitch up," while keeping "SM is faster than ML"
    if 'SizeZone' in marginalizeOn and 'NumNodes' not in marginalizeOn:
        # special case: sz-only synthetic baseline
        margNeither = list( set(marginalizeOn) - {'SizeZone'} )
        margBoth = list( set(marginalizeOn) | {'NumNodes'} )
        margJustNn = list( set(margNeither) | {'NumNodes'} )
        annotateBaseline( timings, margNeither )
        annotateBaseline( timings, margBoth )
        annotateBaseline( timings, margJustNn )
        c_neitherRel = c("OpDurRel", margNeither)
        c_bothBl = c("Baseline", margBoth)
        c_justNnBl = c("Baseline", margJustNn)
        timings[ c_tgtBl ] = np.nan
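        # Sketch of how the synthetic relative value composes (a reading of the line below):
        # Baseline_{+NumNodes} / Baseline_{+SizeZone,NumNodes} is the zone-membership
        # effect (the zone-level baseline over the cross-zone baseline); multiplying by
        # the fully size-conditioned OpDurRel re-applies each sample's own variation.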
        timings[ c_tgtRel ] = timings[ c_justNnBl ] / timings[ c_bothBl ] * timings[ c_neitherRel ]
    else: # general case
        # prevent non-canonical samples from polluting baseline values
        # note, depending on the presentation, the polluting points may already be removed from timings entirely
        canonSrc = getJustCanon(timings, marginalizeOn)
        # print(f"for marg on {marginalizeOn}, |canonSrc| = {len(canonSrc)}, |timings| = {len(timings)}", file=sys.stderr)
        conditionOn = list( set(explanations) - set(marginalizeOn) )
        # print( "marginalizing on", marginalizeOn, "conditioning on", conditionOn, file=sys.stderr )

        if conditionOn:
            stats = canonSrc.groupby(conditionOn)['mean_op_dur_ns'].agg(**{
                c_tgtPeers: 'count',
                c_tgtBl: gmean
            })
            group_lookup = timings.set_index(conditionOn).index
            timings[c_tgtPeers] = stats[c_tgtPeers].reindex(group_lookup).values
            timings[c_tgtBl] = stats[c_tgtBl].reindex(group_lookup).values
        else:
            stats = canonSrc.groupby((lambda _: 0))['mean_op_dur_ns'].agg(**{
                c_tgtPeers: 'count',
                c_tgtBl: gmean
            })
            # extract the single row
            row = stats.iloc[0]
            # broadcast to all rows
            timings[c_tgtPeers] = row[c_tgtPeers]
            timings[c_tgtBl] = row[c_tgtBl]

        # everywhere := itself / [preferred-subset derived]
        timings[c_tgtRel] = timings['mean_op_dur_ns'] / timings[c_tgtBl]


# a longer column name (Peers_%, Baseline_%, OpDurRel_%) gives a larger peer group and more (total) variation
def annotateCommonBaselines( timings ):
    def applyGeneralExplanations( bgMarginalizeOn ):
        def fg( marginalizeOn ):
            return bgMarginalizeOn + marginalizeOn
        annotateBaseline( timings, fg( [] ) ) # all-in baseline (all factors conditioned): only inter-run differences
        annotateBaseline( timings, fg( ['movement', 'polarity'] ) )
        annotateBaseline( timings, fg( ['accessor'] ) )
        annotateBaseline( timings, fg( ['machine'] ) )

        annotateBaseline( timings, fg( ['SizeZone', 'NumNodes'] ) ) # SizeZone is NOT redundant; conditioned on neither
        annotateBaseline( timings, fg( ['NumNodes'] ) ) # still conditioned on SizeZone
        annotateBaseline( timings, fg( ['SizeZone'] ) ) # synthetic: conditioned on NumNodes but not SizeZone
    applyGeneralExplanations( [] )
    applyGeneralExplanations( ['fx'] )

def getMachineDataset( dsname, machine ):
    infileLocal = f"results-{machine}-{dsname}.csv"
    infile = os.path.dirname(os.path.abspath(__file__)) + '/../benchmarks/list/' + infileLocal
    timings = getDataset( infile )
    timings['machine'] = machine
    return timings

allMachines = ['swift', 'java']


# general, as in: exclude the stripped-down experimental CFAs
general_fxs_full = ['cfa-cfa', 'cpp-stlref', 'upp-upp', 'lq-tailq', 'lq-list']
general_fxs_intrusive = ['cfa-cfa', 'upp-upp', 'lq-tailq', 'lq-list']

def getSingleResults(
        dsname = 'general',
        machines = allMachines,
        *,
        fxs = general_fxs_full,
        tgtMovement = 'all',
        tgtPolarity = 'all',
        tgtAccessor = 'all',
        tgtInterleave = 0.0 ):

    timings = pd.concat([
        getMachineDataset( dsname, m )
        for m in machines ])

    # print(timings, file=sys.stderr)

    movements = timings['movement'].unique()
    polarities = timings['polarity'].unique()
    accessors = timings['accessor'].unique()
    interleaves = timings['InterleaveFrac'].unique()

    if movements.size > 1:
        movements = np.append(movements, 'all')
    if polarities.size > 1:
        polarities = np.append(polarities, 'all')
    if accessors.size > 1:
        accessors = np.append(accessors, 'all')

    # print(f"trying to filter {dsname} {machines} {len(timings)}", file=sys.stderr)
    grp = timings.groupby('fx')
    # print(f"with fxs {grp.groups.keys()}", file=sys.stderr)
    timings = pd.concat([
        grp.get_group(fx)
        for fx in fxs ])

    if (tgtMovement != 'all'):
        grp = timings.groupby('movement')
        timings = grp.get_group(tgtMovement)
    if (tgtPolarity != 'all'):
        grp = timings.groupby('polarity')
        timings = grp.get_group(tgtPolarity)
    if (tgtAccessor != 'all'):
        grp = timings.groupby('accessor')
        timings = grp.get_group(tgtAccessor)
    if (tgtInterleave != 'all'):
        timings = timings[ timings['InterleaveFrac'] == float(tgtInterleave) ]

    return timings
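# Hypothetical call (a sketch; 'swift' comes from allMachines above):
#   t = getSingleResults('general', ['swift'], fxs=general_fxs_intrusive)
#   # -> swift-machine runs only, intrusive lists only, all ops, InterleaveFrac == 0.0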

def stripMachine(pyCore):
    parts = str.split(pyCore, '-')
    exceptLast = parts[ 0 : -1 ]
    return str.join('-', exceptLast)

def getSummaryMeta(metaFileCore):
    metafile = os.path.dirname(os.path.abspath(__file__)) + "/" + metaFileCore + '-meta.dat'
    metadata = pd.read_csv(
        metafile,
        names=['OpIx', 'Op'],
        delimiter='\t'
    )
    metadata[['movement',
              'polarity',
              'accessor']] = metadata['Op'].str.split('\\\\n', expand=True)
    metadata.replace('*', 'all', inplace=True)
    metadata.replace('S', 'stack', inplace=True)
    metadata.replace('Q', 'queue', inplace=True)
    metadata.replace('iF', 'insfirst', inplace=True)
    metadata.replace('iL', 'inslast', inplace=True)
    metadata.replace('H', 'allhead', inplace=True)
    metadata.replace('Ie', 'inselem', inplace=True)
    metadata.replace('Re', 'remelem', inplace=True)
    return metadata
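# Illustrative meta line (format inferred from the parsing above):
#   3<TAB>S\niF\n*
# The Op field splits on literal backslash-n separators (the '\\\\n' regex) into
# movement='S', polarity='iF', accessor='*', which the replace() calls map to
# 'stack', 'insfirst', and 'all'.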

swiftSweetspot = (lambda x: x > 16 and x < 150)
# swiftSweetspot = (lambda x: x > 4 and x < 32)
javaSweetspot = (lambda x: x >= 24 and x <= 256)

def printManySummary(*,
                     dsname = 'general',
                     machines = allMachines,
                     metafileCore,
                     fxs,
                     sizeQual,
                     tgtInterleave = 0.0,
                     marginalizeOn = ['fx'] ):

    metadata = getSummaryMeta(metafileCore)

    measure = c( 'OpDurRel', marginalizeOn )

    print("# op_num\tfx_num\tfx\tmean\tstdev\tmin\tmax\tcount\tpl95\tpl68\tp50\tph68\tph95")

    for op in metadata.itertuples():
        timings = getSingleResults(dsname, machines,
                                   fxs=fxs,
                                   tgtMovement = op.movement,
                                   tgtPolarity = op.polarity,
                                   tgtAccessor = op.accessor,
                                   tgtInterleave = tgtInterleave )
        annotateBaseline(timings, marginalizeOn)

        timings = timings[ timings['fx'].isin(fxs) ]
        timings = timings[ timings['NumNodes'].apply(sizeQual) ]

        fxnums = timings['fx'].apply(
            lambda fx: fxs.index(fx) + 1
        )
        timings.insert(loc=0, column='fx_num', value=fxnums)
        timings.insert(loc=0, column='op_num', value=op.OpIx)

        grouped = timings.groupby(['op_num', 'fx_num', 'fx'])

        aggregated = grouped[measure].agg(
            ["mean", "std", "min", "max", "count",
             lambda x: x.quantile(0.025),
             lambda x: x.quantile(0.16),
             lambda x: x.quantile(0.5),
             lambda x: x.quantile(0.84),
             lambda x: x.quantile(0.975)]
        )

        text = aggregated.to_csv(header=False, index=True, sep='\t')
        print(text, end='')

def printSingleDetail(
        dsname = 'general',
        machines = allMachines,
        *,
        fxs = general_fxs_full,
        tgtMovement = 'all',
        tgtPolarity = 'all',
        tgtAccessor = 'all',
        tgtInterleave = 0.0,
        measureBase = 'mean_op_dur_ns',
        marginalizeOn = explanations ):

    timings = getSingleResults(dsname, machines,
                               fxs = fxs,
                               tgtMovement = tgtMovement,
                               tgtPolarity = tgtPolarity,
                               tgtAccessor = tgtAccessor,
                               tgtInterleave = tgtInterleave)

    if measureBase == 'OpDurRel':
        annotateBaseline(timings, marginalizeOn)
        measure = c( measureBase, marginalizeOn )
    elif measureBase == 'mean_op_dur_ns':
        measure = measureBase
    else:
        raise RuntimeError(f"measureBase '{measureBase}' not handled")

    groupedFx = timings.groupby('fx')
    for fx, fgroup in groupedFx:
        # print(fgroup.head())
        groupedRun = fgroup.groupby(['NumNodes']) # , 'fx', 'op'
        aggregated = groupedRun[measure].agg(
            ["mean", "std", "min", "max", "count", "sum"]
        )
        aggregated['mean_no_outlr'] = (
            ( aggregated['sum'] - aggregated['min'] - aggregated['max'] )
            /
            ( aggregated['count'] - 2 )
        )

        # print(aggregated.head())

        print('"{header}"'.format(header=fx))
        text = aggregated.to_csv(header=False, index=True, sep='\t')
        print(text)
        print()
        print()

def aMeanNoOutlr(range):
    return ( range.sum() - range.min() - range.max() ) / ( range.count() - 2 )

def gMeanNoOutlr(range):
    return ( range.prod() / range.min() / range.max() ) ** ( 1 / ( range.count() - 2 ) )
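# Worked example (illustrative): for the values [1, 2, 8] (sum 11, product 16),
#   aMeanNoOutlr -> (11 - 1 - 8) / (3 - 2) = 2.0
#   gMeanNoOutlr -> (16 / 1 / 8) ** (1 / (3 - 2)) = 2.0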


def trimPer( df, criteria ):
    for field, values in criteria.items():
        areMatches = df[ field ].isin(values)
        df = df[ areMatches ]
    return df

# The range from 0.9759 to 1.0247 (which is 1.05x wide) has 1.0 at its centre.
# This is the bucket with key 0.
# Logs (base 1.05) of values in this bucket go from -0.5 to +0.5.
# Rounding a log value to the nearest integer gives the key.
# Exponentiating a key directly gives the centre of its bucket.
# Exponentiating a key less 0.5 gives the bottom of its bucket.
# Gnuplot expects the latter.

bucketMin = 0.25
bucketMax = 4.0
bucketGrain = 1.05
bktKeyLo = math.floor( math.log(bucketMin, bucketGrain) )
bktKeyHi = math.ceil( math.log(bucketMax, bucketGrain) )

def bktKeyOfVal( relDur ):
    distance = math.log(relDur, bucketGrain)
    key = round( distance )
    return key

def bktIxOfVal( relDur ):
    return bktKeyToIx( bktKeyOfVal( relDur ) )

def botValOfBucketK( key ):
    return bucketGrain ** ( key - 0.5 )

def topValOfBucketBotVal( botVal ):
    return bucketGrain * botVal

def bktKeyToIx( key ):
    return key - bktKeyLo

def bktIxToKey( ix ):
    return ix + bktKeyLo

def botOfBucketOfVal( relDur ):
    return botValOfBucketK( bktKeyOfVal( relDur ) )

buckets = [ botValOfBucketK(key) for key in range(bktKeyLo, bktKeyHi) ]
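# Sanity check of the bucket arithmetic above (illustrative):
#   bucketGrain ** -0.5 ~= 0.9759 and bucketGrain ** +0.5 ~= 1.0247
#   bktKeyOfVal(1.0) == 0; botValOfBucketK(0) == bucketGrain ** -0.5
#   topValOfBucketBotVal(botValOfBucketK(0)) == bucketGrain ** +0.5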

# like printSingleDetail, but bucketed into histograms
def printHistos(*,
                tgtMovement = 'all',
                tgtPolarity = 'all',
                tgtAccessor = 'all',
                tgtInterleave = 0.0,
                earlyFilter = {}, # exclude from benchmarking
                lateFilter = {}, # exclude from output
                drillOn = ['fx'],
                marginalizeOn = None ): # None means match drill-on

    if marginalizeOn is None:
        marginalizeOn = drillOn

    # watch out for filtering too early here; need everything sticking around until baselines are applied
    # i.e., maybe I should get rid of all the tgt parms at the pre-benchmark layers
    timings = getSingleResults(
        tgtMovement = tgtMovement,
        tgtPolarity = tgtPolarity,
        tgtAccessor = tgtAccessor,
        tgtInterleave = tgtInterleave)
    timings = getJustCanon( timings,
                            fxInc = ['cfa-cfa', 'lq-tailq', 'upp-upp', 'lq-list'],
                            szInc = ['SM', 'ML'],
                            sExcl = [1] )

    timings = trimPer( timings, earlyFilter )

    options = timings.groupby(explanations)
    aggregated = options.agg(
        mean_op_dur_ns = ('mean_op_dur_ns', gMeanNoOutlr)
    ).reset_index()
    annotateBaseline(aggregated, marginalizeOn)

    aggregated = trimPer( aggregated, lateFilter )

    # if examining "why CFA slow", need both
    # - getVariousCfa in place of getJustCanon
    # - annotate-then-filter, because the baseline needs to stay cfa-tailq-upp
    # (filter-then-annotate is fine for general cases (where all three canons are included) and good for build time)

    c_measure = c('OpDurRel', marginalizeOn)
    # options = timings.groupby(explanations)

    # aggregated = options.agg(
    #     **{measure:(measure,gMeanNoOutlr)}
    # ).reset_index()

    c_measureBkt = 'BUCKET_' + c_measure
    aggregated[ c_measureBkt ] = aggregated[c_measure].apply( botOfBucketOfVal )

    drillgrp = aggregated.groupby(drillOn)

    # print(f'measure is {c_measure}')
    # print()
    # print()

    for dkey, dgroup in drillgrp:
        # print(dgroup, file=sys.stderr)

        histo_raw = dgroup[ c_measureBkt ].value_counts()
        for b in buckets:
            if b not in histo_raw.keys():
                # print( f"{b} := 0", file=sys.stderr )
                histo_raw[b] = 0
        histo_raw = histo_raw.sort_index()

        histo = histo_raw.rename("count").reset_index()
        histo = histo.rename(columns={c_measureBkt: "y_lo"})
        y_lo_col_loc = histo.columns.get_loc("y_lo")
        histo.insert(y_lo_col_loc + 1, "y_hi", histo["y_lo"].apply(topValOfBucketBotVal))

        dkey_str = list( map( str, dkey ) )
        header = str.join(', ', dkey_str)
        print(f'"{header}"')
        text = histo.to_csv(header=False, index=False, sep='\t')
        print(text)
        print()
        print()

        # print(f'"{header}" FULL')
        # text = dgroup.to_csv(header=False, index=True, sep='\t')
        # print(text)
        # print()
        # print()

    # print(f'"RAW"')
    # text = timings.to_csv(header=False, index=True, sep='\t')
    # print(text)
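# Hypothetical driver (a sketch; the filter values are assumptions):
#   printHistos( tgtInterleave = 0.0,
#                earlyFilter = { 'machine': ['swift'] },
#                drillOn = ['fx'] )
# would print one histogram block per fx, bucketed by OpDurRel_fx.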