source: doc/theses/mike_brooks_MMath/benchmarks/string/make-corpus.cfa

Last change on this file was e0350e0, checked in by Michael Brooks <mlbrooks@…>, 4 months ago

Recent rework of string benchmarks

  • Property mode set to 100644
File size: 4.2 KB
Line 
1#include <stdlib.h>
2#include <math.h>
3#include <limits.h>
4#include <unistd.h>
5#include <string.h>
6#include <assert.h>
7#include <stdio.h>
8
9#include <stdlib.h>
10#include <math.h>
11#include <limits.h>
12#include <unistd.h>
13
// Debug-trace switch consulted by PRTF: 1 when VERBOSE is defined at compile
// time, 0 otherwise.
#ifdef VERBOSE
int printing = 1;
#else
int printing = 0;
#endif
21
// Debug trace: printf-style output to stderr, emitted only when `printing` is nonzero.
// The do/while(0) wrapper makes the expansion a single statement; a bare `if` here
// would capture the `else` of an unbraced if/else written around a PRTF call.
#define PRTF(fmt, ...) do { if (printing) fprintf(stderr, fmt, __VA_ARGS__); } while (0)
23
// Uniform draw on the open interval (0,1).
// rand() is shifted by one and the range widened by two, so neither 0 nor 1
// can be returned.
static double U() {
    const double numerator = (double)rand() + 1;
    const double denominator = (double)RAND_MAX + 2;
    return numerator / denominator;
}
28
// Log-normal distribution parameters: written by initialize(), read by nextLognRand().
static double logn_mu;      // mu term in exp(mu + sigma * Z)
static double logn_sigma;   // sigma term in exp(mu + sigma * Z)
31
32// returns a draw from N(0,1)
33// based on https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform, section Implementation
34static double rand_std_normal() {
35 double u1 = U();
36 double u2 = U();
37 double ret = sqrt(-2.0 * log(u1)) * cos(2.0 * M_PI * u2);
38 PRTF("=== %f %f %f\n", u1, u2, ret);
39 return ret;
40}
41
42// Initialize parameters for log-normal
43// Mean and stdev are for the underlying normal distribution
44// Resulting values will be log-normally distributed: X = exp(mu + sigma * Z)
45static void initialize(double locn, double relscale, int seed) {
46 if (seed) srand(seed);
47 else srand(getpid());
48
49 logn_sigma = sqrt(log(1 + pow(relscale, 2)));
50 logn_mu = log(locn-1) - 0.5 * pow(relscale, 2);
51 PRTF("xxx %f %f\n", logn_mu, logn_sigma);
52}
53
54// Generate a log-normally distributed random integer
55// ln(X) ~ N(mu, sigma^2)
56static int nextLognRand() {
57 double z = rand_std_normal();
58 double x = exp(logn_mu + logn_sigma * z);
59 PRTF("---%f %f\n", z, x);
60 return round(x);
61}
62
// Write one line to stdout: a randomly generated string followed by '\n'.
// The string length is 1 + offset + a draw from nextLognRand().
// Each character is mchar with probability mcfreq, otherwise a random letter in 'a'..'z'.
static void emit1( int offset, double mcfreq, char mchar ) {
    const int length = 1 + offset + nextLognRand();
    PRTF("%d\n", length);
    int remaining = length;
    while (remaining-- > 0) {
        char next;
        if (U() < mcfreq) {
            next = mchar;
        } else {
            next = 'a' + (rand() % ('z'-'a'+1));
        }
        printf("%c", next);
    }
    printf("\n");
}
75
76// usage: ./make-corpus toGen locn [relscale=1.0] [seed=(pid)] [offset=0] [mcfreq=0.0] [mchar='-']
77//
78// Outputs alphabetic (plus magic-char) strings, one per line.
79// toGen: number of strings (lines)
80//
81// generated length ~ offset + lognormal( ... locn ... relscale ... )
82// >= 1
83//
84// offset=0, locn=1: constant length 1
85// offset=0, locn=2: lengths go like current value of $1 cash + $1-bought stock
86// offset=0, locn=6: lengths go like current value of $1 cash + $5-bought stock
87// offset=15, locn=1: constant length 16
88// offset=15, locn=2: population's minimum is 16 and mean is 17
89// i.e. lengths go like current value of $16 cash + $1-bought stock
90//
91// relscale gives the volatility of the stock. It's relative to the mean.
92// relscale=0.5 means the +-1 SD outcomes (68% case) are between locn/1.5 = locn*0.67 and locn*1.5.
93//
94// Magic Char (mc) does not affect these lengths. Any mc occurrence replaces an alphabetic char.
95// mcfreq: (in [0,1]) expected fraction of the characters output that are mchar
96//
// Entry point: parse the positional arguments, set up the distribution, and
// print toGen random strings, one per line.
int main(int argc, char ** argv) {

    int toGen;
    double locn;
    double relscale = 1.0;
    int seed = 0;
    int offset = 0;
    double mcfreq = 0.0;
    char mchar = '-';

    // Arguments are positional and the trailing ones are optional, so the switch
    // enters at the last argument supplied and deliberately falls through every
    // earlier one, ending with the two mandatory arguments in `default`.
    assert(argc >= 3 && argc <= 8);
    switch(argc) {
      case 8:
        assert(strlen(argv[7]) == 1);
        mchar = argv[7][0];
        // fallthrough
      case 7:
        mcfreq = atof(argv[6]);
        assert(mcfreq >= 0.0 && mcfreq <= 1.0);
        // fallthrough
      case 6:
        offset = atoi(argv[5]);
        assert(offset >= 0 && offset < 10000);
        // fallthrough
      case 5:
        seed = atoi(argv[4]);
        assert(seed > 0);
        // fallthrough
      case 4:
        relscale = atof(argv[3]);
        assert(relscale > 0);
        assert(relscale < 10);
        // fallthrough
      default:
        locn = atof(argv[2]);
        // initialize() takes log(locn - 1): locn == 1 is the valid constant-length
        // case (log(0) = -inf), but locn < 1 would yield NaN and an undefined
        // NaN-to-int conversion when the length is computed.
        assert(locn >= 1);
        assert(locn < 1000);
        toGen = atoi(argv[1]);
        assert(toGen > 0);
        assert(toGen < 1000000);
    }

    PRTF("toGen=%d, locn=%f, relscale=%f, seed=%d, offset=%d, mcfreq=%f, mchar='%c'\n", toGen, locn, relscale, seed, offset, mcfreq, mchar);

    initialize(locn, relscale, seed);
    for( int i = 0; i < toGen; i++ ) {
        emit1(offset, mcfreq, mchar);
    }
    return 0;
}
Note: See TracBrowser for help on using the repository browser.