1 | #include <stdlib.h>
|
---|
2 | #include <math.h>
|
---|
3 | #include <limits.h>
|
---|
4 | #include <unistd.h>
|
---|
5 | #include <string.h>
|
---|
6 | #include <assert.h>
|
---|
7 | #include <stdio.h>
|
---|
8 |
|
---|
9 | #include <stdlib.h>
|
---|
10 | #include <math.h>
|
---|
11 | #include <limits.h>
|
---|
12 | #include <unistd.h>
|
---|
13 |
|
---|
// Diagnostic switch: 1 when compiled with VERBOSE defined, 0 otherwise.
int printing =
#ifdef VERBOSE
    1
#else
    0
#endif
    ;

// Print a formatted diagnostic to stderr, but only when `printing` is nonzero.
// At least one argument must follow the format string.
// The do { } while (0) wrapper makes the expansion a single statement, so that
//     if (cond) PRTF(...); else ...
// attaches the `else` to the caller's `if` and not to the macro's internal one.
#define PRTF(fmt, ...) \
    do { \
        if (printing) fprintf(stderr, fmt, __VA_ARGS__); \
    } while (0)
|
---|
23 |
|
---|
// Draw from the uniform distribution on the open interval (0,1).
// Shifting the numerator by 1 and widening the denominator by 2 keeps the
// result strictly above 0 and strictly below 1.
static double U() {
    double numerator = (double)rand() + 1;
    double denominator = (double)RAND_MAX + 2;
    return numerator / denominator;
}
|
---|
28 |
|
---|
// Log-normal parameters, set by initialize() and read by nextLognRand().
static double logn_mu;     // location (mean of the underlying normal)
static double logn_sigma;  // scale (standard deviation of the underlying normal)
|
---|
31 |
|
---|
32 | // returns a draw from N(0,1)
|
---|
33 | // based on https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform, section Implementation
|
---|
34 | static double rand_std_normal() {
|
---|
35 | double u1 = U();
|
---|
36 | double u2 = U();
|
---|
37 | double ret = sqrt(-2.0 * log(u1)) * cos(2.0 * M_PI * u2);
|
---|
38 | PRTF("=== %f %f %f\n", u1, u2, ret);
|
---|
39 | return ret;
|
---|
40 | }
|
---|
41 |
|
---|
42 | // Initialize parameters for log-normal
|
---|
43 | // Mean and stdev are for the underlying normal distribution
|
---|
44 | // Resulting values will be log-normally distributed: X = exp(mu + sigma * Z)
|
---|
45 | static void initialize(double locn, double relscale, int seed) {
|
---|
46 | if (seed) srand(seed);
|
---|
47 | else srand(getpid());
|
---|
48 |
|
---|
49 | logn_sigma = sqrt(log(1 + pow(relscale, 2)));
|
---|
50 | logn_mu = log(locn-1) - 0.5 * pow(relscale, 2);
|
---|
51 | PRTF("xxx %f %f\n", logn_mu, logn_sigma);
|
---|
52 | }
|
---|
53 |
|
---|
54 | // Generate a log-normally distributed random integer
|
---|
55 | // ln(X) ~ N(mu, sigma^2)
|
---|
56 | static int nextLognRand() {
|
---|
57 | double z = rand_std_normal();
|
---|
58 | double x = exp(logn_mu + logn_sigma * z);
|
---|
59 | PRTF("---%f %f\n", z, x);
|
---|
60 | return round(x);
|
---|
61 | }
|
---|
62 |
|
---|
// Write one line to stdout: a randomly generated string, then a newline.
// The string length is 1 + offset + one draw from nextLognRand().
// Each character is mchar when a uniform draw falls below mcfreq; otherwise it
// is a lowercase letter picked with rand() modulo the alphabet size.
static void emit1( int offset, double mcfreq, char mchar ) {
    const int alphabet_size = 'z' - 'a' + 1;
    int length = 1 + offset + nextLognRand();
    PRTF("%d\n", length);
    for (int left = length; left > 0; left--) {
        // The uniform draw happens first; rand() is consumed only for letters.
        char ch = (U() < mcfreq) ? mchar : (char)('a' + (rand() % alphabet_size));
        putchar(ch);
    }
    putchar('\n');
}
|
---|
75 |
|
---|
76 | // usage: ./make-corpus toGen locn [relscale=1.0] [seed=(pid)] [offset=0] [mcfreq=0.0] [mchar='-']
|
---|
77 | //
|
---|
78 | // Outputs alphabetic (plus magic-char) strings, one per line.
|
---|
79 | // toGen: number of strings (lines)
|
---|
80 | //
|
---|
81 | // generated length ~ offset + lognormal( ... locn ... relscale ... )
|
---|
82 | // >= 1
|
---|
83 | //
|
---|
84 | // offset=0, locn=1: constant length 1
|
---|
85 | // offset=0, locn=2: lengths go like current value of $1 cash + $1-bought stock
|
---|
86 | // offset=0, locn=6: lengths go like current value of $1 cash + $5-bought stock
|
---|
87 | // offset=15, locn=1: constant length 16
|
---|
88 | // offset=15, locn=2: population's minimum is 16 and mean is 17
|
---|
89 | // i.e. lengths go like current value of $16 cash + $1-bought stock
|
---|
90 | //
|
---|
91 | // relscale gives the volatility of the stock. It's relative to the mean.
|
---|
92 | // relscale=0.5 means the +-1 SD outcomes (68% case) are between locn/1.5 = locn*0.67 and locn*1.5.
|
---|
93 | //
|
---|
94 | // Magic Char (mc) does not affect these lengths. Any mc occurrence replaces an alphabetic char.
|
---|
95 | // mcfreq: (in [0,1]) expected fraction of the characters output that are mchar
|
---|
96 | //
|
---|
// Entry point: parse the command line, seed the generator, and write toGen lines.
// Arguments are positional; every argument after locn is optional and keeps its
// default when absent. Invalid arguments trip an assert.
int main(int argc, char ** argv) {

    int toGen;
    double locn;
    double relscale = 1.0;
    int seed = 0;
    int offset = 0;
    double mcfreq = 0.0;
    char mchar = '-';

    assert(argc >= 3 && argc <= 8);
    // Each case consumes the last argument present, then deliberately falls
    // through to handle the arguments before it.
    switch(argc) {
      case 8:
        assert(strlen(argv[7]) == 1);
        mchar = argv[7][0];
        // fallthrough
      case 7:
        mcfreq = atof(argv[6]);
        assert(mcfreq >= 0.0 && mcfreq <= 1.0);
        // fallthrough
      case 6:
        offset = atoi(argv[5]);
        assert(offset >= 0 && offset < 10000);
        // fallthrough
      case 5:
        seed = atoi(argv[4]);
        assert(seed > 0);
        // fallthrough
      case 4:
        relscale = atof(argv[3]);
        assert(relscale > 0);
        assert(relscale < 10);
        // fallthrough
      default:
        locn = atof(argv[2]);
        // initialize() takes log(locn - 1): locn below 1 would yield NaN, and
        // converting a NaN draw to an int length is undefined behavior.
        // locn == 1 is valid and gives constant-length output.
        assert(locn >= 1);
        assert(locn < 1000);
        toGen = atoi(argv[1]);
        assert(toGen > 0);
        assert(toGen < 1000000);
    }

    PRTF("toGen=%d, locn=%f, relscale=%f, seed=%d, offset=%d, mcfreq=%f, mchar='%c'\n", toGen, locn, relscale, seed, offset, mcfreq, mchar);

    initialize(locn, relscale, seed);
    for( int i = 0; i < toGen; i++ ) {
        emit1(offset, mcfreq, mchar);
    }
    return 0;
}
|
---|