1 | #include <stdlib.hfa>
|
---|
2 | #include <math.h>
|
---|
3 | #include <limits.h>
|
---|
4 | #include <unistd.h>
|
---|
5 | #include <string.h>
|
---|
6 |
|
---|
// Draw a pseudo-random double from the OPEN interval (0,1).
//
// rand() yields an integer in [0, RAND_MAX].  Adding 1 to the numerator and 2 to
// the range keeps the result strictly above 0 and strictly below 1, so callers
// may take log() of it and may rely on "U() < 1.0" always holding.
static double U() {
	return ((double)rand() + 1.0) / ((double)RAND_MAX + 2.0);
}
|
---|
11 |
|
---|
12 | // generate random draws from a geometric distribution of the given mean
|
---|
13 | // https://math.stackexchange.com/questions/485448/prove-the-way-to-generate-geometrically-distributed-random-numbers
|
---|
// Cached ln(1 - p), where p = 1/mean is the per-trial success probability.
// Written by initialize(); used as the divisor when drawing geometric variates.
static double denom;

// Prepare the generator: seed rand() from the process id and cache ln(1 - 1/mean).
//   mean: desired mean of the geometric distribution (the caller passes a positive value).
// Note: for mean == 1 the argument of log() is 0, so denom is set to log(0).
static void initialize(int mean) {
	const double successProb = 1.0 / (double) mean;
	denom = log(1 - successProb);
	srand(getpid());
}
|
---|
// Draw one value from the geometric distribution configured by initialize().
//
// ret = 1 + ⌊ln(u)/ln(1−p)⌋ where u is a uniform draw with 0 < u; the result is >= 1.
// denom holds ln(1−p) and must have been set by initialize() beforehand.
static int nextGeoRand() {
	double u;
	// Reject a draw of exactly 0: log(0) is -infinity, and the quotient would
	// then be +infinity (or NaN when denom is itself -infinity), which cannot
	// be converted to int.
	do {
		u = U();
	} while (u <= 0.0);
	return 1 + (int) (log(u) / denom);
}
|
---|
24 |
|
---|
// Write one randomly generated alphabetic string, followed by a newline, to stdout.
// The string's length is offset plus a draw from the geometric distribution above.
//   offset: constant added to the geometric draw
//   mcfreq: probability that any given character is replaced by mchar
//   mchar:  the "magic" character substituted with probability mcfreq
static void emit1( int offset, double mcfreq, char mchar ) {
	int lim = offset + nextGeoRand();
	for (i; lim) {
		char emit;
		if (U() < mcfreq) emit = mchar;
		// 'z' - 'a' + 1 == 26 choices, so every letter 'a'..'z' (including 'z') can occur
		else emit = 'a' + (rand() % ('z' - 'a' + 1));
		printf("%c", emit);
	}
	printf("\n");
}
|
---|
37 |
|
---|
38 | // usage: ./make-corpus toGen mean [offset=0] [mcfreq=0.0] [mchar='-']
|
---|
39 | //
|
---|
40 | // Outputs alphabetic (plus magic-char) strings, one per line.
|
---|
41 | // toGen: number of strings (lines)
|
---|
42 | //
|
---|
43 | // generated length ~ offset + geo(mean)
|
---|
44 | // >= 1
|
---|
45 | //
|
---|
46 | // offset=0, mean=1: constant length 1
|
---|
47 | // offset=0, mean=2: lengths go like number of coin tosses it takes to get heads
|
---|
48 | // offset=0, mean=6: lengths go like number of cube die rolls it takes to get :::
|
---|
49 | // offset=15, mean=1: constant length 16
|
---|
50 | // offset=15, mean=2: population's minimum is 16 and mean is 17
|
---|
51 | //
|
---|
52 | // Magic Char (mc) does not affect these lengths. Any mc occurrence replaces an alphabetic char.
|
---|
53 | // mcfreq: (in [0,1]) expected fraction of the characters output that are mchar
|
---|
54 | //
|
---|
// Entry point: validate the command-line arguments, initialize the generator,
// and print toGen random strings, one per line.
//   argv[1] toGen   number of strings to generate, in (0, 1000000)
//   argv[2] mean    mean of the geometric length distribution, in (0, 1000)
//   argv[3] offset  optional constant added to every length, in [0, 10000)
//   argv[4] mcfreq  optional magic-char frequency, in [0, 1]
//   argv[5] mchar   optional magic character; must be exactly one character
// Invalid arguments trip an assert.
int main(int argc, char ** argv) {

	int toGen;
	int mean;
	int offset = 0;
	double mcfreq = 0.0;
	char mchar = '-';

	assert(argc >= 3 && argc <= 6);
	// Optional arguments are processed from last to first.  Every case falls
	// through on purpose, so supplying a later argument also parses all the
	// earlier ones.
	switch(argc) {
	  case 6:
		// mchar must be a single character; anything else is a usage error
		assert(strlen(argv[5]) == 1);
		mchar = argv[5][0];
		// fall through
	  case 5:
		mcfreq = atof(argv[4]);
		assert(mcfreq >= 0.0 && mcfreq <= 1.0);
		// fall through
	  case 4:
		offset = atoi(argv[3]);
		assert(offset >= 0 && offset < 10000);
		// fall through
	  default:
		mean = atoi(argv[2]);
		assert(mean > 0);
		assert(mean < 1000);
		toGen = atoi(argv[1]);
		assert(toGen > 0);
		assert(toGen < 1000000);
	}

	initialize(mean);
	for( i; toGen ) {
		emit1(offset, mcfreq, mchar);
	}
}
|
---|