| [01db301] | 1 | #include <stdlib.hfa>
 | 
|---|
 | 2 | #include <math.h>
 | 
|---|
 | 3 | #include <limits.h>
 | 
|---|
 | 4 | #include <unistd.h>
 | 
|---|
| [08ed947] | 5 | #include <string.h>
 | 
|---|
 | 6 | 
 | 
|---|
 | 7 | // U(0,1)
 | 
|---|
 | 8 | static double U() {
 | 
|---|
 | 9 |     return (double)rand() / (double)INT_MAX;
 | 
|---|
 | 10 | }
 | 
|---|
| [01db301] | 11 | 
 | 
|---|
 | 12 | // generate random draws from a geometric distribution of the given mean
 | 
|---|
 | 13 | // https://math.stackexchange.com/questions/485448/prove-the-way-to-generate-geometrically-distributed-random-numbers
 | 
|---|
 | 14 | static double denom;
 | 
|---|
 | 15 | static void initialize(int mean) {
 | 
|---|
 | 16 |     srand(getpid());
 | 
|---|
 | 17 |     double p = 1.0 / (double) mean;
 | 
|---|
 | 18 |     denom = log(1-p);
 | 
|---|
 | 19 | }
 | 
|---|
 | 20 | static int nextGeoRand() {
 | 
|---|
 | 21 |     // ret = ⌊ln(U)/ln(1−p)⌋ where U ~ U(0, 1)
 | 
|---|
| [08ed947] | 22 |     return 1 + (int) (log(U()) / denom);
 | 
|---|
| [01db301] | 23 | }
 | 
|---|
 | 24 | 
 | 
|---|
// Write one line to stdout: a random string followed by a newline.
//
// The string has offset + nextGeoRand() characters.  Each character is mchar
// when a fresh U() draw is below mcfreq; otherwise it is a lowercase letter
// picked from 'a'..'z' using rand().
//
// offset: fixed amount added to every drawn length
// mcfreq: threshold compared against U() to decide whether to emit mchar
// mchar:  the character emitted in place of a letter
static void emit1( int offset, double mcfreq, char mchar ) {
    int lim = offset + nextGeoRand();
    for (int i = 0; i < lim; i++) {
        char emit;
        if (U() < mcfreq) emit = mchar;
        // The modulus is the alphabet size (26); without the +1, 'z' could never be produced.
        else emit = 'a' + (rand() % ('z' - 'a' + 1));
        printf("%c", emit);
    }
    printf("\n");
}
 | 
|---|
 | 37 | 
 | 
|---|
| [08ed947] | 38 | // usage: ./make-corpus toGen mean [offset=0] [mcfreq=0.0] [mchar='-']
 | 
|---|
 | 39 | //
 | 
|---|
 | 40 | // Outputs alphabetic (plus magic-char) strings, one per line.
 | 
|---|
 | 41 | // toGen: number of strings (lines)
 | 
|---|
 | 42 | // 
 | 
|---|
 | 43 | // generated length ~  offset + geo(mean)
 | 
|---|
 | 44 | //                  >= 1
 | 
|---|
 | 45 | //
 | 
|---|
 | 46 | // offset=0,  mean=1:  constant length 1
 | 
|---|
 | 47 | // offset=0,  mean=2:  lengths go like number of coin tosses it takes to get heads
 | 
|---|
 | 48 | // offset=0,  mean=6:  lengths go like number of cube die rolls it takes to get :::
 | 
|---|
 | 49 | // offset=15, mean=1:  constant length 16
 | 
|---|
 | 50 | // offset=15, mean=2:  population's minimum is 16 and mean is 17
 | 
|---|
 | 51 | //
 | 
|---|
 | 52 | // Magic Char (mc) does not affect these lengths.  Any mc occurrence replaces an alphabetic char.
 | 
|---|
 | 53 | // mcfreq: (in [0,1]) expected fraction of the characters output that are mchar
 | 
|---|
 | 54 | //
 | 
|---|
// Entry point: parse the command line, seed the generator, print toGen lines.
//
// argv[1] toGen   number of lines to emit; asserted 0 < toGen < 1000000
// argv[2] mean    passed to initialize(); asserted 0 < mean < 1000
// argv[3] offset  optional, default 0; asserted 0 <= offset < 10000
// argv[4] mcfreq  optional, default 0.0; asserted within [0.0, 1.0]
// argv[5] mchar   optional, default '-'; must be exactly one character
int main(int argc, char ** argv) {

    int toGen;
    int mean;
    int offset = 0;
    double mcfreq = 0.0;
    char mchar = '-';

    assert(argc >= 3 && argc <= 6);
    // Cases fall through on purpose: supplying an optional argument implies
    // that every argument before it was supplied as well.
    switch(argc) {
        case 6:
            // mchar must be a single character; an empty string would yield '\0'.
            assert(strlen(argv[5]) == 1);
            mchar = argv[5][0];
            // fallthrough
        case 5:
            mcfreq = atof(argv[4]);
            assert(mcfreq >= 0.0 && mcfreq <= 1.0);
            // fallthrough
        case 4:
            offset = atoi(argv[3]);
            assert(offset >= 0 && offset < 10000);
            // fallthrough
        default:
            mean = atoi(argv[2]);
            assert(mean > 0);
            assert(mean < 1000);
            toGen = atoi(argv[1]);
            assert(toGen > 0);
            assert(toGen < 1000000);
    }

    initialize(mean);
    for (int i = 0; i < toGen; i++) {
        emit1(offset, mcfreq, mchar);
    }
    return 0;
}
 | 
|---|