| [01db301] | 1 | #include <stdlib.hfa> | 
|---|
|  | 2 | #include <math.h> | 
|---|
|  | 3 | #include <limits.h> | 
|---|
|  | 4 | #include <unistd.h> | 
|---|
| [08ed947] | 5 | #include <string.h> | 
|---|
|  | 6 |  | 
|---|
// U(0,1): uniform draw from the OPEN interval (0,1).
//
// rand() returns an integer in [0, RAND_MAX].  Shifting the numerator up by one
// and widening the denominator by two keeps both endpoints unreachable:
//   smallest value = 1 / (RAND_MAX + 2)               > 0
//   largest value  = (RAND_MAX + 1) / (RAND_MAX + 2)  < 1
// Excluding 0 matters because callers take log() of the result.
// RAND_MAX (not INT_MAX) is the documented upper bound of rand().
static double U() {
	return ((double)rand() + 1.0) / ((double)RAND_MAX + 2.0);
}
|---|
| [01db301] | 11 |  | 
|---|
// Generate random draws from a geometric distribution of the given mean.
// https://math.stackexchange.com/questions/485448/prove-the-way-to-generate-geometrically-distributed-random-numbers
//
// denom caches ln(1-p), where p = 1/mean is the per-trial success probability.
static double denom;

// Prepare the sampler: compute denom for the requested mean and seed rand()
// from the process id.
//   mean: desired mean of the distribution; mean == 1 makes p == 1, so denom is log(0).
static void initialize(int mean) {
	const double successProb = 1.0 / (double) mean;
	denom = log(1 - successProb);
	srand(getpid());
}
|---|
// Draw one value using the cached denom:
//   ret = 1 + ⌊ln(U)/ln(1−p)⌋   where U ~ U(0, 1)
// ln(U) is never positive and denom is negative, so the ratio is non-negative
// and the integer cast acts as a floor.  The smallest possible result is 1.
// The ratio is only finite while U() stays strictly above 0.
static int nextGeoRand() {
	double ratio = log(U()) / denom;
	return (int) ratio + 1;
}
|---|
|  | 24 |  | 
|---|
// Write one line to stdout: a randomly generated alphabetic string whose length
// is adjusted from a draw of nextGeoRand().
//   offset: constant added to the drawn length
//   mcfreq: probability that any given position holds mchar instead of a letter
//   mchar:  the "magic" character substituted with probability mcfreq
static void emit1( int offset, double mcfreq, char mchar ) {
	int lim = offset + nextGeoRand();
	for ( i; lim ) {
		char emit;
		if ( U() < mcfreq ) {
			emit = mchar;
		} else {
			// 'z' - 'a' + 1 == 26: cover the whole alphabet, 'z' included
			emit = 'a' + (rand() % ('z' - 'a' + 1));
		}
		printf("%c", emit);
	}
	printf("\n");
}
|---|
|  | 37 |  | 
|---|
// usage: ./make-corpus toGen mean [offset=0] [mcfreq=0.0] [mchar='-']
//
// Outputs alphabetic (plus magic-char) strings, one per line.
// toGen: number of strings (lines)
//
// generated length ~  offset + geo(mean)
//                  >= 1
//
// offset=0,  mean=1:  constant length 1
// offset=0,  mean=2:  lengths go like number of coin tosses it takes to get heads
// offset=0,  mean=6:  lengths go like number of cube die rolls it takes to get :::
// offset=15, mean=1:  constant length 16
// offset=15, mean=2:  population's minimum is 16 and mean is 17
//
// Magic Char (mc) does not affect these lengths.  Any mc occurrence replaces an alphabetic char.
// mcfreq: (in [0,1]) expected fraction of the characters output that are mchar
// mchar:  exactly one character
//
int main(int argc, char ** argv) {

	int toGen;
	int mean;
	int offset = 0;
	double mcfreq = 0.0;
	char mchar = '-';

	assert(argc >= 3 && argc <= 6);
	// Arguments are handled last-to-first.  Every case deliberately falls through,
	// so supplying optional argument k also processes all arguments before it.
	switch(argc) {
	  case 6:
		// the magic character must be a single character, not an empty string
		assert(strlen(argv[5]) == 1);
		mchar = argv[5][0];
		// fall through
	  case 5:
		mcfreq = atof(argv[4]);
		assert(mcfreq >= 0.0 && mcfreq <= 1.0);
		// fall through
	  case 4:
		offset = atoi(argv[3]);
		assert(offset >= 0 && offset < 10000);
		// fall through
	  default:
		mean = atoi(argv[2]);
		assert(mean > 0);
		assert(mean < 1000);
		toGen = atoi(argv[1]);
		assert(toGen > 0);
		assert(toGen < 1000000);
	}

	initialize(mean);
	for ( i; toGen ) {
		emit1(offset, mcfreq, mchar);
	}
}
|---|