#include #include #include #include #include // U(0,1) static double U() { return (double)rand() / (double)INT_MAX; } // generate random draws from a geometric distribution of the given mean // https://math.stackexchange.com/questions/485448/prove-the-way-to-generate-geometrically-distributed-random-numbers static double denom; static void initialize(int mean) { srand(getpid()); double p = 1.0 / (double) mean; denom = log(1-p); } static int nextGeoRand() { // ret = ⌊ln(U)/ln(1−p)⌋ where U ~ U(0, 1) return 1 + (int) (log(U()) / denom); } // write a randomly generated alphabetic string whose length is adjused from a draw of the above distribution static void emit1( int offset, double mcfreq, char mchar ) { int lim = offset + nextGeoRand(); // printf("==%d\n", lim); for (i; lim) { char emit; if (U() < mcfreq) emit = mchar; else emit = 'a' + (rand() % ('z'-'a')); printf("%c", emit); } printf("\n"); } // usage: ./make-corpus toGen mean [offset=0] [mcfreq=0.0] [mchar='-'] // // Outputs alphabetic (plus magic-char) strings, one per line. // toGen: number of strings (lines) // // generated length ~ offset + geo(mean) // >= 1 // // offset=0, mean=1: constant length 1 // offset=0, mean=2: lengths go like number of coin tosses it takes to get heads // offset=0, mean=6: lengths go like number of cube die rolls it takes to get ::: // offset=15, mean=1: constant length 16 // offset=15, mean=2: population's minimum is 16 and mean is 17 // // Magic Char (mc) does not affect these lengths. Any mc occurrence replaces an alphabetic char. 
// mcfreq: (in [0,1]) expected fraction of the characters output that are mchar
// mchar: the magic character; the argument must be exactly one character long
//
// Arguments are validated with assert, so an out-of-range or missing argument
// aborts the program instead of producing output.
int main(int argc, char ** argv) {
    int toGen;
    int mean;
    int offset = 0;
    double mcfreq = 0.0;
    char mchar = '-';

    assert(argc >= 3 && argc <= 6);

    // Optional arguments are parsed last-to-first: each case deliberately falls
    // through to the next so that every argument that is present gets read.
    switch (argc) {
    case 6:
        // exactly one character, so that argv[5][0] is the magic char itself
        // (and not the terminating NUL of an empty string)
        assert(strlen(argv[5]) == 1);
        mchar = argv[5][0];
        /* fallthrough */
    case 5:
        mcfreq = atof(argv[4]);
        assert(mcfreq >= 0.0 && mcfreq <= 1.0);
        /* fallthrough */
    case 4:
        offset = atoi(argv[3]);
        assert(offset >= 0 && offset < 10000);
        /* fallthrough */
    default:
        mean = atoi(argv[2]);
        assert(mean > 0);
        assert(mean < 1000);
        toGen = atoi(argv[1]);
        assert(toGen > 0);
        assert(toGen < 1000000);
    }

    initialize(mean);
    for (int i = 0; i < toGen; i++) {
        emit1(offset, mcfreq, mchar);
    }
    return 0;
}