- Timestamp:
- Jun 4, 2025, 1:43:34 PM (4 months ago)
- Branches:
- master
- Children:
- f858ca5
- Parents:
- c8bdbaf
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
doc/theses/mike_brooks_MMath/benchmarks/string/make-corpus.cfa
rc8bdbaf re0350e0 1 #include <stdlib.h fa>1 #include <stdlib.h> 2 2 #include <math.h> 3 3 #include <limits.h> 4 4 #include <unistd.h> 5 5 #include <string.h> 6 #include <assert.h> 7 #include <stdio.h> 8 9 #include <stdlib.h> 10 #include <math.h> 11 #include <limits.h> 12 #include <unistd.h> 13 14 int printing = 15 #ifdef VERBOSE 16 1 17 #else 18 0 19 #endif 20 ; 21 22 #define PRTF(fmt, ...) if (printing) fprintf(stderr, fmt, __VA_ARGS__) 6 23 7 24 // U(0,1) 8 25 static double U() { 9 return (double)rand() / (double)INT_MAX;26 return ((double)rand() + 1) / ((double)RAND_MAX + 2); // avoid 0 10 27 } 11 28 12 // generate random draws from a geometric distribution of the given mean 13 // https://math.stackexchange.com/questions/485448/prove-the-way-to-generate-geometrically-distributed-random-numbers 14 static double denom; 15 static void initialize(int mean) { 16 srand(getpid()); 17 double p = 1.0 / (double) mean; 18 denom = log(1-p); 29 // parameters for distribution 30 static double logn_mu, logn_sigma; 31 32 // returns a draw from N(0,1) 33 // based on https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform, section Implementation 34 static double rand_std_normal() { 35 double u1 = U(); 36 double u2 = U(); 37 double ret = sqrt(-2.0 * log(u1)) * cos(2.0 * M_PI * u2); 38 PRTF("=== %f %f %f\n", u1, u2, ret); 39 return ret; 19 40 } 20 static int nextGeoRand() { 21 // ret = ⌊ln(U)/ln(1−p)⌋ where U ~ U(0, 1) 22 return 1 + (int) (log(U()) / denom); 41 42 // Initialize parameters for log-normal 43 // Mean and stdev are for the underlying normal distribution 44 // Resulting values will be log-normally distributed: X = exp(mu + sigma * Z) 45 static void initialize(double locn, double relscale, int seed) { 46 if (seed) srand(seed); 47 else srand(getpid()); 48 49 logn_sigma = sqrt(log(1 + pow(relscale, 2))); 50 logn_mu = log(locn-1) - 0.5 * pow(relscale, 2); 51 PRTF("xxx %f %f\n", logn_mu, logn_sigma); 52 } 53 54 // Generate a log-normally distributed random integer 55 // ln(X) ~ N(mu, sigma^2) 56 static int nextLognRand() { 57 double z = rand_std_normal(); 58 double x = exp(logn_mu + logn_sigma * z); 59 PRTF("---%f %f\n", z, x); 60 return round(x); 23 61 } 24 62 25 63 // write a randomly generated alphabetic string whose length is adjused from a draw of the above distribution 26 64 static void emit1( int offset, double mcfreq, char mchar ) { 27 int lim = offset + nextGeoRand();28 // printf("==%d\n", lim);29 for (i ; lim) {65 int lim = 1 + offset + nextLognRand(); 66 PRTF("%d\n", lim); 67 for (int i = 0; i < lim; i++) { 30 68 char emit; 31 69 if (U() < mcfreq) emit = mchar; 32 else emit = 'a' + (rand() % ('z'-'a' ));70 else emit = 'a' + (rand() % ('z'-'a'+1)); 33 71 printf("%c", emit); 34 72 } … … 36 74 } 37 75 38 // usage: ./make-corpus toGen mean[offset=0] [mcfreq=0.0] [mchar='-']76 // usage: ./make-corpus toGen locn [relscale=1.0] [seed=(pid)] [offset=0] [mcfreq=0.0] [mchar='-'] 39 77 // 40 78 // Outputs alphabetic (plus magic-char) strings, one per line. 41 79 // toGen: number of strings (lines) 42 80 // 43 // generated length ~ offset + geo(mean)81 // generated length ~ offset + lognormal( ... locn ... relscale ... ) 44 82 // >= 1 45 83 // 46 // offset=0, mean=1: constant length 1 47 // offset=0, mean=2: lengths go like number of coin tosses it takes to get heads 48 // offset=0, mean=6: lengths go like number of cube die rolls it takes to get ::: 49 // offset=15, mean=1: constant length 16 50 // offset=15, mean=2: population's minimum is 16 and mean is 17 84 // offset=0, locn=1: constant length 1 85 // offset=0, locn=2: lengths go like current value of $1 cash + $1-bought stock 86 // offset=0, locn=6: lengths go like current value of $1 cash + $5-bought stock 87 // offset=15, locn=1: constant length 16 88 // offset=15, locn=2: population's minimum is 16 and mean is 17 89 // i.e. lengths go like current value of $16 cash + $1-bought stock 90 // 91 // relscale gives the volatility of the stock. It's relative to the mean. 92 // relscale=0.5 means the +-1 SD outcomes (68% case) are between locn/1.5 = locn*0.67 and locn*1.5. 51 93 // 52 94 // Magic Char (mc) does not affect these lengths. Any mc occurrence replaces an alphabetic char. … … 56 98 57 99 int toGen; 58 int mean; 100 double locn; 101 double relscale = 1.0; 102 int seed = 0; 59 103 int offset = 0; 60 104 double mcfreq = 0.0; 61 105 char mchar = '-'; 62 106 63 assert(argc >= 3 && argc <= 6);107 assert(argc >= 3 && argc <= 8); 64 108 switch(argc) { 109 case 8: 110 assert(strlen(argv[7]) == 1); 111 mchar = argv[7][0]; 112 case 7: 113 mcfreq = atof(argv[6]); 114 assert(mcfreq >= 0.0 && mcfreq <= 1.0); 65 115 case 6: 66 assert(strlen(argv[5]) == 0);67 mchar = argv[5][0];116 offset = atoi(argv[5]); 117 assert(offset >= 0 && offset < 10000); 68 118 case 5: 69 mcfreq = atof(argv[4]);70 assert( mcfreq >= 0.0 && mcfreq <= 1.0);119 seed = atoi(argv[4]); 120 assert(seed > 0); 71 121 case 4: 72 offset = atoi(argv[3]); 73 assert(offset >= 0 && offset < 10000); 122 relscale = atof(argv[3]); 123 assert(relscale > 0); 124 assert(relscale < 10); 74 125 default: 75 mean = atoi(argv[2]);76 assert( mean > 0);77 assert( mean < 1000);126 locn = atof(argv[2]); 127 assert(locn > 0); 128 assert(locn < 1000); 78 129 toGen = atoi(argv[1]); 79 130 assert(toGen > 0); … … 81 132 } 82 133 83 initialize(mean); 84 for( i; toGen ) { 134 PRTF("toGen=%d, locn=%f, relscale=%f, seed=%d, offset=%d, mcfreq=%f, mchar='%c'\n", toGen, locn, relscale, seed, offset, mcfreq, mchar); 135 136 initialize(locn, relscale, seed); 137 for( int i = 0; i < toGen; i++ ) { 85 138 emit1(offset, mcfreq, mchar); 86 139 }
Note:
See TracChangeset
for help on using the changeset viewer.