Ignore:
Timestamp:
Jun 4, 2025, 1:43:34 PM (4 months ago)
Author:
Michael Brooks <mlbrooks@…>
Branches:
master
Children:
f858ca5
Parents:
c8bdbaf
Message:

Recent rework of string benchmarks

File:
1 edited

Legend:

Unmodified
Added
Removed
  • doc/theses/mike_brooks_MMath/benchmarks/string/make-corpus.cfa

    rc8bdbaf re0350e0  
    1 #include <stdlib.hfa>
     1#include <stdlib.h>
    22#include <math.h>
    33#include <limits.h>
    44#include <unistd.h>
    55#include <string.h>
     6#include <assert.h>
     7#include <stdio.h>
     8
     9#include <stdlib.h>
     10#include <math.h>
     11#include <limits.h>
     12#include <unistd.h>
     13
     14int printing =
     15#ifdef VERBOSE
     16    1
     17#else
     18    0
     19#endif
     20;
     21
     22#define PRTF(fmt, ...) if (printing) fprintf(stderr, fmt, __VA_ARGS__)
    623
    724// U(0,1)
    825static double U() {
    9     return (double)rand() / (double)INT_MAX;
     26    return  ((double)rand() + 1) / ((double)RAND_MAX + 2); // avoid 0
    1027}
    1128
    12 // generate random draws from a geometric distribution of the given mean
    13 // https://math.stackexchange.com/questions/485448/prove-the-way-to-generate-geometrically-distributed-random-numbers
    14 static double denom;
    15 static void initialize(int mean) {
    16     srand(getpid());
    17     double p = 1.0 / (double) mean;
    18     denom = log(1-p);
     29// parameters for distribution
     30static double logn_mu, logn_sigma;
     31
     32// returns a draw from N(0,1)
     33// based on https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform, section Implementation
     34static double rand_std_normal() {
     35    double u1 = U();
     36    double u2 = U();
     37    double ret = sqrt(-2.0 * log(u1)) * cos(2.0 * M_PI * u2);
     38    PRTF("=== %f %f %f\n", u1, u2, ret);
     39    return ret;
    1940}
    20 static int nextGeoRand() {
    21     // ret = ⌊ln(U)/ln(1−p)⌋ where U ~ U(0, 1)
    22     return 1 + (int) (log(U()) / denom);
     41
     42// Initialize parameters for log-normal
     43// locn and relscale describe the generated values (relscale is relative to the mean); mu and sigma of the underlying normal distribution are derived from them
     44// Resulting values will be log-normally distributed: X = exp(mu + sigma * Z)
     45static void initialize(double locn, double relscale, int seed) {
     46    if (seed) srand(seed);
     47    else srand(getpid());
     48
     49    logn_sigma = sqrt(log(1 + pow(relscale, 2)));
     50    logn_mu = log(locn-1) - 0.5 * pow(relscale, 2);
     51    PRTF("xxx %f %f\n", logn_mu, logn_sigma);
     52}
     53
     54// Generate a log-normally distributed random integer
     55// ln(X) ~ N(mu, sigma^2)
     56static int nextLognRand() {
     57    double z = rand_std_normal();
     58    double x = exp(logn_mu + logn_sigma * z);
     59    PRTF("---%f %f\n", z, x);
     60    return round(x);
    2361}
    2462
    2563// write a randomly generated alphabetic string whose length is adjusted from a draw of the above distribution
    2664static void emit1( int offset, double mcfreq, char mchar ) {
    27     int lim = offset + nextGeoRand();
    28     // printf("==%d\n", lim);
    29     for (i; lim) {
     65    int lim = 1 + offset + nextLognRand();
     66    PRTF("%d\n", lim);
     67    for (int i = 0; i < lim; i++) {
    3068        char emit;
    3169        if (U() < mcfreq) emit = mchar;
    32         else emit = 'a' + (rand() % ('z'-'a'));
     70        else emit = 'a' + (rand() % ('z'-'a'+1));
    3371        printf("%c", emit);
    3472    }
     
    3674}
    3775
    38 // usage: ./make-corpus toGen mean [offset=0] [mcfreq=0.0] [mchar='-']
     76// usage: ./make-corpus toGen locn [relscale=1.0] [seed=(pid)] [offset=0] [mcfreq=0.0] [mchar='-']
    3977//
    4078// Outputs alphabetic (plus magic-char) strings, one per line.
    4179// toGen: number of strings (lines)
    4280//
    43 // generated length ~  offset + geo(mean)
     81// generated length ~  offset + lognormal( ... locn ... relscale ... )
    4482//                  >= 1
    4583//
    46 // offset=0,  mean=1:  constant length 1
    47 // offset=0,  mean=2:  lengths go like number of coin tosses it takes to get heads
    48 // offset=0,  mean=6:  lengths go like number of cube die rolls it takes to get :::
    49 // offset=15, mean=1:  constant length 16
    50 // offset=15, mean=2:  population's minimum is 16 and mean is 17
     84// offset=0,  locn=1:  constant length 1
     85// offset=0,  locn=2:  lengths go like current value of $1 cash + $1-bought stock
     86// offset=0,  locn=6:  lengths go like current value of $1 cash + $5-bought stock
     87// offset=15, locn=1:  constant length 16
     88// offset=15, locn=2:  population's minimum is 16 and mean is 17
     89//                     i.e. lengths go like current value of $16 cash + $1-bought stock
     90//
     91// relscale gives the volatility of the stock.  It's relative to the mean.
     92// relscale=0.5 means the +-1 SD outcomes (68% case) are between locn/1.5 = locn*0.67 and locn*1.5.
    5193//
    5294// Magic Char (mc) does not affect these lengths.  Any mc occurrence replaces an alphabetic char.
     
    5698
    5799    int toGen;
    58     int mean;
     100    double locn;
     101    double relscale = 1.0;
     102    int seed = 0;
    59103    int offset = 0;
    60104    double mcfreq = 0.0;
    61105    char mchar = '-';
    62106
    63     assert(argc >= 3 && argc <= 6);
     107    assert(argc >= 3 && argc <= 8);
    64108    switch(argc) {
     109        case 8:
     110            assert(strlen(argv[7]) == 1);
     111            mchar = argv[7][0];
     112        case 7:
     113            mcfreq = atof(argv[6]);
     114            assert(mcfreq >= 0.0 && mcfreq <= 1.0);
    65115        case 6:
    66             assert(strlen(argv[5]) == 0);
    67             mchar = argv[5][0];
     116            offset = atoi(argv[5]);
     117            assert(offset >= 0 && offset < 10000);
    68118        case 5:
    69             mcfreq = atof(argv[4]);
    70             assert(mcfreq >= 0.0 && mcfreq <= 1.0);
     119            seed = atoi(argv[4]);
     120            assert(seed > 0);
    71121        case 4:
    72             offset = atoi(argv[3]);
    73             assert(offset >= 0 && offset < 10000);
     122            relscale = atof(argv[3]);
     123            assert(relscale > 0);
     124            assert(relscale < 10);
    74125        default:
    75             mean = atoi(argv[2]);
    76             assert(mean > 0);
    77             assert(mean < 1000);
     126            locn = atof(argv[2]);
     127            assert(locn > 0);
     128            assert(locn < 1000);
    78129            toGen = atoi(argv[1]);
    79130            assert(toGen > 0);
     
    81132    }
    82133
    83     initialize(mean);
    84     for( i; toGen ) {
     134    PRTF("toGen=%d, locn=%f, relscale=%f, seed=%d, offset=%d, mcfreq=%f, mchar='%c'\n", toGen, locn, relscale, seed, offset, mcfreq, mchar);
     135
     136    initialize(locn, relscale, seed);
     137    for( int i = 0; i < toGen; i++ ) {
    85138        emit1(offset, mcfreq, mchar);
    86139    }
Note: See TracChangeset for help on using the changeset viewer.