// // Cforall Version 1.0.0 Copyright (C) 2022 University of Waterloo // // The contents of this file are covered under the licence agreement in the // file "LICENCE" distributed with Cforall. // // random.hfa -- // // Author : Peter A. Buhr // Created On : Fri Jan 14 07:18:11 2022 // Last Modified By : Peter A. Buhr // Last Modified On : Wed Dec 14 20:32:53 2022 // Update Count : 177 // #pragma once #include #define GLUE2( x, y ) x##y #define GLUE( x, y ) GLUE2( x, y ) // Set default PRNG for architecture size. #ifdef __x86_64__ // 64-bit architecture // 64-bit generators //#define LEHMER64 //#define XORSHIFT_12_25_27 #define XOSHIRO256PP //#define KISS_64 // 32-bit generators //#define XORSHIFT_6_21_7 #define XOSHIRO128PP #else // 32-bit architecture // 64-bit generators //#define XORSHIFT_13_7_17 #define XOSHIRO256PP // 32-bit generators //#define XORSHIFT_6_21_7 #define XOSHIRO128PP #endif // __x86_64__ // Define C/CFA PRNG name and random-state. // SKULLDUGGERY: typedefs name struct and typedef with the same name to deal with CFA typedef numbering problem. #ifdef XOSHIRO256PP #define PRNG_NAME_64 xoshiro256pp #define PRNG_STATE_64_T GLUE(PRNG_NAME_64,_t) typedef struct PRNG_STATE_64_T { uint64_t s0, s1, s2, s3; } PRNG_STATE_64_T; #endif // XOSHIRO256PP #ifdef XOSHIRO128PP #define PRNG_NAME_32 xoshiro128pp #define PRNG_STATE_32_T GLUE(PRNG_NAME_32,_t) typedef struct PRNG_STATE_32_T { uint32_t s0, s1, s2, s3; } PRNG_STATE_32_T; #endif // XOSHIRO128PP #ifdef LEHMER64 #define PRNG_NAME_64 lehmer64 #define PRNG_STATE_64_T __uint128_t #endif // LEHMER64 #ifdef WYHASH64 #define PRNG_NAME_64 wyhash64 #define PRNG_STATE_64_T uint64_t #endif // LEHMER64 #ifdef XORSHIFT_13_7_17 #define PRNG_NAME_64 xorshift_13_7_17 #define PRNG_STATE_64_T uint64_t #endif // XORSHIFT_13_7_17 #ifdef XORSHIFT_6_21_7 #define PRNG_NAME_32 xorshift_6_21_7 #define PRNG_STATE_32_T uint32_t #endif // XORSHIFT_6_21_7 #ifdef XORSHIFT_12_25_27 #define PRNG_NAME_64 xorshift_12_25_27 #define PRNG_STATE_64_T uint64_t #endif // XORSHIFT_12_25_27 #ifdef KISS_64 #define PRNG_NAME_64 kiss_64 #define PRNG_STATE_64_T GLUE(PRNG_NAME_64,_t) typedef struct PRNG_STATE_64_T { uint64_t z, w, jsr, jcong; } PRNG_STATE_64_T; #endif // KISS_^64 #ifdef XORWOW #define PRNG_NAME_32 xorwow #define PRNG_STATE_32_T GLUE(PRNG_NAME_32,_t) typedef struct PRNG_STATE_32_T { uint32_t a, b, c, d, counter; } PRNG_STATE_32_T; #endif // XOSHIRO128PP #define PRNG_SET_SEED_64 GLUE(PRNG_NAME_64,_set_seed) #define PRNG_SET_SEED_32 GLUE(PRNG_NAME_32,_set_seed) // Default PRNG used by runtime. #ifdef __x86_64__ // 64-bit architecture #define PRNG_NAME PRNG_NAME_64 #define PRNG_STATE_T PRNG_STATE_64_T #else // 32-bit architecture #define PRNG_NAME PRNG_NAME_32 #define PRNG_STATE_T PRNG_STATE_32_T #endif // __x86_64__ #define PRNG_SET_SEED GLUE(PRNG_NAME,_set_seed) // ALL PRNG ALGORITHMS ARE OPTIMIZED SO THAT THE PRNG LOGIC CAN HAPPEN IN PARALLEL WITH THE USE OF THE RESULT. // Specifically, the current random state is copied for returning, before computing the next value. As a consequence, // the set_seed routine primes the PRNG by calling it with the state so the seed is not return as the first random // value. #ifdef __cforall // don't include in C code (invoke.h) // https://prng.di.unimi.it/xoshiro256starstar.c // // This is xoshiro256++ 1.0, one of our all-purpose, rock-solid generators. It has excellent (sub-ns) speed, a state // (256 bits) that is large enough for any parallel application, and it passes all tests we are aware of. // // For generating just floating-point numbers, xoshiro256+ is even faster. // // The state must be seeded so that it is not everywhere zero. If you have a 64-bit seed, we suggest to seed a // splitmix64 generator and use its output to fill s. #ifndef XOSHIRO256PP typedef struct xoshiro256pp_t { uint64_t s0, s1, s2, s3; } xoshiro256pp_t; #endif // ! XOSHIRO256PP static inline uint64_t xoshiro256pp( xoshiro256pp_t & rs ) with(rs) { inline uint64_t rotl( const uint64_t x, int k ) { return (x << k) | (x >> (64 - k)); } // rotl const uint64_t result = rotl( s0 + s3, 23 ) + s0; const uint64_t t = s1 << 17; s2 ^= s0; s3 ^= s1; s1 ^= s2; s0 ^= s3; s2 ^= t; s3 = rotl( s3, 45 ); return result; } // xoshiro256pp static inline void xoshiro256pp_set_seed( xoshiro256pp_t & state, uint64_t seed ) { state = (xoshiro256pp_t){ seed, seed, seed, seed }; xoshiro256pp( state ); } // xoshiro256pp_set_seed // https://prng.di.unimi.it/xoshiro128plusplus.c // // This is xoshiro128++ 1.0, one of our 32-bit all-purpose, rock-solid generators. It has excellent speed, a state size // (128 bits) that is large enough for mild parallelism, and it passes all tests we are aware of. // // For generating just single-precision (i.e., 32-bit) floating-point numbers, xoshiro128+ is even faster. // // The state must be seeded so that it is not everywhere zero. #ifndef XOSHIRO128PP typedef struct xoshiro128pp_t { uint32_t s0, s1, s2, s3; } xoshiro128pp_t; #endif // ! XOSHIRO128PP static inline uint32_t xoshiro128pp( xoshiro128pp_t & rs ) with(rs) { inline uint32_t rotl( const uint32_t x, int k ) { return (x << k) | (x >> (32 - k)); } // rotl const uint32_t result = rotl( s0 + s3, 7 ) + s0; const uint32_t t = s1 << 9; s2 ^= s0; s3 ^= s1; s1 ^= s2; s0 ^= s3; s2 ^= t; s3 = rotl( s3, 11 ); return result; } // xoshiro128pp static inline void xoshiro128pp_set_seed( xoshiro128pp_t & state, uint32_t seed ) { state = (xoshiro128pp_t){ seed, seed, seed, seed }; xoshiro128pp( state ); // prime } // xoshiro128pp_set_seed #ifdef __SIZEOF_INT128__ //-------------------------------------------------- static inline uint64_t lehmer64( __uint128_t & state ) { __uint128_t ret = state; state *= 0xda942042e4dd58b5; return ret >> 64; } // lehmer64 static inline void lehmer64_set_seed( __uint128_t & state, uint64_t seed ) { // The seed needs to be coprime with the 2^64 modulus to get the argest period, so no factors of 2 in the seed. state = seed; lehmer64( state ); // prime } // lehmer64_set_seed //-------------------------------------------------- static inline uint64_t wyhash64( uint64_t & state ) { uint64_t ret = state; state += 0x_60be_e2be_e120_fc15; __uint128_t tmp; tmp = (__uint128_t) ret * 0x_a3b1_9535_4a39_b70d; uint64_t m1 = (tmp >> 64) ^ tmp; tmp = (__uint128_t)m1 * 0x_1b03_7387_12fa_d5c9; uint64_t m2 = (tmp >> 64) ^ tmp; return m2; } // wyhash64 static inline void wyhash64_set_seed( uint64_t & state, uint64_t seed ) { state = seed; wyhash64( state ); // prime } // wyhash64_set_seed #endif // __SIZEOF_INT128__ //-------------------------------------------------- static inline uint64_t xorshift_13_7_17( uint64_t & state ) { uint64_t ret = state; state ^= state << 13; state ^= state >> 7; state ^= state << 17; return ret; } // xorshift_13_7_17 static inline void xorshift_13_7_17_set_seed( uint64_t & state, uint64_t seed ) { state = seed; xorshift_13_7_17( state ); // prime } // xorshift_13_7_17_set_seed //-------------------------------------------------- // Marsaglia shift-XOR PRNG with thread-local state // Period is 4G-1 // 0 is absorbing and must be avoided // Low-order bits are not particularly random static inline uint32_t xorshift_6_21_7( uint32_t & state ) { uint32_t ret = state; state ^= state << 6; state ^= state >> 21; state ^= state << 7; return ret; } // xorshift_6_21_7 static inline void xorshift_6_21_7_set_seed( uint32_t & state, uint32_t seed ) { state = seed; xorshift_6_21_7( state ); // prime } // xorshift_6_21_7_set_seed //-------------------------------------------------- // The state must be seeded with a nonzero value. static inline uint64_t xorshift_12_25_27( uint64_t & state ) { uint64_t ret = state; state ^= state >> 12; state ^= state << 25; state ^= state >> 27; return ret * 0x_2545_F491_4F6C_DD1D; } // xorshift_12_25_27 static inline void xorshift_12_25_27_set_seed( uint64_t & state, uint64_t seed ) { state = seed; xorshift_12_25_27( state ); // prime } // xorshift_12_25_27_set_seed //-------------------------------------------------- // The state must be seeded with a nonzero value. #ifndef KISS_64 typedef struct kiss_64_t { uint64_t z, w, jsr, jcong; } kiss_64_t; #endif // ! KISS_64 static inline uint64_t kiss_64( kiss_64_t & rs ) with(rs) { kiss_64_t ret = rs; z = 36969 * (z & 65535) + (z >> 16); w = 18000 * (w & 65535) + (w >> 16); jsr ^= (jsr << 13); jsr ^= (jsr >> 17); jsr ^= (jsr << 5); jcong = 69069 * jcong + 1234567; return (((ret.z << 16) + ret.w) ^ ret.jcong) + ret.jsr; } // kiss_64 static inline void kiss_64_set_seed( kiss_64_t & rs, uint64_t seed ) with(rs) { z = 1; w = 1; jsr = 4; jcong = seed; kiss_64( rs ); // prime } // kiss_64_set_seed //-------------------------------------------------- // The state array must be initialized to non-zero in the first four words. #ifndef XORWOW typedef struct xorwow_t { uint32_t a, b, c, d, counter; } xorwow_t; #endif // ! XORWOW static inline uint32_t xorwow( xorwow_t & rs ) with(rs) { // Algorithm "xorwow" from p. 5 of Marsaglia, "Xorshift RNGs". uint32_t ret = a + counter; uint32_t t = d; uint32_t const s = a; d = c; c = b; b = s; t ^= t >> 2; t ^= t << 1; t ^= s ^ (s << 4); a = t; counter += 362437; return ret; } // xorwow static inline void xorwow_set_seed( xorwow_t & rs, uint32_t seed ) { rs = (xorwow_t){ seed, seed, seed, seed, 0 }; xorwow( rs ); // prime } // xorwow_set_seed //-------------------------------------------------- // Used in __tls_rand_fwd #define M (1_l64u << 48_l64u) #define A (25214903917_l64u) #define AI (18446708753438544741_l64u) #define C (11_l64u) #define D (16_l64u) // Bi-directional LCG random-number generator static inline uint32_t LCGBI_fwd( uint64_t & rs ) { rs = (A * rs + C) & (M - 1); return rs >> D; } // LCGBI_fwd static inline uint32_t LCGBI_bck( uint64_t & rs ) { unsigned int r = rs >> D; rs = AI * (rs - C) & (M - 1); return r; } // LCGBI_bck #undef M #undef A #undef AI #undef C #undef D #endif // __cforall