Ignore:
Timestamp:
Feb 21, 2020, 3:36:36 PM (4 years ago)
Author:
Thierry Delisle <tdelisle@…>
Branches:
ADT, arm-eh, ast-experimental, enum, forall-pointer-decay, jacob/cs343-translation, jenkins-sandbox, master, new-ast, new-ast-unique-expr, pthread-emulation, qualifiedEnum
Children:
c7a900a
Parents:
8c50aed (diff), c744563a (diff)
Note: this is a merge changeset, the changes displayed below correspond to the merge itself.
Use the (diff) links above to see all the changes relative to each parent.
Message:

Merge branch 'master' into park_unpark

File:
1 edited

Legend:

Unmodified
Added
Removed
  • doc/theses/thierry_delisle_PhD/code/utils.hpp

    r8c50aed ra505021  
    1010#include <unistd.h>
    1111#include <sys/sysinfo.h>
     12
     13#include <x86intrin.h>
    1214
    1315// Barrier from
     
    5658}
    5759
    58 void affinity(int tid) {
     60static inline void affinity(int tid) {
    5961        static int cpus = get_nprocs();
    6062
     
    7072
    7173static const constexpr std::size_t cache_line_size = 64;
    72 void check_cache_line_size() {
     74static inline void check_cache_line_size() {
    7375        std::cout << "Checking cache line size" << std::endl;
    7476        const std::string cache_file = "/sys/devices/system/cpu/cpu0/cache/index0/coherency_line_size";
     
    103105        return std::chrono::duration_cast<std::chrono::duration<T, Ratio>>(std::chrono::duration<T>(seconds)).count();
    104106}
     107
     108static inline unsigned rand_bit(unsigned rnum, size_t mask) {
     109        unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0;
     110#if !defined(__BMI2__)
     111        uint64_t v = mask;   // Input value to find position with rank r.
     112        unsigned int r = bit + 1;// Input: bit's desired rank [1-64].
     113        unsigned int s;      // Output: Resulting position of bit with rank r [1-64]
     114        uint64_t a, b, c, d; // Intermediate temporaries for bit count.
     115        unsigned int t;      // Bit count temporary.
     116
     117        // Do a normal parallel bit count for a 64-bit integer,
     118        // but store all intermediate steps.
     119        a =  v - ((v >> 1) & ~0UL/3);
     120        b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
     121        c = (b + (b >> 4)) & ~0UL/0x11;
     122        d = (c + (c >> 8)) & ~0UL/0x101;
     123
     124
     125        t = (d >> 32) + (d >> 48);
     126        // Now do branchless select!
     127        s  = 64;
     128        s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
     129        t  = (d >> (s - 16)) & 0xff;
     130        s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
     131        t  = (c >> (s - 8)) & 0xf;
     132        s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
     133        t  = (b >> (s - 4)) & 0x7;
     134        s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
     135        t  = (a >> (s - 2)) & 0x3;
     136        s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
     137        t  = (v >> (s - 1)) & 0x1;
     138        s -= ((t - r) & 256) >> 8;
     139        return s - 1;
     140#else
     141        uint64_t picked = _pdep_u64(1ul << bit, mask);
     142        return picked ? __builtin_ctzl(picked) : 0;
     143#endif
     144}
Note: See TracChangeset for help on using the changeset viewer.