Context Navigation

← Previous Change
Next Change →

Changeset 4d8fbf4 for benchmark

Timestamp:

Sep 16, 2021, 2:22:01 PM (3 years ago)

Author:

caparsons <caparson@…>

Branches:

ADT, ast-experimental, enum, forall-pointer-decay, master, pthread-emulation, qualifiedEnum

Children:

432bffe, 7e7a076

Parents:

a8367eb (diff), 140eb16 (diff)
Note: this is a merge changeset, the changes displayed below correspond to the merge itself.
Use the (diff) links above to see all the changes relative to each parent.

Message:

Merge branch 'master' of plg.uwaterloo.ca:software/cfa/cfa-cc

Location:

benchmark

Files:

: 4 added
: 13 edited

Cargo.toml.in (modified) (1 diff)
Makefile.am (modified) (2 diffs)
bench.rs (modified) (4 diffs)
readyQ/cycle.cpp (modified) (3 diffs)
readyQ/cycle.go (modified) (2 diffs)
readyQ/cycle.rs (modified) (1 diff)
readyQ/locality.go (modified) (1 diff)
readyQ/locality.rs (modified) (2 diffs)
readyQ/transfer.cfa (modified) (3 diffs)
readyQ/transfer.cpp (modified) (1 diff)
readyQ/transfer.go (added)
readyQ/transfer.rs (added)
readyQ/yield.cfa (modified) (1 diff)
readyQ/yield.cpp (modified) (1 diff)
readyQ/yield.go (added)
readyQ/yield.rs (added)
rmit.py (modified) (7 diffs)

Legend:

: Unmodified
: Added
: Removed

benchmark/Cargo.toml.in

-                      ra8367eb
+                      r4d8fbf4
 [[bin]]
 name = "cycle-tokio"
+name = "rdq-cycle-tokio"
 path = "@abs_srcdir@/readyQ/cycle.rs"
 [[bin]]
 name = "locality-tokio"
+name = "rdq-locality-tokio"
 path = "@abs_srcdir@/readyQ/locality.rs"
+[[bin]]
+name = "rdq-transfer-tokio"
+path = "@abs_srcdir@/readyQ/transfer.rs"
+[[bin]]
+name = "rdq-yield-tokio"
+path = "@abs_srcdir@/readyQ/yield.rs"
 [features]

benchmark/Makefile.am

-                      ra8367eb
+                      r4d8fbf4
 include $(top_srcdir)/tools/build/cfa.make
 AM_CFLAGS = -O2 -Wall -Wextra -I$(srcdir) -lrt -pthread # -Werror
+AM_CFLAGS = -O3 -Wall -Wextra -I$(srcdir) -lrt -pthread # -Werror
 AM_CFAFLAGS = -quiet -nodebug
 AM_UPPFLAGS = -quiet -nodebug -multi -std=c++14
 …
 ## =========================================================================================================
+%-tokio$(EXEEXT): $(srcdir)/readyQ/%.rs $(srcdir)/bench.rs
+        cd $(builddir) && cargo build --release
+        cp $(builddir)/target/release/$(basename $@) $@
+RDQBENCHES = \
+        rdq-cycle-cfa \
+        rdq-cycle-tokio \
+        rdq-cycle-go \
+        rdq-cycle-fibre \
+        rdq-yield-cfa \
+        rdq-yield-tokio \
+        rdq-yield-go \
+        rdq-yield-fibre \
+        rdq-locality-cfa \
+        rdq-locality-tokio \
+        rdq-locality-go \
+        rdq-locality-fibre \
+        rdq-transfer-cfa \
+        rdq-transfer-tokio \
+        rdq-transfer-go \
+        rdq-transfer-fibre
+rdq-benches:
+        +make $(RDQBENCHES)
+clean-rdq-benches:
+        rm -rf $(RDQBENCHES) $(builddir)/target go.mod
+rdq-%-tokio$(EXEEXT): $(builddir)/target/release/rdq-%-tokio$(EXEEXT)
+        $(BENCH_V_RUSTC)cp $(builddir)/target/release/$(basename $@) $@
+$(builddir)/target/release/rdq-%-tokio$(EXEEXT): $(srcdir)/readyQ/%.rs $(srcdir)/bench.rs
+        $(BENCH_V_RUSTC)cd $(builddir) && cargo build --release
+rdq-%-cfa$(EXEEXT): $(srcdir)/readyQ/%.cfa $(srcdir)/readyQ/rq_bench.hfa
+        $(BENCH_V_CFA)$(CFACOMPILE) $< -o $@
+go.mod:
+        touch $@
+        go mod edit -module=rdq.bench
+        go get golang.org/x/sync/semaphore
+        go get golang.org/x/text/language
+        go get golang.org/x/text/message
+rdq-%-go$(EXEEXT): $(srcdir)/readyQ/%.go $(srcdir)/readyQ/bench.go go.mod
+        $(BENCH_V_GOC)go build -o $@ $< $(srcdir)/readyQ/bench.go
+rdq-%-fibre$(EXEEXT): $(srcdir)/readyQ/%.cpp
+        $(BENCH_V_CXX)$(CXXCOMPILE) $< -o $@ -lfibre -std=c++17 $(AM_CFLAGS)
+# ## =========================================================================================================
+CLEANFILES = $(RDQBENCHES) go.mod go.sum
+clean-local:
+        -rm -rf target

benchmark/bench.rs

-                      ra8367eb
+                      r4d8fbf4
 use std::io::{self, Write};
+use std::option;
 use std::sync::atomic::{AtomicU64, AtomicBool, Ordering};
 use std::time::{Instant,Duration};
+use std::u128;
 use clap::{Arg, ArgMatches};
 …
 impl BenchData {
         pub fn new(options: ArgMatches, nthreads: usize) -> BenchData {
+        pub fn new(options: ArgMatches, nthreads: usize, default_it: option::Option<u64>) -> BenchData {
                 let (clock_mode, stop_count, duration) = if options.is_present("iterations") {
                         (false,
                         options.value_of("iterations").unwrap().parse::<u64>().unwrap(),
+                        -1.0)
+                } else if !default_it.is_none() {
+                        (false,
+                        default_it.unwrap(),
                         -1.0)
                 } else {
 …
+        }
+        #[allow(dead_code)]
         pub async fn wait(&self, start: &Instant) -> Duration{
                 loop {
 …
+}
+// ==================================================
/// Advances a 128-bit Lehmer-style multiplicative generator and returns its
/// next 64-bit output.
///
/// `state` is multiplied in place by a fixed constant, wrapping modulo 2^128,
/// and the upper 64 bits of the new state are returned.
///
/// A state of `0` is a fixed point: it stays `0` and every output is `0`, so
/// callers must seed `state` with a non-zero value.
pub fn _lehmer64(state: &mut u128) -> u64 {
    // The explicit `u128` suffix makes the width of the multiplication visible
    // instead of relying on inference from `state`.
    *state = state.wrapping_mul(0xda94_2042_e4dd_58b5_u128);
    // After shifting out the low half only 64 significant bits remain, so the
    // cast cannot drop information.
    (*state >> 64) as u64
}

benchmark/readyQ/cycle.cpp

-                      ra8367eb
+                      r4d8fbf4
                         Fibre * threads[tthreads];
                         Partner thddata[tthreads];
                         for(int i = 0; i < tthreads; i++) {
+                        for(unsigned i = 0; i < tthreads; i++) {
                                 unsigned pi = (i + nthreads) % tthreads;
                                 thddata[i].next = &thddata[pi].self;
+                        }
                         for(int i = 0; i < tthreads; i++) {
+                        for(unsigned i = 0; i < tthreads; i++) {
                                 threads[i] = new Fibre( reinterpret_cast<void (*)(void *)>(partner_main), &thddata[i] );
+                        }
 …
                         start = timeHiRes();
                         for(int i = 0; i < nthreads; i++) {
+                        for(unsigned i = 0; i < nthreads; i++) {
                                 thddata[i].self.post();
+                        }
 …
                         printf("\nDone\n");
                         for(int i = 0; i < tthreads; i++) {
+                        for(unsigned i = 0; i < tthreads; i++) {
                                 thddata[i].self.post();
                                 fibre_join( threads[i], nullptr );

benchmark/readyQ/cycle.go

-                      ra8367eb
+                      r4d8fbf4
         atomic.StoreInt32(&stop, 1)
         end := time.Now()
         delta := end.Sub(start)
+        duration := end.Sub(start)
         fmt.Printf("\nDone\n")
 …
         p := message.NewPrinter(language.English)
         p.Printf("Duration (ms)        : %f\n", delta.Seconds());
+        p.Printf("Duration (ms)        : %d\n", duration.Milliseconds())
         p.Printf("Number of processors : %d\n", nprocs);
         p.Printf("Number of threads    : %d\n", tthreads);
         p.Printf("Cycle size (# thrds) : %d\n", ring_size);
         p.Printf("Total Operations(ops): %15d\n", global_counter)
         p.Printf("Ops per second       : %18.2f\n", float64(global_counter) / delta.Seconds())
         p.Printf("ns per ops           : %18.2f\n", float64(delta.Nanoseconds()) / float64(global_counter))
+        p.Printf("Ops per second       : %18.2f\n", float64(global_counter) / duration.Seconds())
+        p.Printf("ns per ops           : %18.2f\n", float64(duration.Nanoseconds()) / float64(global_counter))
         p.Printf("Ops per threads      : %15d\n", global_counter / uint64(tthreads))
         p.Printf("Ops per procs        : %15d\n", global_counter / uint64(nprocs))
         p.Printf("Ops/sec/procs        : %18.2f\n", (float64(global_counter) / float64(nprocs)) / delta.Seconds())
         p.Printf("ns per ops/procs     : %18.2f\n", float64(delta.Nanoseconds()) / (float64(global_counter) / float64(nprocs)))
+        p.Printf("Ops/sec/procs        : %18.2f\n", (float64(global_counter) / float64(nprocs)) / duration.Seconds())
+        p.Printf("ns per ops/procs     : %18.2f\n", float64(duration.Nanoseconds()) / (float64(global_counter) / float64(nprocs)))
+}

benchmark/readyQ/cycle.rs

ra8367eb	r4d8fbf4
46	46
47	47	let tthreads = nthreads * ring_size;
48		let exp = Arc::new(bench::BenchData::new(options, tthreads));
	48	let exp = Arc::new(bench::BenchData::new(options, tthreads, None));
49	49
50	50	let s = (1000000 as u64).to_formatted_string(&Locale::en);

benchmark/readyQ/locality.go

ra8367eb	r4d8fbf4
286	286	// Print with nice 's, i.e. 1'000'000 instead of 1000000
287	287	p := message.NewPrinter(language.English)
288		p.Printf("Duration (s) : %f\n", delta.Seconds());
	288	p.Printf("Duration (ms) : %f\n", delta.Milliseconds());
289	289	p.Printf("Number of processors : %d\n", nprocs);
290	290	p.Printf("Number of threads : %d\n", nthreads);

benchmark/readyQ/locality.rs

-                      ra8367eb
+                      r4d8fbf4
                                                 return (r as *mut MyData, true);
+                                        }
                                         let got = self.ptr.compare_and_swap(expected, ctx as *mut MyCtx as u64, Ordering::SeqCst);
                                         if got == expected {
+                                        let got = self.ptr.compare_exchange_weak(expected, ctx as *mut MyCtx as u64, Ordering::SeqCst, Ordering::SeqCst);
+                                        if got == Ok(expected) {
                                                 break expected;// We got the seat
+                                        }
 …
         assert_eq!(&s, "1,000,000");
         let exp = Arc::new(bench::BenchData::new(options, nprocs));
+        let exp = Arc::new(bench::BenchData::new(options, nprocs, None));
         let mut results = Result::new();

benchmark/readyQ/transfer.cfa

-                      ra8367eb
+                      r4d8fbf4
                         Pause();
                         if( (timeHiRes() - start) > 5`s ) {
+                                print_stats_now( bench_cluster, CFA_STATS_READY_Q | CFA_STATS_IO );
                                 serr | "Programs has been blocked for more than 5 secs";
                                 exit(1);
 …
         cfa_option opt[] = {
                 BENCH_OPT,
                 { 'e', "exhaust", "Whether or not threads that have seen the new epoch should yield or park.", exhaust, parse_yesno}
+                { 'e', "exhaust", "Whether or not threads that have seen the new epoch should park instead of yielding.", exhaust, parse_yesno}
         };
         BENCH_OPT_PARSE("cforall transition benchmark");
 …
+        }
         sout | "Duration                : " | ws(3, 3, unit(eng((end - start)`ds))) | 's';
+        sout | "Duration (ms)           : " | ws(3, 3, unit(eng((end - start)`dms)));
         sout | "Number of processors    : " | nprocs;
         sout | "Number of threads       : " | nthreads;

benchmark/readyQ/transfer.cpp

ra8367eb	r4d8fbf4
173	173	}
174	174
175		std::cout << "Duration : " << to_miliseconds(end - start) << "ms" << std::endl;
	175	std::cout << "Duration (ms) : " << to_miliseconds(end - start) << std::endl;
176	176	std::cout << "Number of processors : " << nprocs << std::endl;
177	177	std::cout << "Number of threads : " << nthreads << std::endl;

benchmark/readyQ/yield.cfa

-                      ra8367eb
+                      r4d8fbf4
+                }
+                printf("Took %'ld ms\n", (end - start)`ms);
+                printf("Duration (ms)       : %'ld\n", (end - start)`dms);
+                printf("Number of processors: %'d\n", nprocs);
+                printf("Number of threads   : %'d\n", nthreads);
+                printf("Total yields        : %'15llu\n", global_counter);
                 printf("Yields per second   : %'18.2lf\n", ((double)global_counter) / (end - start)`s);
                 printf("ns per yields       : %'18.2lf\n", ((double)(end - start)`ns) / global_counter);
-                printf("Total yields        : %'15llu\n", global_counter);
                 printf("Yields per procs    : %'15llu\n", global_counter / nprocs);
                 printf("Yields/sec/procs    : %'18.2lf\n", (((double)global_counter) / nprocs) / (end - start)`s);

benchmark/readyQ/yield.cpp

-                      ra8367eb
+                      r4d8fbf4
                 auto dur_nano = duration_cast<std::nano>(duration);
+                auto dur_dms  = duration_cast<std::milli>(duration);
                 std::cout << "Took " << duration << " s\n";
+                printf("Duration (ms)       : %'.2lf\n", dur_dms );
                 printf("Total yields        : %'15llu\n", global_counter );
                 printf("Yields per procs    : %'15llu\n", global_counter / nprocs );

benchmark/rmit.py

-                      ra8367eb
+                      r4d8fbf4
 import random
 import re
+import socket
 import subprocess
 import sys
 …
         return nopts
def search_option(action, opt):
        """Return the argument that follows the first occurrence of `opt`.

        `action` is a command line given as a list of strings.  Returns None
        when `opt` is absent or is the last element (it has no value).
        """
        # Pair every element with its successor; a trailing `opt` has no
        # successor and therefore never matches.
        for key, value in zip(action, action[1:]):
                if key == opt:
                        return value
        return None

def actions_eta(actions):
        """Return the expected total run time of `actions`.

        The result is the sum of the '-d' value of every action, converted
        with int(); actions without a '-d' value contribute nothing.
        """
        total = 0
        for a in actions:
                # Each action is counted once, through search_option; scanning
                # for '-d' a second time would double the estimate.
                o = search_option(a, '-d')
                if o:
                        total += int(o)
        return total
+taskset_maps = None
def init_taskset_maps():
        """Select the taskset CPU map for the host this script runs on.

        On a known host the module-level `taskset_maps` is set to a dict that
        maps a range of '-p' values to the CPU list handed to `taskset -c`,
        and True is returned.  On any other host a warning is printed to
        stderr, `taskset_maps` is left untouched and False is returned.
        """
        global taskset_maps
        # range() excludes its stop value, so every stop is one past the
        # largest processor count the CPU list serves.  The lists hold 24, 48,
        # 96, 144 and 192 CPUs, and those counts must map to their own list.
        known_hosts = {
                "jax": {
                        range(  1,  25) : "48-71",
                        range( 25,  49) : "48-71,144-167",
                        range( 49,  97) : "48-95,144-191",
                        range( 97, 145) : "24-95,120-191",
                        range(145, 193) : "0-95,96-191",
                },
        }
        host = socket.gethostname()
        if host in known_hosts:
                taskset_maps = known_hosts[host]
                return True

        print("Warning unknown host '{}', disable taskset usage".format(host), file=sys.stderr)
        return False
def settaskset_one(action):
        """Prefix `action` with the `taskset -c <cpus>` matching its '-p' value.

        `action` is a command line given as a list of strings.  It is returned
        unchanged when it has no '-p' value, when that value is not an integer,
        or when no range in `taskset_maps` contains it (a warning is printed to
        stderr in that last case).  Otherwise a new list
        ['taskset', '-c', <cpus>, *action] is returned.

        `taskset_maps` must already hold a mapping (see init_taskset_maps); it
        is None until then and iterating it would raise.
        """
        nprocs_arg = search_option(action, '-p')
        if not nprocs_arg:
                return action

        try:
                nprocs = int(nprocs_arg)
        except ValueError:
                return action

        for span, cpus in taskset_maps.items():
                if nprocs in span:
                        return ['taskset', '-c', cpus, *action]

        print("Warning no mapping for {} cores".format(nprocs), file=sys.stderr)
        return action
def settaskset(actions):
        """Return a new list holding settaskset_one(a) for every a in `actions`, in order."""
        return list(map(settaskset_one, actions))
 if __name__ == "__main__":
 …
         parser.add_argument('--file', nargs='?', type=argparse.FileType('w'), default=sys.stdout)
         parser.add_argument('--trials', help='Number of trials to run per combinaison', type=int, default=3)
+        parser.add_argument('--notaskset', help='If specified, the trial will not use taskset to match the -p option', action='store_true')
         parser.add_argument('command', metavar='command', type=str, nargs=1, help='the command prefix to run')
         parser.add_argument('candidates', metavar='candidates', type=str, nargs='*', help='the candidate suffix to run')
 …
         # ================================================================================
+        # Figure out all the combinations to run
+        # Fixup the different commands
+        # Add tasksets
+        withtaskset = False
+        if not options.notaskset and init_taskset_maps():
+                withtaskset = True
+                actions = settaskset(actions)
+        # ================================================================================
+        # Now that we know what to run, print it.
+        # find expected time
+        time = actions_eta(actions)
+        print("Running {} trials{}".format(len(actions), "" if time == 0 else " (expecting to take {})".format(str(datetime.timedelta(seconds=int(time)))) ))
+        # dry run if options ask for it
         if options.list:
                 for a in actions:
 …
         # Prepare to run
-        # find expected time
-        time = actions_eta(actions)
-        print("Running {} trials{}".format(len(actions), "" if time == 0 else " (expecting to take {})".format(str(datetime.timedelta(seconds=int(time)))) ))
         random.shuffle(actions)
 …
         first = True
         for i, a in enumerate(actions):
                 sa = " ".join(a)
+                sa = " ".join(a[3:] if withtaskset else a)
                 if first:
                         first = False
 …
                                 match = re.search("^(.*):(.*)$", s)
                                 if match:
+                                        fields[match.group(1).strip()] = float(match.group(2).strip().replace(',',''))
+                options.file.write(json.dumps([a[0][2:], sa, fields]))
+                                        try:
+                                                fields[match.group(1).strip()] = float(match.group(2).strip().replace(',',''))
+                                        except:
+                                                pass
+                options.file.write(json.dumps([a[3 if withtaskset else 0][2:], sa, fields]))
                 options.file.flush()

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 4d8fbf4 for benchmark

Legend:

Download in other formats: