Changeset 9421f3d8
- Timestamp:
- Oct 30, 2019, 11:26:07 AM (5 years ago)
- Branches:
- ADT, arm-eh, ast-experimental, enum, forall-pointer-decay, jacob/cs343-translation, jenkins-sandbox, master, new-ast, new-ast-unique-expr, pthread-emulation, qualifiedEnum
- Children:
- df75fe97
- Parents:
- b067d9b
- Location:
- doc/theses/thierry_delisle_PhD
- Files:
-
- 5 added
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
doc/theses/thierry_delisle_PhD/code/relaxed_list.cpp
rb067d9b r9421f3d8 9 9 #include <vector> 10 10 11 #include <getopt.h> 11 12 #include <unistd.h> 12 13 #include <sys/sysinfo.h> … … 21 22 22 23 int value; 23 Node(int value): value(value) { 24 creates++; 25 } 26 27 ~Node() { 28 destroys++; 29 } 24 25 Node() { creates++; } 26 Node(int value): value(value) { creates++; } 27 ~Node() { destroys++; } 30 28 }; 31 29 … … 33 31 std::atomic_size_t Node::destroys = { 0 }; 34 32 35 static const constexpr int nodes_per_threads = 128;36 struct NodeArray {37 __attribute__((aligned(64))) Node * array[nodes_per_threads];38 __attribute__((aligned(64))) char pad;39 };40 41 33 bool enable_stats = false; 34 35 template<> 36 thread_local relaxed_list<Node>::TLS relaxed_list<Node>::tls = {}; 37 38 template<> 39 relaxed_list<Node>::intrusive_queue_t::stat::Dif relaxed_list<Node>::intrusive_queue_t::stat::dif = {}; 40 41 // ================================================================================================ 42 // UTILS 43 // ================================================================================================ 42 44 43 45 struct local_stat_t { … … 49 51 }; 50 52 51 __attribute__((noinline)) void run_body( 52 std::atomic<bool>& done, 53 Random & rand, 54 Node * (&my_nodes)[128], 55 local_stat_t & local, 56 relaxed_list<Node> & list 57 ) { 58 while(__builtin_expect(!done.load(std::memory_order_relaxed), true)) { 59 int idx = rand.next() % nodes_per_threads; 60 if (auto node = my_nodes[idx]) { 61 local.crc_in += node->value; 62 list.push(node); 63 my_nodes[idx] = nullptr; 64 local.in++; 65 } 66 else if(auto node = list.pop()) { 67 local.crc_out += node->value; 68 my_nodes[idx] = node; 69 local.out++; 70 } 71 else { 72 local.empty++; 73 } 74 } 75 } 76 77 void run(unsigned nthread, unsigned nqueues, unsigned fill, double duration) { 78 // List being tested 79 relaxed_list<Node> list = { nthread * nqueues }; 80 81 // Barrier for synchronization 82 barrier_t barrier(nthread + 1); 83 84 // Data to check everything is OK 53 struct global_stat_t { 54 std::atomic_size_t in = { 0 }; 55 std::atomic_size_t out = { 0 }; 56 std::atomic_size_t empty = { 0 }; 57 std::atomic_size_t crc_in = { 0 }; 58 std::atomic_size_t crc_out = { 0 }; 85 59 struct { 86 std::atomic_size_t in = { 0 };87 std::atomic_size_t out = { 0 };88 std::atomic_size_t empty = { 0 };89 std::atomic_size_t crc_in = { 0 };90 std::atomic_size_t crc_out = { 0 };91 60 struct { 92 struct { 93 std::atomic_size_t attempt = { 0 }; 94 std::atomic_size_t success = { 0 }; 95 } push; 96 struct { 97 std::atomic_size_t attempt = { 0 }; 98 std::atomic_size_t success = { 0 }; 99 } pop; 100 } pick; 101 } global; 102 103 // Flag to signal termination 104 std::atomic_bool done = { false }; 105 106 // Prep nodes 107 std::cout << "Initializing "; 108 size_t nnodes = 0; 109 size_t npushed = 0; 110 NodeArray all_nodes[nthread]; 111 for(auto & nodes : all_nodes) { 112 Random rand(rdtscl()); 113 for(auto & node : nodes.array) { 114 auto r = rand.next() % 100; 115 if(r < fill) { 116 node = new Node(rand.next() % 100); 117 nnodes++; 118 } else { 119 node = nullptr; 120 } 121 } 122 123 for(int i = 0; i < 10; i++) { 124 int idx = rand.next() % nodes_per_threads; 125 if (auto node = nodes.array[idx]) { 126 global.crc_in += node->value; 127 list.push(node); 128 npushed++; 129 nodes.array[idx] = nullptr; 130 } 131 } 132 } 133 134 std::cout << nnodes << " nodes " << fill << "% (" << npushed << " pushed)" << std::endl; 135 136 enable_stats = true; 137 138 std::thread * threads[nthread]; 139 unsigned i = 1; 140 for(auto & t : threads) { 141 auto & my_nodes = all_nodes[i - 1].array; 142 t = new std::thread([&done, &list, &barrier, &global, &my_nodes](unsigned tid) { 143 Random rand(tid + rdtscl()); 144 145 local_stat_t local; 146 147 // affinity(tid); 148 149 barrier.wait(tid); 150 151 // EXPERIMENT START 152 153 run_body(done, rand, my_nodes, local, list); 154 155 // EXPERIMENT END 156 157 barrier.wait(tid); 158 159 global.in += local.in; 160 global.out += local.out; 161 global.empty += local.empty; 162 163 for(auto node : my_nodes) { 164 delete node; 165 } 166 167 global.crc_in += local.crc_in; 168 global.crc_out += local.crc_out; 169 170 global.pick.push.attempt += relaxed_list<Node>::tls.pick.push.attempt; 171 global.pick.push.success += relaxed_list<Node>::tls.pick.push.success; 172 global.pick.pop .attempt += relaxed_list<Node>::tls.pick.pop.attempt; 173 global.pick.pop .success += relaxed_list<Node>::tls.pick.pop.success; 174 }, i++); 175 } 176 61 std::atomic_size_t attempt = { 0 }; 62 std::atomic_size_t success = { 0 }; 63 } push; 64 struct { 65 std::atomic_size_t attempt = { 0 }; 66 std::atomic_size_t success = { 0 }; 67 std::atomic_size_t mask_attempt = { 0 }; 68 } pop; 69 } pick; 70 struct { 71 struct { 72 std::atomic_size_t value = { 0 }; 73 std::atomic_size_t count = { 0 }; 74 } push; 75 struct { 76 std::atomic_size_t value = { 0 }; 77 std::atomic_size_t count = { 0 }; 78 } pop; 79 } qstat; 80 }; 81 82 void tally_stats(global_stat_t & global, local_stat_t & local) { 83 global.in += local.in; 84 global.out += local.out; 85 global.empty += local.empty; 86 87 global.crc_in += local.crc_in; 88 global.crc_out += local.crc_out; 89 90 global.pick.push.attempt += relaxed_list<Node>::tls.pick.push.attempt; 91 global.pick.push.success += relaxed_list<Node>::tls.pick.push.success; 92 global.pick.pop .attempt += relaxed_list<Node>::tls.pick.pop.attempt; 93 global.pick.pop .success += relaxed_list<Node>::tls.pick.pop.success; 94 global.pick.pop .mask_attempt += relaxed_list<Node>::tls.pick.pop.mask_attempt; 95 96 global.qstat.push.value += relaxed_list<Node>::tls.empty.push.value; 97 global.qstat.push.count += relaxed_list<Node>::tls.empty.push.count; 98 global.qstat.pop .value += relaxed_list<Node>::tls.empty.pop .value; 99 global.qstat.pop .count += relaxed_list<Node>::tls.empty.pop .count; 100 } 101 102 void waitfor(double & duration, barrier_t & barrier, std::atomic_bool & done) { 177 103 std::cout << "Starting" << std::endl; 178 104 auto before = Clock::now(); … … 196 122 duration = durr.count(); 197 123 std::cout << "\rClosing down" << std::endl; 198 199 for(auto t : threads) { 200 t->join(); 201 delete t; 202 } 203 204 enable_stats = false; 205 206 while(auto node = list.pop()) { 207 global.crc_out += node->value; 208 delete node; 209 } 210 124 } 125 126 void print_stats(double duration, unsigned nthread, global_stat_t & global) { 211 127 assert(Node::creates == Node::destroys); 212 128 assert(global.crc_in == global.crc_out); … … 227 143 double push_sur = (100.0 * double(global.pick.push.success) / global.pick.push.attempt); 228 144 double pop_sur = (100.0 * double(global.pick.pop .success) / global.pick.pop .attempt); 145 229 146 std::cout << "Push Pick % : " << push_sur << "(" << global.pick.push.success << " / " << global.pick.push.attempt << ")\n"; 230 147 std::cout << "Pop Pick % : " << pop_sur << "(" << global.pick.pop .success << " / " << global.pick.pop .attempt << ")\n"; 148 std::cout << "Pop mask trys : " << global.pick.pop.mask_attempt << std::endl; 149 150 double avgQ_push = double(global.qstat.push.value) / global.qstat.push.count; 151 double avgQ_pop = double(global.qstat.pop .value) / global.qstat.pop .count; 152 double avgQ = double(global.qstat.push.value + global.qstat.pop .value) / (global.qstat.push.count + global.qstat.pop .count); 153 std::cout << "Push Avg Qs : " << avgQ_push << " (" << global.qstat.push.count << "ops)\n"; 154 std::cout << "Pop Avg Qs : " << avgQ_pop << " (" << global.qstat.pop .count << "ops)\n"; 155 std::cout << "Global Avg Qs : " << avgQ << " (" << (global.qstat.push.count + global.qstat.pop .count) << "ops)\n"; 231 156 #endif 232 157 } 233 158 234 void usage(char * argv[]) { 235 std::cerr << argv[0] << ": [DURATION (FLOAT:SEC)] [NTHREADS] [NQUEUES] [FILL]" << std::endl;; 236 std::exit(1); 159 // ================================================================================================ 160 // EXPERIMENTS 161 // ================================================================================================ 162 163 // ================================================================================================ 164 __attribute__((noinline)) void runChurn_body( 165 std::atomic<bool>& done, 166 Random & rand, 167 Node * my_nodes[], 168 unsigned nslots, 169 local_stat_t & local, 170 relaxed_list<Node> & list 171 ) { 172 while(__builtin_expect(!done.load(std::memory_order_relaxed), true)) { 173 int idx = rand.next() % nslots; 174 if (auto node = my_nodes[idx]) { 175 local.crc_in += node->value; 176 list.push(node); 177 my_nodes[idx] = nullptr; 178 local.in++; 179 } 180 else if(auto node = list.pop()) { 181 local.crc_out += node->value; 182 my_nodes[idx] = node; 183 local.out++; 184 } 185 else { 186 local.empty++; 187 } 188 } 189 } 190 191 void runChurn(unsigned nthread, unsigned nqueues, double duration, unsigned nnodes, const unsigned nslots) { 192 std::cout << "Churn Benchmark" << std::endl; 193 assert(nnodes <= nslots); 194 // List being tested 195 relaxed_list<Node> list = { nthread * nqueues }; 196 197 // Barrier for synchronization 198 barrier_t barrier(nthread + 1); 199 200 // Data to check everything is OK 201 global_stat_t global; 202 203 // Flag to signal termination 204 std::atomic_bool done = { false }; 205 206 // Prep nodes 207 std::cout << "Initializing "; 208 size_t npushed = 0; 209 210 Node** all_nodes[nthread]; 211 for(auto & nodes : all_nodes) { 212 nodes = new __attribute__((aligned(64))) Node*[nslots + 8]; 213 Random rand(rdtscl()); 214 for(unsigned i = 0; i < nnodes; i++) { 215 nodes[i] = new Node(rand.next() % 100); 216 } 217 218 for(unsigned i = nnodes; i < nslots; i++) { 219 nodes[i] = nullptr; 220 } 221 222 for(int i = 0; i < 10 && i < (int)nslots; i++) { 223 int idx = rand.next() % nslots; 224 if (auto node = nodes[idx]) { 225 global.crc_in += node->value; 226 list.push(node); 227 npushed++; 228 nodes[idx] = nullptr; 229 } 230 } 231 } 232 233 std::cout << nnodes << " nodes (" << nslots << " slots)" << std::endl; 234 235 enable_stats = true; 236 237 std::thread * threads[nthread]; 238 unsigned i = 1; 239 for(auto & t : threads) { 240 auto & my_nodes = all_nodes[i - 1]; 241 t = new std::thread([&done, &list, &barrier, &global, &my_nodes, nslots](unsigned tid) { 242 Random rand(tid + rdtscl()); 243 244 local_stat_t local; 245 246 // affinity(tid); 247 248 barrier.wait(tid); 249 250 // EXPERIMENT START 251 252 runChurn_body(done, rand, my_nodes, nslots, local, list); 253 254 // EXPERIMENT END 255 256 barrier.wait(tid); 257 258 tally_stats(global, local); 259 260 for(unsigned i = 0; i < nslots; i++) { 261 delete my_nodes[i]; 262 } 263 }, i++); 264 } 265 266 waitfor(duration, barrier, done); 267 268 for(auto t : threads) { 269 t->join(); 270 delete t; 271 } 272 273 enable_stats = false; 274 275 while(auto node = list.pop()) { 276 global.crc_out += node->value; 277 delete node; 278 } 279 280 for(auto nodes : all_nodes) { 281 delete[] nodes; 282 } 283 284 print_stats(duration, nthread, global); 285 } 286 287 // ================================================================================================ 288 __attribute__((noinline)) void runPingPong_body( 289 std::atomic<bool>& done, 290 Node initial_nodes[], 291 unsigned nnodes, 292 local_stat_t & local, 293 relaxed_list<Node> & list 294 ) { 295 Node * nodes[nnodes]; 296 { 297 unsigned i = 0; 298 for(auto & n : nodes) { 299 n = &initial_nodes[i++]; 300 } 301 } 302 303 while(__builtin_expect(!done.load(std::memory_order_relaxed), true)) { 304 305 for(Node * & node : nodes) { 306 local.crc_in += node->value; 307 list.push(node); 308 local.in++; 309 } 310 311 // ----- 312 313 for(Node * & node : nodes) { 314 node = list.pop(); 315 assert(node); 316 local.crc_out += node->value; 317 local.out++; 318 } 319 } 320 } 321 322 void runPingPong(unsigned nthread, unsigned nqueues, double duration, unsigned nnodes) { 323 std::cout << "PingPong Benchmark" << std::endl; 324 325 // List being tested 326 relaxed_list<Node> list = { nthread * nqueues }; 327 328 // Barrier for synchronization 329 barrier_t barrier(nthread + 1); 330 331 // Data to check everything is OK 332 global_stat_t global; 333 334 // Flag to signal termination 335 std::atomic_bool done = { false }; 336 337 std::cout << "Initializing "; 338 enable_stats = true; 339 340 std::thread * threads[nthread]; 341 unsigned i = 1; 342 for(auto & t : threads) { 343 t = new std::thread([&done, &list, &barrier, &global, nnodes](unsigned tid) { 344 Random rand(tid + rdtscl()); 345 346 Node nodes[nnodes]; 347 for(auto & n : nodes) { 348 n.value = (int)rand.next() % 100; 349 } 350 351 local_stat_t local; 352 353 // affinity(tid); 354 355 barrier.wait(tid); 356 357 // EXPERIMENT START 358 359 runPingPong_body(done, nodes, nnodes, local, list); 360 361 // EXPERIMENT END 362 363 barrier.wait(tid); 364 365 tally_stats(global, local); 366 }, i++); 367 } 368 369 waitfor(duration, barrier, done); 370 371 for(auto t : threads) { 372 t->join(); 373 delete t; 374 } 375 376 enable_stats = false; 377 378 print_stats(duration, nthread, global); 379 } 380 381 bool iequals(const std::string& a, const std::string& b) 382 { 383 return std::equal(a.begin(), a.end(), 384 b.begin(), b.end(), 385 [](char a, char b) { 386 return std::tolower(a) == std::tolower(b); 387 }); 237 388 } 238 389 … … 241 392 double duration = 5.0; 242 393 unsigned nthreads = 2; 243 unsigned nqueues = 2; 244 unsigned fill = 100; 394 unsigned nqueues = 4; 395 unsigned nnodes = 100; 396 unsigned nslots = 100; 397 398 enum { 399 Churn, 400 PingPong, 401 NONE 402 } benchmark = NONE; 245 403 246 404 std::cout.imbue(std::locale("")); 247 405 248 switch (argc) 249 { 250 case 5: 251 fill = std::stoul(argv[4]); 252 [[fallthrough]]; 253 case 4: 254 nqueues = std::stoul(argv[3]); 255 [[fallthrough]]; 256 case 3: 257 nthreads = std::stoul(argv[2]); 258 [[fallthrough]]; 259 case 2: 260 duration = std::stod(argv[1]); 261 if( duration <= 0.0 ) { 262 std::cerr << "Duration must be positive, was " << argv[1] << "(" << duration << ")" << std::endl; 263 usage(argv); 264 } 265 [[fallthrough]]; 266 case 1: 267 break; 268 default: 269 usage(argv); 270 break; 271 } 406 for(;;) { 407 static struct option options[] = { 408 {"duration", required_argument, 0, 'd'}, 409 {"nthreads", required_argument, 0, 't'}, 410 {"nqueues", required_argument, 0, 'q'}, 411 {"benchmark", required_argument, 0, 'b'}, 412 {0, 0, 0, 0} 413 }; 414 415 int idx = 0; 416 int opt = getopt_long(argc, argv, "d:t:q:b:", options, &idx); 417 418 std::string arg = optarg ? optarg : ""; 419 size_t len = 0; 420 switch(opt) { 421 // Exit Case 422 case -1: 423 /* paranoid */ assert(optind <= argc); 424 switch(benchmark) { 425 case NONE: 426 std::cerr << "Must specify a benchmark" << std::endl; 427 goto usage; 428 case PingPong: 429 nnodes = 1; 430 nslots = 1; 431 switch(argc - optind) { 432 case 0: break; 433 case 1: 434 try { 435 arg = optarg = argv[optind]; 436 nnodes = stoul(optarg, &len); 437 if(len != arg.size()) { throw std::invalid_argument(""); } 438 } catch(std::invalid_argument &) { 439 std::cerr << "Number of nodes must be a positive integer, was " << arg << std::endl; 440 goto usage; 441 } 442 break; 443 default: 444 std::cerr << "'PingPong' benchmark doesn't accept more than 2 extra arguments" << std::endl; 445 goto usage; 446 } 447 break; 448 case Churn: 449 nnodes = 100; 450 nslots = 100; 451 switch(argc - optind) { 452 case 0: break; 453 case 1: 454 try { 455 arg = optarg = argv[optind]; 456 nnodes = stoul(optarg, &len); 457 if(len != arg.size()) { throw std::invalid_argument(""); } 458 nslots = nnodes; 459 } catch(std::invalid_argument &) { 460 std::cerr << "Number of nodes must be a positive integer, was " << arg << std::endl; 461 goto usage; 462 } 463 break; 464 case 2: 465 try { 466 arg = optarg = argv[optind]; 467 nnodes = stoul(optarg, &len); 468 if(len != arg.size()) { throw std::invalid_argument(""); } 469 } catch(std::invalid_argument &) { 470 std::cerr << "Number of nodes must be a positive integer, was " << arg << std::endl; 471 goto usage; 472 } 473 try { 474 arg = optarg = argv[optind + 1]; 475 nslots = stoul(optarg, &len); 476 if(len != arg.size()) { throw std::invalid_argument(""); } 477 } catch(std::invalid_argument &) { 478 std::cerr << "Number of slots must be a positive integer, was " << arg << std::endl; 479 goto usage; 480 } 481 break; 482 default: 483 std::cerr << "'Churn' benchmark doesn't accept more than 2 extra arguments" << std::endl; 484 goto usage; 485 } 486 break; 487 } 488 goto run; 489 // Benchmarks 490 case 'b': 491 if(benchmark != NONE) { 492 std::cerr << "Only when benchmark can be run" << std::endl; 493 goto usage; 494 } 495 if(iequals(arg, "churn")) { 496 benchmark = Churn; 497 break; 498 } 499 if(iequals(arg, "pingpong")) { 500 benchmark = PingPong; 501 break; 502 } 503 std::cerr << "Unkown benchmark " << arg << std::endl; 504 goto usage; 505 // Numeric Arguments 506 case 'd': 507 try { 508 duration = stod(optarg, &len); 509 if(len != arg.size()) { throw std::invalid_argument(""); } 510 } catch(std::invalid_argument &) { 511 std::cerr << "Duration must be a valid double, was " << arg << std::endl; 512 goto usage; 513 } 514 break; 515 case 't': 516 try { 517 nthreads = stoul(optarg, &len); 518 if(len != arg.size()) { throw std::invalid_argument(""); } 519 } catch(std::invalid_argument &) { 520 std::cerr << "Number of threads must be a positive integer, was " << arg << std::endl; 521 goto usage; 522 } 523 break; 524 case 'q': 525 try { 526 nqueues = stoul(optarg, &len); 527 if(len != arg.size()) { throw std::invalid_argument(""); } 528 } catch(std::invalid_argument &) { 529 std::cerr << "Number of queues must be a positive integer, was " << arg << std::endl; 530 goto usage; 531 } 532 break; 533 // Other cases 534 default: /* ? */ 535 std::cerr << opt << std::endl; 536 usage: 537 std::cerr << "Usage: " << argv[0] << ": [options] -b churn [NNODES] [NSLOTS = NNODES]" << std::endl; 538 std::cerr << " or: " << argv[0] << ": [options] -b pingpong [NNODES]" << std::endl; 539 std::cerr << std::endl; 540 std::cerr << " -d, --duration=DURATION Duration of the experiment, in seconds" << std::endl; 541 std::cerr << " -t, --nthreads=NTHREADS Number of kernel threads" << std::endl; 542 std::cerr << " -q, --nqueues=NQUEUES Number of queues per threads" << std::endl; 543 std::exit(1); 544 } 545 } 546 run: 272 547 273 548 check_cache_line_size(); 274 549 275 550 std::cout << "Running " << nthreads << " threads (" << (nthreads * nqueues) << " queues) for " << duration << " seconds" << std::endl; 276 run(nthreads, nqueues, fill, duration); 277 551 switch(benchmark) { 552 case Churn: 553 runChurn(nthreads, nqueues, duration, nnodes, nslots); 554 break; 555 case PingPong: 556 runPingPong(nthreads, nqueues, duration, nnodes); 557 break; 558 default: 559 abort(); 560 } 278 561 return 0; 279 562 } 280 563 281 template<>282 thread_local relaxed_list<Node>::TLS relaxed_list<Node>::tls = {};283 284 template<>285 relaxed_list<Node>::intrusive_queue_t::stat::Dif relaxed_list<Node>::intrusive_queue_t::stat::dif = {};286 287 564 const char * __my_progname = "Relaxed List"; -
doc/theses/thierry_delisle_PhD/code/relaxed_list.hpp
rb067d9b r9421f3d8 37 37 }; 38 38 39 static inline bool bts(std::atomic_size_t & target, size_t bit ) { 40 //* 41 int result = 0; 42 asm volatile( 43 "LOCK btsq %[bit], %[target]\n\t" 44 :"=@ccc" (result) 45 : [target] "m" (target), [bit] "r" (bit) 46 ); 47 return result != 0; 48 /*/ 49 size_t mask = 1ul << bit; 50 size_t ret = target.fetch_or(mask, std::memory_order_relaxed); 51 return (ret & mask) != 0; 52 //*/ 53 } 54 55 static inline bool btr(std::atomic_size_t & target, size_t bit ) { 56 //* 57 int result = 0; 58 asm volatile( 59 "LOCK btrq %[bit], %[target]\n\t" 60 :"=@ccc" (result) 61 : [target] "m" (target), [bit] "r" (bit) 62 ); 63 return result != 0; 64 /*/ 65 size_t mask = 1ul << bit; 66 size_t ret = target.fetch_and(~mask, std::memory_order_relaxed); 67 return (ret & mask) != 0; 68 //*/ 69 } 39 70 40 71 extern bool enable_stats; … … 48 79 size_t attempt = 0; 49 80 size_t success = 0; 81 size_t mask_attempt = 0; 82 } pop; 83 }; 84 85 struct empty_stat { 86 struct { 87 size_t value = 0; 88 size_t count = 0; 89 } push; 90 struct { 91 size_t value = 0; 92 size_t count = 0; 50 93 } pop; 51 94 }; … … 65 108 public: 66 109 relaxed_list(unsigned numLists) 67 : numNonEmpty{0} 68 , lists(new intrusive_queue_t[numLists]) 110 : lists(new intrusive_queue_t[numLists]) 69 111 , numLists(numLists) 70 {} 112 { 113 assertf(7 * 8 * 8 >= numLists, "List currently only supports 448 sublists"); 114 assert(sizeof(*this) == 128); 115 } 71 116 72 117 ~relaxed_list() { … … 84 129 while(true) { 85 130 // Pick a random list 86 inti = tls.rng.next() % numLists;131 unsigned i = tls.rng.next() % numLists; 87 132 88 133 #ifndef NO_STATS … … 93 138 if( !lists[i].lock.try_lock() ) continue; 94 139 140 __attribute__((unused)) int num = numNonEmpty; 141 95 142 // Actually push it 96 lists[i].push(node, numNonEmpty); 143 if(lists[i].push(node)) { 144 numNonEmpty++; 145 size_t qword = i >> 6ull; 146 size_t bit = i & 63ull; 147 assertf((list_mask[qword] & (1ul << bit)) == 0, "Before set %zu:%zu (%u), %zx & %zx", qword, bit, i, list_mask[qword].load(), (1ul << bit)); 148 __attribute__((unused)) bool ret = bts(list_mask[qword], bit); 149 assert(!ret); 150 assertf((list_mask[qword] & (1ul << bit)) != 0, "After set %zu:%zu (%u), %zx & %zx", qword, bit, i, list_mask[qword].load(), (1ul << bit)); 151 } 97 152 assert(numNonEmpty <= (int)numLists); 98 153 … … 102 157 #ifndef NO_STATS 103 158 tls.pick.push.success++; 159 tls.empty.push.value += num; 160 tls.empty.push.count += 1; 104 161 #endif 105 162 return; … … 108 165 109 166 __attribute__((noinline, hot)) node_t * pop() { 110 while(numNonEmpty != 0) { 111 // Pick two lists at random 112 int i = tls.rng.next() % numLists; 113 int j = tls.rng.next() % numLists; 114 115 #ifndef NO_STATS 116 tls.pick.pop.attempt++; 117 #endif 118 119 // Pick the bet list 120 int w = i; 121 if( __builtin_expect(lists[j].ts() != 0, true) ) { 122 w = (lists[i].ts() < lists[j].ts()) ? i : j; 123 } 124 125 auto & list = lists[w]; 126 // If list looks empty retry 127 if( list.ts() == 0 ) continue; 128 129 // If we can't get the lock retry 130 if( !list.lock.try_lock() ) continue; 131 132 // If list is empty, unlock and retry 133 if( list.ts() == 0 ) { 134 list.lock.unlock(); 135 continue; 136 } 137 138 // Actually pop the list 139 auto node = list.pop(numNonEmpty); 140 assert(node); 141 142 // Unlock and return 143 list.lock.unlock(); 144 assert(numNonEmpty >= 0); 145 #ifndef NO_STATS 146 tls.pick.pop.success++; 147 #endif 148 return node; 149 } 167 #if !defined(NO_BITMASK) 168 // for(int r = 0; r < 10 && numNonEmpty != 0; r++) { 169 // // Pick two lists at random 170 // unsigned i = tls.rng.next() % numLists; 171 // unsigned j = tls.rng.next() % numLists; 172 173 // if(auto node = try_pop(i, j)) return node; 174 // } 175 int nnempty; 176 while(0 != (nnempty = numNonEmpty)) { 177 unsigned i, j; 178 if( numLists < 4 || (numLists / nnempty) < 4 ) { 179 // Pick two lists at random 180 i = tls.rng.next() % numLists; 181 j = tls.rng.next() % numLists; 182 } else { 183 #ifndef NO_STATS 184 // tls.pick.push.mask_attempt++; 185 #endif 186 187 // Pick two lists at random 188 unsigned num = ((numLists - 1) >> 6) + 1; 189 190 unsigned ri = tls.rng.next(); 191 unsigned rj = tls.rng.next(); 192 193 unsigned wdxi = (ri >> 6u) % num; 194 unsigned wdxj = (rj >> 6u) % num; 195 196 size_t maski = list_mask[wdxi].load(std::memory_order_relaxed); 197 size_t maskj = list_mask[wdxj].load(std::memory_order_relaxed); 198 199 if(maski == 0 && maskj == 0) continue; 200 201 unsigned biti = maski ? ri % __builtin_popcountl(maski) : 0; 202 unsigned bitj = maskj ? rj % __builtin_popcountl(maskj) : 0; 203 204 unsigned bi = 64 - nthSetBit(maski, biti + 1); 205 unsigned bj = 64 - nthSetBit(maskj, bitj + 1); 206 207 assertf(bi < 64, "%zu %u %u", maski, biti, bi); 208 assertf(bj < 64, "%zu %u %u", maskj, bitj, bj); 209 210 i = bi | (wdxi << 6); 211 j = bj | (wdxj << 6); 212 213 assertf(i < numLists, "%u", wdxi << 6); 214 assertf(j < numLists, "%u", wdxj << 6); 215 } 216 217 if(auto node = try_pop(i, j)) return node; 218 } 219 #else 220 while(numNonEmpty != 0) { 221 // Pick two lists at random 222 int i = tls.rng.next() % numLists; 223 int j = tls.rng.next() % numLists; 224 225 if(auto node = try_pop(i, j)) return node; 226 } 227 #endif 150 228 151 229 return nullptr; 152 230 } 231 232 private: 233 node_t * try_pop(unsigned i, unsigned j) { 234 #ifndef NO_STATS 235 tls.pick.pop.attempt++; 236 #endif 237 238 // Pick the bet list 239 int w = i; 240 if( __builtin_expect(lists[j].ts() != 0, true) ) { 241 w = (lists[i].ts() < lists[j].ts()) ? i : j; 242 } 243 244 auto & list = lists[w]; 245 // If list looks empty retry 246 if( list.ts() == 0 ) return nullptr; 247 248 // If we can't get the lock retry 249 if( !list.lock.try_lock() ) return nullptr; 250 251 __attribute__((unused)) int num = numNonEmpty; 252 253 // If list is empty, unlock and retry 254 if( list.ts() == 0 ) { 255 list.lock.unlock(); 256 return nullptr; 257 } 258 259 // Actually pop the list 260 node_t * node; 261 bool emptied; 262 std::tie(node, emptied) = list.pop(); 263 assert(node); 264 265 if(emptied) { 266 numNonEmpty--; 267 size_t qword = w >> 6ull; 268 size_t bit = w & 63ull; 269 assert((list_mask[qword] & (1ul << bit)) != 0); 270 __attribute__((unused)) bool ret = btr(list_mask[qword], bit); 271 assert(ret); 272 assert((list_mask[qword] & (1ul << bit)) == 0); 273 } 274 275 // Unlock and return 276 list.lock.unlock(); 277 assert(numNonEmpty >= 0); 278 #ifndef NO_STATS 279 tls.pick.pop.success++; 280 tls.empty.pop.value += num; 281 tls.empty.pop.count += 1; 282 #endif 283 return node; 284 } 153 285 154 286 private: … … 178 310 sentinel_t before; 179 311 sentinel_t after; 180 stat s; 312 #ifndef NO_STATS 313 stat s; 314 #endif 181 315 182 316 static constexpr auto fields_offset = offsetof( node_t, _links ); … … 186 320 , after {{ head(), nullptr }} 187 321 { 188 assert((reinterpret_cast<uintptr_t>( head() ) + fields_offset) == reinterpret_cast<uintptr_t>(&before));189 assert((reinterpret_cast<uintptr_t>( tail() ) + fields_offset) == reinterpret_cast<uintptr_t>(&after ));190 assert(head()->_links.prev == nullptr);191 assert(head()->_links.next == tail() );192 assert(tail()->_links.next == nullptr);193 assert(tail()->_links.prev == head() );194 assert(sizeof(*this) == 128);195 assert((intptr_t(this) % 128) == 0);322 /* paranoid */ assert((reinterpret_cast<uintptr_t>( head() ) + fields_offset) == reinterpret_cast<uintptr_t>(&before)); 323 /* paranoid */ assert((reinterpret_cast<uintptr_t>( tail() ) + fields_offset) == reinterpret_cast<uintptr_t>(&after )); 324 /* paranoid */ assert(head()->_links.prev == nullptr); 325 /* paranoid */ assert(head()->_links.next == tail() ); 326 /* paranoid */ assert(tail()->_links.next == nullptr); 327 /* paranoid */ assert(tail()->_links.prev == head() ); 328 /* paranoid */ assert(sizeof(*this) == 128); 329 /* paranoid */ assert((intptr_t(this) % 128) == 0); 196 330 } 197 331 … … 220 354 } 221 355 222 inline void push(node_t * node, std::atomic_int & nonEmpty) {356 inline bool push(node_t * node) { 223 357 assert(lock); 224 358 assert(node->_links.ts != 0); … … 232 366 prev->_links.next = node; 233 367 tail->_links.prev = node; 234 if(before._links.ts == 0l) {235 nonEmpty += 1;236 before._links.ts = node->_links.ts;237 }238 368 #ifndef NO_STATS 239 369 if(enable_stats) s.diff++; 240 370 #endif 241 } 242 243 inline node_t * pop(std::atomic_int & nonEmpty) { 371 if(before._links.ts == 0l) { 372 before._links.ts = node->_links.ts; 373 assert(node->_links.prev == this->head()); 374 return true; 375 } 376 return false; 377 } 378 379 inline std::pair<node_t *, bool> pop() { 244 380 assert(lock); 245 381 node_t * head = this->head(); … … 248 384 node_t * node = head->_links.next; 249 385 node_t * next = node->_links.next; 250 if(node == tail) return nullptr;386 if(node == tail) return {nullptr, false}; 251 387 252 388 head->_links.next = next; 253 389 next->_links.prev = head; 254 390 391 #ifndef NO_STATS 392 if(enable_stats) s.diff--; 393 #endif 255 394 if(next == tail) { 256 395 before._links.ts = 0l; 257 nonEmpty -= 1;396 return {node, true}; 258 397 } 259 398 else { … … 261 400 before._links.ts = next->_links.ts; 262 401 assert(before._links.ts != 0); 263 } 264 #ifndef NO_STATS 265 if(enable_stats) s.diff--; 266 #endif 267 return node; 402 return {node, false}; 403 } 268 404 } 269 405 … … 277 413 278 414 static __attribute__((aligned(128))) thread_local struct TLS { 279 Random rng = { int(rdtscl()) }; 280 pick_stat pick; 415 Random rng = { int(rdtscl()) }; 416 pick_stat pick; 417 empty_stat empty; 281 418 } tls; 282 419 420 public: 421 std::atomic_int numNonEmpty = 0; // number of non-empty lists 422 std::atomic_size_t list_mask[7] = { 0 }; // which queues are empty 283 423 private: 284 std::atomic_int numNonEmpty; // number of non-empty lists285 424 __attribute__((aligned(64))) std::unique_ptr<intrusive_queue_t []> lists; 286 425 const unsigned numLists; -
doc/theses/thierry_delisle_PhD/code/utils.hpp
rb067d9b r9421f3d8 10 10 #include <unistd.h> 11 11 #include <sys/sysinfo.h> 12 13 #include <x86intrin.h> 12 14 13 15 // Barrier from … … 103 105 return std::chrono::duration_cast<std::chrono::duration<T, Ratio>>(std::chrono::duration<T>(seconds)).count(); 104 106 } 107 108 // from geeksforgeeks 109 const constexpr unsigned num_to_bits[16] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4}; 110 111 /* Recursively get nibble of a given number 112 and map them in the array */ 113 __attribute__((noinline)) unsigned countSetBits(size_t num) { 114 unsigned nibble = 0; 115 if (0 == num) 116 return num_to_bits[0]; 117 118 // Find last nibble 119 nibble = num & 0xf; 120 121 // Use pre-stored values to find count 122 // in last nibble plus recursively add 123 // remaining nibbles. 124 return num_to_bits[nibble] + countSetBits(num >> 4); 125 } 126 127 __attribute__((noinline)) unsigned nthSetBit(size_t mask, unsigned bit) { 128 uint64_t v = mask; // Input value to find position with rank r. 129 unsigned int r = bit;// Input: bit's desired rank [1-64]. 130 unsigned int s; // Output: Resulting position of bit with rank r [1-64] 131 uint64_t a, b, c, d; // Intermediate temporaries for bit count. 132 unsigned int t; // Bit count temporary. 133 134 // Do a normal parallel bit count for a 64-bit integer, 135 // but store all intermediate steps. 136 // a = (v & 0x5555...) + ((v >> 1) & 0x5555...); 137 a = v - ((v >> 1) & ~0UL/3); 138 // b = (a & 0x3333...) + ((a >> 2) & 0x3333...); 139 b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5); 140 // c = (b & 0x0f0f...) + ((b >> 4) & 0x0f0f...); 141 c = (b + (b >> 4)) & ~0UL/0x11; 142 // d = (c & 0x00ff...) + ((c >> 8) & 0x00ff...); 143 d = (c + (c >> 8)) & ~0UL/0x101; 144 145 146 t = (d >> 32) + (d >> 48); 147 // Now do branchless select! 148 s = 64; 149 // if (r > t) {s -= 32; r -= t;} 150 s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8)); 151 t = (d >> (s - 16)) & 0xff; 152 // if (r > t) {s -= 16; r -= t;} 153 s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8)); 154 t = (c >> (s - 8)) & 0xf; 155 // if (r > t) {s -= 8; r -= t;} 156 s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8)); 157 t = (b >> (s - 4)) & 0x7; 158 // if (r > t) {s -= 4; r -= t;} 159 s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8)); 160 t = (a >> (s - 2)) & 0x3; 161 // if (r > t) {s -= 2; r -= t;} 162 s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8)); 163 t = (v >> (s - 1)) & 0x1; 164 // if (r > t) s--; 165 s -= ((t - r) & 256) >> 8; 166 s = 65 - s; 167 return s; 168 } 169 170 // __attribute__((noinline)) unsigned int nth_bit_set(uint64_t value, unsigned int n) 171 // { 172 // uint64_t mask = 0x00000000FFFFFFFFul; 173 // unsigned int size = 32u; 174 // unsigned int base = 0u; 175 176 // if(value == 0) return 0; 177 // assertf(n < (unsigned)__builtin_popcountl(value), "%u < %d (%zu)", n, __builtin_popcountl(value), value); 178 // n++; 179 // while (size > 0) { 180 // const unsigned int count = __builtin_popcountl(value & mask); 181 // if (n > count) { 182 // base += size; 183 // size >>= 1; 184 // mask |= mask << size; 185 // } else { 186 // size >>= 1; 187 // mask >>= size; 188 // } 189 // } 190 191 // return base; 192 // } 193 194 195 static inline uint64_t rotl64 (uint64_t n, unsigned int c) { 196 const unsigned int mask = (8*sizeof(n) - 1); // assumes width is a power of 2. 197 198 c &= mask; 199 return (n<<c) | (n>>( (-c)&mask )); 200 } 201 202 static inline uint64_t rotr64 (uint64_t n, unsigned int c) 203 { 204 const unsigned int mask = (8*sizeof(n) - 1); 205 206 // assert ( (c<=mask) &&"rotate by type width or more"); 207 c &= mask; 208 return (n>>c) | (n<<( (-c)&mask )); 209 }
Note: See TracChangeset
for help on using the changeset viewer.