Skip to content

Commit 01201c0

Browse files
hashtable: encapsulate as a class, drop module-level globals
The hash table for d=1 clustering was a set of free functions backed by four file-level globals (hash_mask, hash_occupied, hash_values, hash_data). The TODO at the top of hashtable.cc asked for these to be folded into a struct passed by reference, since they are only used from algod1.cc. Replace the free functions and globals with a Hashtable class that owns the three vectors and the mask, exposing the same operations as member functions: allocate, clear, getindex, getnextindex, set_occupied, is_occupied, set_value, compare_value, get_data, set_data The accessor members are marked noexcept: they only perform arithmetic on built-in integer types and unchecked vector operator[] (which does not throw); debug-build bounds checks are kept via assert(). In algod1.cc, hash_insert / hash_check_attach / find_variant_matches and their callers (mark_light_var/_thread, check_heavy_var/_var_2/ _thread, check_variants, network_thread) now take a Hashtable reference. The three thread-runner lambdas capture hash_table by reference. The std::fill on the occupancy buffer between fastidious phases becomes hash_table.clear(); the trailing hash_free() call disappears with RAII. No behavioural change: all 200 tests in swarm-tests pass under DEBUG=1 (asan + ubsan + _GLIBCXX_DEBUG). Co-Authored-By: Florian Filloux <[email protected]>
1 parent d1ac871 commit 01201c0

3 files changed

Lines changed: 141 additions & 149 deletions

File tree

src/algod1.cc

Lines changed: 51 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -195,23 +195,23 @@ namespace {
195195
}
196196

197197

198-
inline auto hash_insert(unsigned int const amp) -> bool {
198+
inline auto hash_insert(Hashtable & hash_table, unsigned int const amp) -> bool {
199199
/* find the first empty bucket */
200200
const auto hash = db_gethash(amp);
201-
auto index = hash_getindex(hash);
201+
auto index = hash_table.getindex(hash);
202202
auto has_duplicate = false;
203-
while (hash_is_occupied(index)) {
204-
auto const is_same_amplicon = hash_compare_value(index, hash) and
205-
check_amp_identical(amp, hash_get_data(index));
203+
while (hash_table.is_occupied(index)) {
204+
auto const is_same_amplicon = hash_table.compare_value(index, hash) and
205+
check_amp_identical(amp, hash_table.get_data(index));
206206
if (is_same_amplicon) {
207207
has_duplicate = true;
208208
}
209-
index = hash_getnextindex(index);
209+
index = hash_table.getnextindex(index);
210210
}
211211

212-
hash_set_occupied(index);
213-
hash_set_value(index, hash);
214-
hash_set_data(index, amp);
212+
hash_table.set_occupied(index);
213+
hash_table.set_value(index, hash);
214+
hash_table.set_data(index, amp);
215215
bloomflex_set(bloom_a, hash);
216216

217217
return has_duplicate;
@@ -347,7 +347,8 @@ namespace {
347347
}
348348

349349

350-
auto hash_check_attach(char * seed_sequence,
350+
auto hash_check_attach(Hashtable const & hash_table,
351+
char * seed_sequence,
351352
unsigned int seed_seqlen,
352353
struct var_s & var,
353354
unsigned int seed,
@@ -357,16 +358,16 @@ namespace {
357358

358359
/* compute hash and corresponding hash table index */
359360
const auto hash = var.hash;
360-
auto index = hash_getindex(hash);
361+
auto index = hash_table.getindex(hash);
361362

362363
/* find matching buckets */
363364

364-
while (hash_is_occupied(index))
365+
while (hash_table.is_occupied(index))
365366
{
366-
if (hash_compare_value(index, hash))
367+
if (hash_table.compare_value(index, hash))
367368
{
368369
/* check that mass is below threshold */
369-
const auto amp = hash_get_data(index);
370+
const auto amp = hash_table.get_data(index);
370371

371372
/* make absolutely sure sequences are identical */
372373
auto const * amp_sequence = db_getsequence(amp);
@@ -377,13 +378,14 @@ namespace {
377378
return true;
378379
}
379380
}
380-
index = hash_getnextindex(index);
381+
index = hash_table.getnextindex(index);
381382
}
382383
return false;
383384
}
384385

385386

386-
inline auto check_heavy_var_2(std::vector<char>& seq,
387+
inline auto check_heavy_var_2(Hashtable const & hash_table,
388+
std::vector<char>& seq,
387389
unsigned int seqlen,
388390
unsigned int seed,
389391
std::vector<struct var_s>& variant_list,
@@ -399,7 +401,7 @@ namespace {
399401

400402
for (auto i = 0U; i < variant_count; ++i) {
401403
if (bloomflex_get(bloom_a, variant_list[i].hash) and
402-
hash_check_attach(seq.data(), seqlen, variant_list[i], seed, graft_state)) {
404+
hash_check_attach(hash_table, seq.data(), seqlen, variant_list[i], seed, graft_state)) {
403405
++matches;
404406
}
405407
}
@@ -408,7 +410,8 @@ namespace {
408410
}
409411

410412

411-
auto check_heavy_var(struct bloomflex_s * bloom,
413+
auto check_heavy_var(Hashtable const & hash_table,
414+
struct bloomflex_s * bloom,
412415
std::vector<char>& varseq,
413416
unsigned int seed,
414417
uint64_t & number_of_matches,
@@ -452,7 +455,8 @@ namespace {
452455
auto varlen = 0U;
453456
generate_variant_sequence(sequence, seqlen,
454457
var, varseq, varlen);
455-
matches += check_heavy_var_2(varseq,
458+
matches += check_heavy_var_2(hash_table,
459+
varseq,
456460
varlen,
457461
seed,
458462
variant_list2,
@@ -466,6 +470,7 @@ namespace {
466470

467471

468472
auto check_heavy_thread(struct Parameters const & parameters,
473+
Hashtable const & hash_table,
469474
int64_t nth_thread,
470475
struct Heavy_state & heavy_state,
471476
struct Graft_state & graft_state,
@@ -500,7 +505,7 @@ namespace {
500505
lock.unlock();
501506
uint64_t number_of_matches {0};
502507
uint64_t number_of_variants {0};
503-
check_heavy_var(bloom_f, buffer1, heavy_amplicon_id,
508+
check_heavy_var(hash_table, bloom_f, buffer1, heavy_amplicon_id,
504509
number_of_matches, number_of_variants,
505510
variant_list, variant_list2,
506511
graft_state);
@@ -511,7 +516,8 @@ namespace {
511516
}
512517

513518

514-
auto mark_light_var(struct bloomflex_s * bloom,
519+
auto mark_light_var(Hashtable & hash_table,
520+
struct bloomflex_s * bloom,
515521
unsigned int seed,
516522
std::vector<struct var_s>& variant_list) -> uint64_t
517523
{
@@ -522,7 +528,7 @@ namespace {
522528
seed is the original seed
523529
*/
524530

525-
hash_insert(seed);
531+
hash_insert(hash_table, seed);
526532

527533
auto const * sequence = db_getsequence(seed);
528534
const auto seqlen = db_getsequencelen(seed);
@@ -538,6 +544,7 @@ namespace {
538544

539545

540546
auto mark_light_thread(struct Parameters const & parameters,
547+
Hashtable & hash_table,
541548
int64_t nth_thread,
542549
struct Light_state & state,
543550
struct Progress_status & progress) -> void
@@ -564,7 +571,7 @@ namespace {
564571
{
565572
progress_update(progress, ++state.progress); // refactoring: separate operations?
566573
lock.unlock();
567-
const auto variant_count = mark_light_var(bloom_f, light_amplicon_id,
574+
const auto variant_count = mark_light_var(hash_table, bloom_f, light_amplicon_id,
568575
variant_list);
569576
lock.lock();
570577
state.variants += variant_count;
@@ -577,6 +584,7 @@ namespace {
577584

578585

579586
inline auto find_variant_matches(struct Parameters const & parameters,
587+
Hashtable const & hash_table,
580588
unsigned int seed,
581589
struct var_s & var,
582590
std::vector<unsigned int>& hits_data,
@@ -588,15 +596,15 @@ namespace {
588596

589597
/* compute hash and corresponding hash table index */
590598

591-
auto index = hash_getindex(var.hash);
599+
auto index = hash_table.getindex(var.hash);
592600

593601
/* find matching buckets */
594602

595-
while (hash_is_occupied(index))
603+
while (hash_table.is_occupied(index))
596604
{
597-
if (hash_compare_value(index, var.hash))
605+
if (hash_table.compare_value(index, var.hash))
598606
{
599-
const auto amp = hash_get_data(index);
607+
const auto amp = hash_table.get_data(index);
600608

601609
/* avoid self */
602610
if (seed != amp) {
@@ -620,12 +628,13 @@ namespace {
620628
}
621629
}
622630
}
623-
index = hash_getnextindex(index);
631+
index = hash_table.getnextindex(index);
624632
}
625633
}
626634

627635

628636
auto check_variants(struct Parameters const & parameters,
637+
Hashtable const & hash_table,
629638
unsigned int seed,
630639
std::vector<struct var_s> & variant_list,
631640
std::vector<unsigned int>& hits_data) -> unsigned int
@@ -640,17 +649,18 @@ namespace {
640649
// C++17 refactoring:
641650
// std::for_each_n(variant_list.begin(), variant_count,
642651
// [seed, &hits_data, &hits_count](auto& variant) {
643-
// find_variant_matches(parameters, seed, variant, hits_data, hits_count);
652+
// find_variant_matches(parameters, hash_table, seed, variant, hits_data, hits_count);
644653
// });
645654
for (auto i = 0U; i < variant_count; ++i) {
646-
find_variant_matches(parameters, seed, variant_list[i], hits_data, hits_count);
655+
find_variant_matches(parameters, hash_table, seed, variant_list[i], hits_data, hits_count);
647656
}
648657

649658
return hits_count;
650659
}
651660

652661

653662
auto network_thread(struct Parameters const & parameters,
663+
Hashtable const & hash_table,
654664
int64_t nth_thread,
655665
struct Network_state & state,
656666
struct Progress_status & progress) -> void
@@ -673,7 +683,7 @@ namespace {
673683

674684
lock.unlock();
675685

676-
const auto hits_count = check_variants(parameters, amp, variant_list, hits_data);
686+
const auto hits_count = check_variants(parameters, hash_table, amp, variant_list, hits_data);
677687
lock.lock();
678688

679689
assert(amp <= std::numeric_limits<std::ptrdiff_t>::max());
@@ -1150,13 +1160,8 @@ auto algo_d1_run(struct Parameters const & parameters) -> void
11501160

11511161

11521162
/* compute hash for all amplicons and store them in a hash table */
1153-
std::vector<unsigned char> hash_occupied_v;
1154-
std::vector<uint64_t> hash_values_v;
1155-
std::vector<unsigned int> hash_data_v;
1156-
const auto hashtablesize = hash_alloc(amplicons,
1157-
hash_occupied_v,
1158-
hash_values_v,
1159-
hash_data_v);
1163+
Hashtable hash_table;
1164+
const auto hashtablesize = hash_table.allocate(amplicons);
11601165
static constexpr unsigned int amplicon_pattern_shift {10};
11611166
static constexpr unsigned int amplicon_n_hash_functions {8};
11621167
struct bloomflex_s bloom_filter;
@@ -1169,7 +1174,7 @@ auto algo_d1_run(struct Parameters const & parameters) -> void
11691174
bool has_duplicate {false};
11701175
for (auto k = 0U; k < amplicons; ++k)
11711176
{
1172-
has_duplicate = hash_insert(k);
1177+
has_duplicate = hash_insert(hash_table, k);
11731178
progress_update(progress, k);
11741179
if (has_duplicate) {
11751180
break;
@@ -1200,8 +1205,8 @@ auto algo_d1_run(struct Parameters const & parameters) -> void
12001205
// refactoring C++14: use std::make_unique
12011206
std::unique_ptr<ThreadRunner> network_tr (new ThreadRunner(
12021207
static_cast<int>(parameters.opt_threads),
1203-
[&parameters, &network_state, &progress](int64_t nth_thread) {
1204-
network_thread(parameters, nth_thread, network_state, progress);
1208+
[&parameters, &hash_table, &network_state, &progress](int64_t nth_thread) {
1209+
network_thread(parameters, hash_table, nth_thread, network_state, progress);
12051210
}));
12061211
network_tr->run();
12071212
}
@@ -1448,7 +1453,7 @@ auto algo_d1_run(struct Parameters const & parameters) -> void
14481453
/* Empty the old hash and bloom filter
14491454
before we reinsert only the light swarm amplicons */
14501455

1451-
std::fill(hash_occupied_v.begin(), hash_occupied_v.end(), 0U);
1456+
hash_table.clear();
14521457
bloomflex_zap(bloom_filter);
14531458

14541459
progress_init(progress, "Adding light swarm amplicons to Bloom filter",
@@ -1465,8 +1470,8 @@ auto algo_d1_run(struct Parameters const & parameters) -> void
14651470
// refactoring C++14: use std::make_unique
14661471
std::unique_ptr<ThreadRunner> light_tr (new ThreadRunner(
14671472
static_cast<int>(parameters.opt_threads),
1468-
[&parameters, &light_state, &progress](int64_t nth_thread) {
1469-
mark_light_thread(parameters, nth_thread, light_state, progress);
1473+
[&parameters, &hash_table, &light_state, &progress](int64_t nth_thread) {
1474+
mark_light_thread(parameters, hash_table, nth_thread, light_state, progress);
14701475
}));
14711476
light_tr->run();
14721477
}
@@ -1492,8 +1497,8 @@ auto algo_d1_run(struct Parameters const & parameters) -> void
14921497
// refactoring C++14: use std::make_unique
14931498
std::unique_ptr<ThreadRunner> heavy_tr (new ThreadRunner(
14941499
static_cast<int>(parameters.opt_threads),
1495-
[&parameters, &heavy_state, &graft_state, &progress](int64_t nth_thread) {
1496-
check_heavy_thread(parameters, nth_thread, heavy_state, graft_state, progress);
1500+
[&parameters, &hash_table, &heavy_state, &graft_state, &progress](int64_t nth_thread) {
1501+
check_heavy_thread(parameters, hash_table, nth_thread, heavy_state, graft_state, progress);
14971502
}));
14981503
heavy_tr->run();
14991504
}
@@ -1523,7 +1528,6 @@ auto algo_d1_run(struct Parameters const & parameters) -> void
15231528
std::fprintf(parameters.logfile, "Max generations: %u\n", maxgen);
15241529

15251530
bloomflex_exit(bloom_filter);
1526-
hash_free();
15271531

15281532
swarminfo = nullptr;
15291533
ampinfo = nullptr;

0 commit comments

Comments
 (0)