Skip to content

Crash when KnnSearch and Tune run concurrently #1759

@shadowao

Description

@shadowao

Describe the bug
crash backtrace

#0  _mm512_fmadd_ps(float __vector(16), float __vector(16), float __vector(16)) (__C=..., __B=..., __A=...)
    at /usr/lib/gcc/x86_64-redhat-linux/10/include/avx512fintrin.h:189
189       return __Y;
[Current thread is 1 (Thread 0x147202dff640 (LWP 212647))]
(gdb) bt
#0  _mm512_fmadd_ps(float __vector(16), float __vector(16), float __vector(16)) (__C=..., __B=..., __A=...)
    at /usr/lib/gcc/x86_64-redhat-linux/10/include/avx512fintrin.h:189
#1  vsag::avx512::SQ8ComputeL2Sqr (query=0x3130480, 
    codes=0x14703c074790 "a\266\352\r+\017\375\353\227\345_\017\356\370)\031\272\376\243uFe\275zk\224\241\235\374,\246\326\006\325\307\354}o\236\223:Mzl\203\327\352d\331L\314⎯\037y\204\024\333\317\372<\366\035\215\a\177!ﷴ\334\371\024\323O~Æ\220\205\333\340\213\243}J4\311\\\205\251\034\217e \016Z)\022\367\276f\311g\2722'x\307#\316/\347\216\375\375\266\177\340&$\363\364oL\230\311\371\331T\272ɵ\346q\307\366\237\343|6<@\337ع*\240\256\352R\315\372i\367\0313\223\362j\004\002\005\204\251\001ٜ\336\314V\303\341\365I\361\245\217\"\366\245Q}\353n}\002\367\363AB\337\273\367;x\214߾"..., 
    lower_bound=0x31, diff=0x42, dim=128) at /data/vsag/dev/vsag/src/simd/avx512.cpp:651
#2  0x000014722b1b3296 in vsag::ScalarQuantizer<(vsag::MetricType)0, 8>::ComputeDistImpl (this=0x107b990, computer=..., 
    codes=0x14703c074790 "a\266\352\r+\017\375\353\227\345_\017\356\370)\031\272\376\243uFe\275zk\224\241\235\374,\246\326\006\325\307\354}o\236\223:Mzl\203\327\352d\331L\314⎯\037y\204\024\333\317\372<\366\035\215\a\177!ﷴ\334\371\024\323O~Æ\220\205\333\340\213\243}J4\311\\\205\251\034\217e \016Z)\022\367\276f\311g\2722'x\307#\316/\347\216\375\375\266\177\340&$\363\364oL\230\311\371\331T\272ɵ\346q\307\366\237\343|6<@\337ع*\240\256\352R\315\372i\367\0313\223\362j\004\002\005\204\251\001ٜ\336\314V\303\341\365I\361\245\217\"\366\245Q}\353n}\002\367\363AB\337\273\367;x\214߾"..., 
    dists=0x147058001c50) at /data/vsag/dev/vsag/src/quantization/scalar_quantization/scalar_quantizer.cpp:203
#3  0x000014722b9e677d in vsag::Quantizer<vsag::ScalarQuantizer<(vsag::MetricType)0, 8> >::ComputeDistsBatch4 (this=0x107b990, computer=..., 
    codes1=0x14703c074790 "a\266\352\r+\017\375\353\227\345_\017\356\370)\031\272\376\243uFe\275zk\224\241\235\374,\246\326\006\325\307\354}o\236\223:Mzl\203\327\352d\331L\314⎯\037y\204\024\333\317\372<\366\035\215\a\177!ﷴ\334\371\024\323O~Æ\220\205\333\340\213\243}J4\311\\\205\251\034\217e \016Z)\022\367\276f\311g\2722'x\307#\316/\347\216\375\375\266\177\340&$\363\364oL\230\311\371\331T\272ɵ\346q\307\366\237\343|6<@\337ع*\240\256\352R\315\372i\367\0313\223\362j\004\002\005\204\251\001ٜ\336\314V\303\341\365I\361\245\217\"\366\245Q}\353n}\002\367\363AB\337\273\367;x\214߾"..., 
    codes2=0x14703c0a4b10 "\223\"\016Z\322`\217\307K#\231\r?)9\341f\347\f\204\022\235\b#d\356\210\326*v\033U\356\t\275\266!\016\234U\225\275\001\371\373\321r\204̗\341a\231\354\215,\231", 
    codes3=0x14703c0bae90 "\032\367.z\264\326\017!f\335v\257і~\211j\253\212\255\265\341@%\037˥e4\001z\325wG\024\021bY\356\200i\210\257\373\024]Z\320\031/u߸g\021\031Q\003\361Q\211>Q\2600r\350u\340\252\327\343l\200?\022K\215\367\017\204\266\315D\246\214I\302\370\211\222\254\203\240\214S6\376\365\2559\331#҃\033\363\310Yp[\032\006:-\333\375\215\2257", 
    codes4=0x14703c09e310 "\236\316V+\336x\263\232\037ڲ9\v\241h]\341\265k\236M\210\225I0\353\340\033\373.Xƺ\333\006\033K%\321\006\0375\"\034s\266\307\b\264>\375蝌%\314\027\336\031\230йvL^\332hA\236C\214\026\354\340\016\022Z\332=\366\366\224|\345\023\334\346/\226\222\274\371\301jvX\026Uh\3161\361GM\3128\345\307\1776\234\\\b')9\354\346\307\0348?>EnnS9&\253\234\257\356\\\032n*\2169&\277O$\257s\2314!\035\272\235\234\021|\264\331o\335*D\207Q\262\212\225E\016\213\262\233\002\254\261\344M\203<\220X\r\024|!\205\260\353\310yi\376\227\313\373,\343\345\201\t\300$"..., dists1=@0x147058001c50: 0, 
    dists2=@0x147058001c54: 0, dists3=@0x147058001c58: 0, dists4=@0x147058001c5c: 0)
    at /data/vsag/dev/vsag/src/quantization/quantizer.h:196
#4  0x000014722b9cd306 in vsag::Computer<vsag::ScalarQuantizer<(vsag::MetricType)0, 8> >::ComputeDistsBatch4 (this=0x147058001930, 
    codes1=0x14703c074790 "a\266\352\r+\017\375\353\227\345_\017\356\370)\031\272\376\243uFe\275zk\224\241\235\374,\246\326\006\325\307\354}o\236\223:Mzl\203\327\352d\331L\314⎯\037y\204\024\333\317\372<\366\035\215\a\177!ﷴ\334\371\024\323O~Æ\220\205\333\340\213\243}J4\311\\\205\251\034\217e \016Z)\022\367\276f\311g\2722'x\307#\316/\347\216\375\375\266\177\340&$\363\364oL\230\311\371\331T\272ɵ\346q\307\366\237\343|6<@\337ع*\240\256\352R\315\372i\367\0313\223\362j\004\002\005\204\251\001ٜ\336\314V\303\341\365I\361\245\217\"\366\245Q}\353n}\002\367\363AB\337\273\367;x\214߾"..., 
    codes2=0x14703c0a4b10 "\223\"\016Z\322`\217\307K#\231\r?)9\341f\347\f\204\022\235\b#d\356\210\326*v\033U\356\t\275\266!\016\234U\225\275\001\371\373\321r\204̗\341a\231\354\215,\231", 
    codes3=0x14703c0bae90 "\032\367.z\264\326\017!f\335v\257і~\211j\253\212\255\265\341@%\037˥e4\001z\325wG\024\021bY\356\200i\210\257\373\024]Z\320\031/u߸g\021\031Q\003\361Q\211>Q\2600r\350u\340\252\327\343l\200?\022K\215\367\017\204\266\315D\246\214I\302\370\211\222\254\203\240\214S6\376\365\2559\331#҃\033\363\310Yp[\032\006:-\333\375\215\2257", 
    codes4=0x14703c09e310 "\236\316V+\336x\263\232\037ڲ9\v\241h]\341\265k\236M\210\225I0\353\340\033\373.Xƺ\333\006\033K%\321\006\0375\"\034s\266\307\b\264>\375蝌%\314\027\336\031\230йvL^\332hA\236C\214\026\354\340\016\022Z\332=\366\366\224|\345\023\334\346/\226\222\274\371\301jvX\026Uh\3161\361GM\3128\345\307\1776\234\\\b')9\354\346\307\0348?>EnnS9&\253\234\257\356\\\032n*\2169&\277O$\257s\2314!\035\272\235\234\021|\264\331o\335*D\207Q\262\212\225E\016\213\262\233\002\254\261\344M\203<\220X\r\024|!\205\260\353\310yi\376\227\313\373,\343\345\201\t\300$"..., dists1=@0x147058001c50: 0, 
    dists2=@0x147058001c54: 0, dists3=@0x147058001c58: 0, dists4=@0x147058001c5c: 0)
    at /data/vsag/dev/vsag/src/quantization/computer.h:72
#5  0x000014722b9934dc in vsag::FlattenDataCell<vsag::ScalarQuantizer<(vsag::MetricType)0, 8>, vsag::MemoryBlockIO>::query (this=0x313ca20, 
    result_dists=0x147058001c50, computer=0x147058001930, idx=0x147058001b30, id_count=31, ctx=0x147202dfa870)
    at /data/vsag/dev/vsag/src/datacell/flatten_datacell.h:379
#6  0x000014722b873987 in vsag::FlattenDataCell<vsag::ScalarQuantizer<(vsag::MetricType)0, 8>, vsag::MemoryBlockIO>::Query (this=0x313ca20, 
    result_dists=0x147058001c50, computer=..., idx=0x147058001b30, id_count=31, ctx=0x147202dfa870)
    at /data/vsag/dev/vsag/src/datacell/flatten_datacell.h:54
#7  0x000014722bae60d3 in vsag::BasicSearcher::search_impl<(vsag::InnerSearchMode)1> (this=0xb43e00, graph=..., flatten=..., vl=..., 
    query=0x101b020, inner_search_param=..., iter_ctx=0x147058000f50, ctx=0x147202dfa870)
    at /data/vsag/dev/vsag/src/impl/searcher/basic_searcher.cpp:199
#8  0x000014722bae1d91 in vsag::BasicSearcher::Search (this=0xb43e00, graph=..., flatten=..., vl=..., query=0x101b020, inner_search_param=..., 
    iter_ctx=0x147058000f50, ctx=0x147202dfa870) at /data/vsag/dev/vsag/src/impl/searcher/basic_searcher.cpp:99
#9  0x000014722bb42404 in vsag::HGraph::search_one_graph<(vsag::InnerSearchMode)1> (this=0x1079ae0, query=0x101b020, graph=..., flatten=..., 
    inner_search_param=..., iter_ctx=0x147058000f50, ctx=0x147202dfa870) at /data/vsag/dev/vsag/src/algorithm/hgraph.cpp:1047
#10 0x000014722bb3036d in vsag::HGraph::KnnSearch (this=0x1079ae0, query=..., k=5, parameters=..., filter=..., allocator=0x0, 
    iter_ctx=@0x147202dfb0a8: 0x147058000f50, is_last_filter=false) at /data/vsag/dev/vsag/src/algorithm/hgraph.cpp:911
#11 0x000014722b27d7e6 in vsag::IndexImpl<vsag::HGraph>::KnnSearch (this=0xb50c20, query=..., k=5, parameters=..., filter=..., 
    iter_ctx=@0x147202dfb0a8: 0x147058000f50, is_last_filter=false) at /data/vsag/dev/vsag/src/index/index_impl.h:315
#12 0x0000000000589180 in operator() (__closure=0x147202dfb190) at /data/vsag/dev/vsag/tests/test_hgraph.cpp:2433
#13 0x00000000005894b0 in operator() (__closure=0x3135ef0, is_read=true) at /data/vsag/dev/vsag/tests/test_hgraph.cpp:2454
#14 0x0000000000595f00 in std::__invoke_impl<void, CATCH2_INTERNAL_TEST_206()::<lambda(bool)>, bool>(std::__invoke_other, struct {...} &&) (__f=...)
    at /usr/include/c++/10/bits/invoke.h:60
#15 0x0000000000595ddd in std::__invoke<CATCH2_INTERNAL_TEST_206()::<lambda(bool)>, bool>(struct {...} &&) (__fn=...)

To Reproduce
Codes to reproduce the behavior:

// Reproduction for the crash reported in this issue: KnnSearch running on
// several threads while the main thread calls Tune on the same index.
//
// Phases:
//   1. Generate `op_num` random `dim`-dimensional float vectors.
//   2. Create an "hgraph" index with fp32 base quantization and insert all
//      vectors from the calling thread (single-threaded).
//   3. Start `thread_num` reader threads that loop on KnnSearch, sleep 5 s,
//      then call Tune with the sq8 parameters while the readers may still be
//      running. The reader threads are joined only after Tune returns.
TEST_CASE("HGraph Concurrent Read Write", "[ft][hgraph][tune]") {
  uint32_t op_num = 10000;
  uint32_t dim = 128;
  uint32_t top_k = 5;
  // NOTE(review): read_ratio is not referenced anywhere in this test.
  float read_ratio = 0.8;
  // NOTE(review): declared as float although it is only used as a thread
  // count (reserve() argument and loop bound below) — an integer type looks
  // intended.
  float thread_num = 10;

  // Random input vectors; each component is drawn uniformly from [-10, 10).
  // The seed comes from the clock, so every run uses different data.
  std::vector<std::vector<float>> dataset;
  dataset.reserve(op_num);
  auto seed = std::chrono::high_resolution_clock::now().time_since_epoch().count();
  std::mt19937 rng(seed);
  std::uniform_real_distribution<float> dist(-10.0, 10.0);
  for (uint32_t i = 0; i < op_num; ++i) {
      std::vector<float> vector_data;
      vector_data.reserve(dim);
      for (uint32_t j = 0; j < dim; ++j) {
          vector_data.emplace_back(dist(rng));
      }
      dataset.emplace_back(std::move(vector_data));
  }

  // Parameters passed to every KnnSearch call.
  std::string search_params = R"({
      "hgraph": {
        "ef_search": 100
      }
  })";

  // Parameters used to create the index: fp32 base quantization.
  std::string hgraph_params = R"({
      "dtype": "float32",
      "metric_type": "l2",
      "dim": 128,
      "index_param": {
          "base_quantization_type": "fp32",
          "max_degree": 32,
          "ef_construction": 100,
          "store_raw_vector": true
      }
  })";

  // Parameters passed to Tune later on: same dtype/metric/dim, but with sq8
  // base quantization.
  std::string sq8_params = R"({
      "dtype": "float32",
      "metric_type": "l2",
      "dim": 128,
      "index_param": {
          "base_quantization_type": "sq8",
          "max_degree": 32,
          "ef_construction": 100
      }
  })";
  // NOTE(review): build_res is dereferenced with value() without checking
  // that index creation succeeded.
  auto build_res = vsag::Factory::CreateIndex("hgraph", hgraph_params);
  auto vsag_index = std::move(build_res.value());

  // Global operation counters shared by all worker invocations; each
  // operation claims a slot with fetch_add.
  std::atomic<uint32_t> actual_read_num{0};
  std::atomic<uint32_t> actual_write_num{0};
  uint32_t expect_read_num = 1000000;
  uint32_t expect_write_num = op_num;
  // NOTE(review): is_tune and tune_mutex are declared but never used in this
  // test, so nothing here serializes Tune against the searches.
  std::atomic<bool> is_tune{false};
  std::mutex tune_mutex;

  // Worker body. `is_read` fixes the role for the whole invocation: true
  // runs only searches, false runs only inserts. It loops until the
  // corresponding global counter reaches its expected total.
  auto test_func = [&](bool is_read) {
      // NOTE(review): rd, gen and this dist (which shadows the outer `dist`)
      // are never used after construction; the read/write decision is made
      // solely by `is_read`.
      std::random_device rd;
      std::mt19937 gen(rd());
      std::uniform_real_distribution<float> dist(0.0, 1.0);

      // Per-invocation counters; incremented below but never read.
      uint32_t local_read_num{0};
      uint32_t local_write_num{0};

      // Insert one vector: claims the next write slot and uses the slot
      // number both as the vector id and as the index into `dataset`.
      auto write_func = [&]() {
          uint32_t old_value = actual_write_num.fetch_add(1);
          if (old_value >= expect_write_num) {
              return;
          }

          int64_t vec_id = static_cast<int64_t>(old_value);
          // The dataset points at the local `vec_id` and at storage owned by
          // `dataset`; Owner(false) presumably tells the dataset not to free
          // them — TODO confirm against the vsag Dataset API.
          auto base = vsag::Dataset::Make();
          base->NumElements(1)
              ->Dim(dim)
              ->Ids(&vec_id)
              ->Float32Vectors(dataset[old_value].data())
              ->Owner(false);

          // Add the vector to the index; failures are only logged.
          auto res = vsag_index->Add(base);
          if (!res.has_value()) {
              std::cout << "put error: " << res.error().message << std::endl;
          }

          ++local_write_num;
      };

      // Run one top-k search: claims the next read slot and queries with a
      // vector from `dataset` (slot number modulo op_num).
      auto read_func = [&]() {
          uint32_t old_value = actual_read_num.fetch_add(1);
          if (old_value >= expect_read_num) {
              return;
          }

          auto query = vsag::Dataset::Make();
          query->NumElements(1)
              ->Dim(dim)
              ->Float32Vectors(dataset[old_value%op_num].data())
              ->Owner(false);

          // Do knn search; failures are only logged. This is the call that
          // appears in the crash backtrace (frame #11).
          vsag::IteratorContext* iter_ctx = nullptr;
          auto res = vsag_index->KnnSearch(query, top_k, search_params, nullptr, iter_ctx, false);
          if (!res.has_value()) {
              std::cout << "query error: " << res.error().message << std::endl;
          }
          // KnnSearch takes iter_ctx by reference and may have set it; the
          // test deletes whatever it received.
          if (iter_ctx != nullptr) {
              delete iter_ctx;
          }
          ++local_read_num;
      };

      // Keep issuing operations of the selected kind until the global
      // counter for that kind has reached its target.
      while (true) {
          if (is_read) {
            if (actual_read_num >= expect_read_num) {
              break;
            }
          } else {
            if (actual_write_num >= expect_write_num) {
              break;
            }
          }
          if (is_read) {
              read_func();
          } else {
              write_func();
          }
      }
  };

  // Phase 2: populate the index from the calling thread before any reader
  // thread exists.
  test_func(false);
  std::cout << "write completed" << std::endl;
  
  // Phase 3: start the concurrent readers.
  auto threads = std::make_unique<std::vector<std::thread>>();
  threads->reserve(thread_num);
  for (uint32_t i = 0; i < thread_num ; ++i) {
      threads->emplace_back(test_func, true);
  }
  // Give the readers time to get going, then tune to sq8 while they may
  // still be searching. NOTE(review): the return value of Tune is ignored.
  std::this_thread::sleep_for(std::chrono::milliseconds(5000));
  vsag_index->Tune(sq8_params, true);
  std::cout << "tune completed" << std::endl;

  std::cout << "actual_write_num: " << actual_write_num.load() << std::endl;
  std::cout << "actual_read_num: " << actual_read_num.load() << std::endl;

  // Wait for the reader threads to finish.
  for (auto& thread : *threads) {
      thread.join();
  }
}

Environment
Please run bash scripts/check_environment.sh and paste the output here:

  • OS: Linux
  • vsag version: v0.18.3
  • compiler version: GCC9
  • interface: cpp

Expected behavior
KnnSearch should not crash when it runs concurrently with Tune: each search should either complete successfully or return an error while the index is being tuned.

Screenshots
If applicable, add screenshots to help explain your problem.

Additional context
Add any other context about the problem here.

Metadata

Metadata

Assignees

Labels

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions