Skip to content

Commit 8be29ae

Browse files
committed
feat(mtr): hotfix mtr for schema embedding
Fixes the assertion failure at trx0sys.cc:680: `trx->state.load(std::memory_order_relaxed) == TRX_STATE_NOT_STARTED`. ha_pre_dd_shutdown() calls plugin_foreach(MYSQL_STORAGE_ENGINE_PLUGIN, ...), so plugin registration order determines call order. InnoDB is a built-in plugin registered before Shannon Rapid, so the actual sequence is:
```
ha_pre_dd_shutdown()
→ innodb_pre_dd_shutdown() ← InnoDB fires FIRST
→ srv_pre_dd_shutdown()
→ trx_sys_before_pre_dd_shutdown_validate() ← assertion here
→ rapid_pre_dd_shutdown() ← Rapid fires SECOND
→ EmbeddingManager::shutdown() ← too late
```
The fix is to call EmbeddingManager::shutdown() from clean_up() in sql/mysqld.cc before ha_pre_dd_shutdown(), so the embedding threads are stopped before InnoDB validates its transaction state.
1 parent 6d790b1 commit 8be29ae

9 files changed

Lines changed: 40 additions & 94 deletions

File tree

.github/workflows/nightly.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ jobs:
109109
sudo chown -R $USER:$USER /home/shannon-bin/
110110
cd /home/shannon-bin/mysql-test/
111111
sudo chmod -R u+rwx mysql-test-run.pl
112-
./mysql-test-run.pl --mysqld=--loose-rapid_schema_embedding=OFF --suite=main,innodb,federated,rpl,rpl_ndb,rpl_gtid,rpl_nogtid,funcs_1,funcs_2,information_schema,sys_vars,opt_trace,secondary_engine,ml \
112+
./mysql-test-run.pl --mysqld=--loose-rapid_schema_embedding=OFF --suite=main,innodb,innodb_zip,federated,rpl,rpl_ndb,rpl_gtid,rpl_nogtid,funcs_1,funcs_2,information_schema,sys_vars,opt_trace,secondary_engine,ml \
113113
--big-test --mysqld=--user=$USER --mysqld=--default-storage-engine=innodb --nowarnings --force --nocheck-testcases --retry=3 --parallel=$(nproc)
114114
# binlog,binlog_gtid,binlog_nogtid, due to disk space limition on github runner, when we have a fast git action runner, we can use the following command to run the test.
115115
#sudo chmod +x ./collections/default.push && ./collections/default.daily
@@ -171,7 +171,7 @@ jobs:
171171
sudo chown -R $USER:$USER /home/shannon-bin/
172172
cd /home/shannon-bin/mysql-test/
173173
sudo chmod -R u+rwx mysql-test-run.pl
174-
./mysql-test-run.pl --mysqld=--loose-rapid_schema_embedding=OFF --suite=main,innodb,binlog,binlog_gtid,binlog_nogtid,secondary_engine,ml \
174+
./mysql-test-run.pl --mysqld=--loose-rapid_schema_embedding=OFF --suite=main,innodb,innodb_zip,binlog,binlog_gtid,binlog_nogtid,secondary_engine,ml \
175175
--big-test --mysqld=--user=$USER --mysqld=--default-storage-engine=innodb --nowarnings --force --nocheck-testcases --skip-test-list=skip-tests.arm --retry=3 --parallel=$(nproc)
176176
# when we have a fast git action runner, we can use the following command to run the test.
177177
#sudo chmod +x ./collections/default.push && ./collections/default.daily

.github/workflows/pull-requests.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ jobs:
125125
export ASAN_OPTIONS="detect_leaks=1"
126126
export LSAN_OPTIONS="suppressions=/home/shannon-bin/mysql-test/lsan.supp"
127127
./mysql-test-run.pl --mysqld=--loose-rapid_schema_embedding=OFF --sanitize --suite=main,innodb,binlog,binlog_gtid,binlog_nogtid,federated,rpl,rpl_gtid,rpl_nogtid,funcs_1,funcs_2,\
128-
information_schema,secondary_engine,ml,sys_vars \
128+
information_schema,secondary_engine,ml,sys_vars,innodb_zip \
129129
--mysqld=--user=$USER --mysqld=--default-storage-engine=innodb --nowarnings --force --nocheck-testcases --retry=3 --parallel=$(nproc)
130130
# when we have a fast git action runner, we can use the following command to run the test.
131131
# sudo chmod -R u+rwx mysql-test-run.pl && sudo chmod +x ./collections/default.push
@@ -190,7 +190,7 @@ jobs:
190190
sudo chmod -R u+rwx mysql-test-run.pl
191191
export ASAN_OPTIONS="detect_leaks=1"
192192
export LSAN_OPTIONS="suppressions=/home/shannon-bin/mysql-test/lsan.supp"
193-
./mysql-test-run.pl --mysqld=--loose-rapid_schema_embedding=OFF --sanitize --suite=main,innodb,secondary_engine,ml,sys_vars \
193+
./mysql-test-run.pl --mysqld=--loose-rapid_schema_embedding=OFF --sanitize --suite=main,innodb,innodb_zip,secondary_engine,ml,sys_vars \
194194
--mysqld=--user=$USER --mysqld=--default-storage-engine=innodb --nowarnings --force --nocheck-testcases --retry=3 --parallel=$(nproc)
195195
# when we have a fast git action runner, we can use the following command to run the test.
196196
# sudo chmod -R u+rwx mysql-test-run.pl && sudo chmod +x ./collections/default.push

ml/ml_retrieve_schema_metadata.cpp

Lines changed: 24 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -71,11 +71,6 @@ std::atomic<embedding_state_t> EmbeddingManager::m_state{embedding_state_t::EMBE
7171
std::condition_variable EmbeddingManager::m_manager_cv;
7272
std::mutex EmbeddingManager::m_manager_mutex;
7373

74-
std::condition_variable EmbeddingManager::m_fully_stopped_cv;
75-
std::mutex EmbeddingManager::m_fully_stopped_mutex;
76-
bool EmbeddingManager::m_fully_stopped{true};
77-
std::atomic<bool> EmbeddingManager::m_shutdown_initiated{false};
78-
7974
struct ScopedInternalTHD {
8075
THD *thd{nullptr};
8176

@@ -662,7 +657,6 @@ static void *embedding_table_worker_func(void *arg) {
662657
}
663658

664659
DBUG_PRINT("ml", ("ML TableWorker [%s]: exiting", ctx->key.c_str()));
665-
mgr->on_thread_exiting();
666660
return nullptr;
667661
}
668662

@@ -824,12 +818,11 @@ static void *embedding_manager_func(void *arg) {
824818
}
825819

826820
DBUG_PRINT("ml", ("ML EmbeddingManager: event loop finished, exiting."));
827-
mgr->on_thread_exiting();
828821
return nullptr;
829822
}
830823

831824
void EmbeddingManager::start_impl() {
832-
if (!m_initialized.load() || m_shutdown_initiated.load(std::memory_order_acquire)) return;
825+
if (!m_initialized.load()) return;
833826
embedding_state_t expected = embedding_state_t::EMBEDDING_STATE_EXIT;
834827
if (!m_state.compare_exchange_strong(expected, embedding_state_t::EMBEDDING_STATE_RUN, std::memory_order_acq_rel,
835828
std::memory_order_acquire)) {
@@ -840,13 +833,6 @@ void EmbeddingManager::start_impl() {
840833
}
841834
}
842835

843-
{
844-
std::lock_guard<std::mutex> lk(m_fully_stopped_mutex);
845-
m_fully_stopped = false;
846-
}
847-
848-
m_active_thread_count.fetch_add(1, std::memory_order_relaxed);
849-
850836
{
851837
std::lock_guard<std::mutex> lk(m_embedder_mutex);
852838
m_embedder = std::make_unique<ML_embedding_row>();
@@ -862,15 +848,36 @@ void EmbeddingManager::start_impl() {
862848
if (my_thread_create(&m_manager_thread, &attr, embedding_manager_func, this) != 0) {
863849
sql_print_error("[EmbeddingManager] start: failed to create coordinator thread");
864850
my_thread_attr_destroy(&attr);
865-
m_active_thread_count.fetch_sub(1, std::memory_order_relaxed);
866851
m_state.store(embedding_state_t::EMBEDDING_STATE_STOP, std::memory_order_release);
867852
return;
868853
}
869854
my_thread_attr_destroy(&attr);
870855
}
871856

857+
void EmbeddingManager::shutdown_impl() {
858+
initiate_shutdown_impl();
859+
860+
if (m_manager_thread.thread != 0) {
861+
my_thread_join(&m_manager_thread, nullptr);
862+
m_manager_thread = {};
863+
}
864+
865+
{
866+
std::lock_guard<std::mutex> lk(m_table_workers_mutex);
867+
for (auto &[key, ctx] : m_table_workers) {
868+
if (ctx->thread.thread != 0) {
869+
my_thread_join(&ctx->thread, nullptr);
870+
ctx->thread = {};
871+
}
872+
}
873+
m_table_workers.clear();
874+
}
875+
876+
m_state.store(embedding_state_t::EMBEDDING_STATE_EXIT, std::memory_order_release);
877+
m_initialized.store(false);
878+
}
879+
872880
void EmbeddingManager::initiate_shutdown_impl() {
873-
m_shutdown_initiated.store(true, std::memory_order_release);
874881
embedding_state_t expected = embedding_state_t::EMBEDDING_STATE_RUN;
875882
if (!m_state.compare_exchange_strong(expected, embedding_state_t::EMBEDDING_STATE_STOP, std::memory_order_acq_rel,
876883
std::memory_order_acquire)) {
@@ -907,55 +914,6 @@ void EmbeddingManager::initiate_shutdown_impl() {
907914
m_manager_cv.notify_all();
908915
}
909916

910-
void EmbeddingManager::on_thread_exiting() {
911-
int prev = m_active_thread_count.fetch_sub(1, std::memory_order_acq_rel);
912-
if (prev == 1) {
913-
std::lock_guard<std::mutex> lk(m_fully_stopped_mutex);
914-
m_fully_stopped = true;
915-
m_fully_stopped_cv.notify_all();
916-
DBUG_PRINT("ml", ("ML EmbeddingManager: all threads exited — shutdown gate open."));
917-
}
918-
}
919-
920-
void EmbeddingManager::initiate_shutdown() {
921-
auto *mgr = instance();
922-
if (mgr && EmbeddingManager::is_running()) mgr->initiate_shutdown_impl();
923-
}
924-
925-
bool EmbeddingManager::wait_until_fully_stopped(std::chrono::milliseconds timeout) {
926-
std::unique_lock<std::mutex> lk(m_fully_stopped_mutex);
927-
return m_fully_stopped_cv.wait_for(lk, timeout, [] { return m_fully_stopped; });
928-
}
929-
930-
void EmbeddingManager::shutdown_impl() {
931-
initiate_shutdown_impl();
932-
933-
if (m_manager_thread.thread != 0) {
934-
my_thread_join(&m_manager_thread, nullptr);
935-
m_manager_thread = {};
936-
}
937-
938-
{
939-
std::lock_guard<std::mutex> lk(m_table_workers_mutex);
940-
for (auto &[key, ctx] : m_table_workers) {
941-
if (ctx->thread.thread != 0) {
942-
my_thread_join(&ctx->thread, nullptr);
943-
ctx->thread = {};
944-
}
945-
}
946-
m_table_workers.clear();
947-
}
948-
949-
{
950-
std::lock_guard<std::mutex> lk(m_fully_stopped_mutex);
951-
m_fully_stopped = true;
952-
}
953-
m_fully_stopped_cv.notify_all();
954-
955-
m_state.store(embedding_state_t::EMBEDDING_STATE_EXIT, std::memory_order_release);
956-
m_initialized.store(false);
957-
}
958-
959917
/**
960918
* When the pool is below MAX_WORKER_THREADS, spawn a new dedicated worker for
961919
* this (schema, table) key. Once the pool is full, return the existing worker
@@ -977,13 +935,11 @@ TableWorkerContext *EmbeddingManager::get_or_create_worker(const std::string &ke
977935
my_thread_attr_t attr;
978936
my_thread_attr_init(&attr);
979937
my_thread_attr_setdetachstate(&attr, MY_THREAD_CREATE_JOINABLE);
980-
m_active_thread_count.fetch_add(1, std::memory_order_relaxed);
981938
int rc = my_thread_create(&ctx->thread, &attr, embedding_table_worker_func, ctx.get());
982939
my_thread_attr_destroy(&attr);
983940

984941
if (rc != 0) {
985942
DBUG_PRINT("ml", ("ML EmbeddingManager: failed to spawn worker for %s", key.c_str()));
986-
m_active_thread_count.fetch_sub(1, std::memory_order_relaxed);
987943
return nullptr;
988944
}
989945

ml/ml_retrieve_schema_metadata.h

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -152,10 +152,6 @@ class EmbeddingManager {
152152
instance()->shutdown_impl();
153153
}
154154

155-
static void initiate_shutdown();
156-
157-
static bool wait_until_fully_stopped(std::chrono::milliseconds timeout = std::chrono::seconds(30));
158-
159155
bool initialized() const { return m_initialized.load(); }
160156

161157
static inline bool is_running() noexcept {
@@ -192,10 +188,6 @@ class EmbeddingManager {
192188

193189
std::atomic<THD *> m_current_thd{nullptr}; // THD inside open_and_lock_tables
194190

195-
static std::condition_variable m_fully_stopped_cv;
196-
static std::mutex m_fully_stopped_mutex;
197-
static bool m_fully_stopped;
198-
199191
TableWorkerContext *get_or_create_worker(const std::string &key);
200192
void consume_results(THD *thd, TABLE *schema_embedding_table_ptr);
201193
void on_thread_exiting();
@@ -212,11 +204,9 @@ class EmbeddingManager {
212204

213205
my_thread_handle m_manager_thread{};
214206
std::atomic<bool> m_initialized{false};
215-
std::atomic<int> m_active_thread_count{0};
216207

217208
static std::once_flag s_once;
218209
static EmbeddingManager *s_instance;
219-
static std::atomic<bool> m_shutdown_initiated;
220210
};
221211

222212
void shannon_ml_on_ddl_event(const DDLEvent &event);

mysql-test/suite/innodb_zip/r/16k.result

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ AND t.name LIKE 'mysql%'
2121
AND t.name NOT LIKE 'mysql/ndb_binlog_index'
2222
ORDER BY t.name, i.index_id;
2323
table_name n_cols table_flags index_name root_page type n_fields merge_threshold
24+
mysql/schema_embeddings 10 161 PRIMARY 41 3 9 50
25+
mysql/schema_embeddings 10 161 unique_schema_table 42 2 3 50
2426
CREATE TABLE t1 (a INT KEY, b TEXT) ROW_FORMAT=REDUNDANT ENGINE=innodb;
2527
CREATE TABLE t2 (a INT KEY, b TEXT) ROW_FORMAT=COMPACT ENGINE=innodb;
2628
CREATE TABLE t3 (a INT KEY, b TEXT) ROW_FORMAT=COMPRESSED ENGINE=innodb;

mysql-test/suite/innodb_zip/r/4k.result

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ AND t.name LIKE 'mysql%'
2121
AND t.name NOT LIKE 'mysql/ndb_binlog_index'
2222
ORDER BY t.name, i.index_id;
2323
table_name n_cols table_flags index_name root_page type n_fields merge_threshold
24+
mysql/schema_embeddings 10 161 PRIMARY 138 3 9 50
25+
mysql/schema_embeddings 10 161 unique_schema_table 139 2 3 50
2426
CREATE TABLE t1 (a INT KEY, b TEXT) ROW_FORMAT=REDUNDANT ENGINE=innodb;
2527
CREATE TABLE t2 (a INT KEY, b TEXT) ROW_FORMAT=COMPACT ENGINE=innodb;
2628
CREATE TABLE t3 (a INT KEY, b TEXT) ROW_FORMAT=COMPRESSED ENGINE=innodb;

mysql-test/suite/innodb_zip/r/8k.result

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ AND t.name LIKE 'mysql%'
2121
AND t.name NOT LIKE 'mysql/ndb_binlog_index'
2222
ORDER BY t.name, i.index_id;
2323
table_name n_cols table_flags index_name root_page type n_fields merge_threshold
24+
mysql/schema_embeddings 10 161 PRIMARY 73 3 9 50
25+
mysql/schema_embeddings 10 161 unique_schema_table 74 2 3 50
2426
CREATE TABLE t1 (a INT KEY, b TEXT) ROW_FORMAT=REDUNDANT ENGINE=innodb;
2527
CREATE TABLE t2 (a INT KEY, b TEXT) ROW_FORMAT=COMPACT ENGINE=innodb;
2628
CREATE TABLE t3 (a INT KEY, b TEXT) ROW_FORMAT=COMPRESSED ENGINE=innodb;

sql/mysqld.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -993,6 +993,7 @@ MySQL clients support the protocol:
993993
#include "sql/server_component/persistent_dynamic_loader_imp.h"
994994
#include "sql/srv_session.h"
995995

996+
#include "ml/ml_retrieve_schema_metadata.h" // ml::Schema_manager
996997
using mysql::binlog::event::enum_binlog_checksum_alg;
997998
using std::max;
998999
using std::min;
@@ -2713,6 +2714,8 @@ static void clean_up(bool print_message) {
27132714
authentication_policy::deinit();
27142715
denit_command_maps();
27152716

2717+
ShannonBase::ML::EmbeddingManager::shutdown();
2718+
27162719
ha_pre_dd_shutdown();
27172720
dd::shutdown();
27182721

storage/rapid_engine/handler/ha_shannon_rapid.cc

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1943,18 +1943,9 @@ static handler *rapid_create_handler(handlerton *hton, TABLE_SHARE *table_share,
19431943

19441944
static void rapid_pre_dd_shutdown(handlerton *) {
19451945
auto *mgr = ShannonBase::ML::EmbeddingManager::instance();
1946-
if ((!mgr || !mgr->initialized()) || !ShannonBase::ML::EmbeddingManager::is_running())
1947-
return; // already stopped or never started
1948-
1949-
ShannonBase::ML::EmbeddingManager::initiate_shutdown();
1950-
1951-
constexpr auto kTimeout = std::chrono::seconds(60);
1952-
if (!ShannonBase::ML::EmbeddingManager::wait_until_fully_stopped(kTimeout)) {
1953-
sql_print_warning(
1954-
"[EmbeddingManager] shannon_ml_pre_dd_shutdown: "
1955-
"threads did not stop within 60 s — proceeding anyway.");
1956-
}
1946+
if ((!mgr || !mgr->initialized())) return;
19571947

1948+
ShannonBase::ML::EmbeddingManager::shutdown();
19581949
DBUG_PRINT("ml", ("ML EmbeddingManager: shannon_ml_pre_dd_shutdown — all threads stopped."));
19591950
}
19601951

@@ -1963,7 +1954,7 @@ static void rapid_pre_dd_shutdown(handlerton *) {
19631954
@retval 0 always */
19641955
static int rapid_shutdown(handlerton *, ha_panic_function) {
19651956
DBUG_TRACE;
1966-
// embedding worker thread.
1957+
// embedding worker thread shut down. Idempotent operation.
19671958
ShannonBase::ML::EmbeddingManager::shutdown();
19681959

19691960
// recovery worker

0 commit comments

Comments
 (0)