diff --git a/runtime/components/embedding_lookup/BUILD b/runtime/components/embedding_lookup/BUILD index ba5f480f6..a0360c267 100644 --- a/runtime/components/embedding_lookup/BUILD +++ b/runtime/components/embedding_lookup/BUILD @@ -28,6 +28,7 @@ cc_library( deps = [ "@com_google_absl//absl/status", "@com_google_absl//absl/types:span", + "@litert//litert/cc:litert_expected", ] + select({ "@litert//litert:litert_link_capi_so": [ "@litert//litert/cc:litert_api_with_dynamic_runtime", @@ -51,6 +52,8 @@ cc_library( "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:string_view", "@com_google_absl//absl/types:span", + "@litert//litert/c:litert_common", + "@litert//litert/cc:litert_expected", "//runtime/util:litert_status_util", ] + select({ "@litert//litert:litert_link_capi_so": [ @@ -152,6 +155,8 @@ cc_library( "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings:string_view", "@com_google_absl//absl/types:span", + "@litert//litert/c:litert_common", + "@litert//litert/cc:litert_expected", "@litert//litert/cc:litert_macros", "//runtime/executor:llm_executor_io_types", "//runtime/util:litert_status_util", diff --git a/runtime/components/embedding_lookup/embedding_lookup.h b/runtime/components/embedding_lookup/embedding_lookup.h index 1116caa5e..6d726960c 100644 --- a/runtime/components/embedding_lookup/embedding_lookup.h +++ b/runtime/components/embedding_lookup/embedding_lookup.h @@ -22,6 +22,7 @@ #include "absl/status/status.h" // from @com_google_absl #include "absl/types/span.h" // from @com_google_absl +#include "litert/cc/litert_expected.h" // from @litert #include "litert/cc/litert_tensor_buffer.h" // from @litert namespace litert::lm { @@ -69,6 +70,9 @@ class EmbeddingLookup { virtual absl::Status LookupPrefill(absl::Span tokens, litert::TensorBuffer* output_tensor, size_t byte_offset) = 0; + + // Returns whether the embedding lookup compiled model is fully accelerated. 
+ virtual litert::Expected<bool> IsFullyAccelerated() = 0; }; } // namespace litert::lm diff --git a/runtime/components/embedding_lookup/embedding_lookup_end_of_multi_modal.cc b/runtime/components/embedding_lookup/embedding_lookup_end_of_multi_modal.cc index 308d542e4..43be20c24 100644 --- a/runtime/components/embedding_lookup/embedding_lookup_end_of_multi_modal.cc +++ b/runtime/components/embedding_lookup/embedding_lookup_end_of_multi_modal.cc @@ -154,13 +154,13 @@ EndOfMultiModalEmbedding::Create(litert::Environment& env, const litert::Model* absl_nonnull model, int special_token) { auto handler = std::unique_ptr<EndOfMultiModalEmbedding>( - new EndOfMultiModalEmbedding(env, model, special_token)); + new EndOfMultiModalEmbedding(env, special_token)); RETURN_IF_ERROR( // IWYU pragma: keep as is included by status_macros.h - handler->Initialize()); + handler->Initialize(*model)); return handler; } -absl::Status EndOfMultiModalEmbedding::Initialize() { +absl::Status EndOfMultiModalEmbedding::Initialize(const Model& model) { LITERT_ASSIGN_OR_RETURN(auto options, Options::Create()); #if defined(__ANDROID__) options.SetHardwareAccelerators(litert::HwAccelerators::kNpu | @@ -179,8 +179,8 @@ absl::Status EndOfMultiModalEmbedding::Initialize() { LITERT_ASSIGN_OR_RETURN( litert::CompiledModel compiled_model, - litert::CompiledModel::Create(env_, model_.Get(), options)); - if (auto num_signatures = model_.GetNumSignatures(); num_signatures != 1) { + litert::CompiledModel::Create(env_, model.Get(), options)); + if (auto num_signatures = model.GetNumSignatures(); num_signatures != 1) { return absl::InvalidArgumentError(absl::StrCat( "The Embedding model must have exactly one signature but got ", num_signatures)); @@ -237,6 +237,10 @@ absl::Status EndOfMultiModalEmbedding::Initialize() { size_t bytes = end_of_multi_modal_embedding_.size() * sizeof(float); output_buffers[0].Read(absl::MakeSpan(data_ptr, bytes)); + if (auto res = compiled_model.IsFullyAccelerated(); res.HasValue()) { + is_fully_accelerated_ = 
*res; + } + return absl::OkStatus(); } diff --git a/runtime/components/embedding_lookup/embedding_lookup_end_of_multi_modal.h b/runtime/components/embedding_lookup/embedding_lookup_end_of_multi_modal.h index 09d957c86..4f8b8e390 100644 --- a/runtime/components/embedding_lookup/embedding_lookup_end_of_multi_modal.h +++ b/runtime/components/embedding_lookup/embedding_lookup_end_of_multi_modal.h @@ -19,7 +19,6 @@ #include #include -#include #include #include "absl/base/nullability.h" // from @com_google_absl @@ -29,7 +28,6 @@ #include "litert/cc/litert_environment.h" // from @litert #include "litert/cc/litert_layout.h" // from @litert #include "litert/cc/litert_model.h" // from @litert -#include "litert/cc/litert_options.h" // from @litert #include "litert/cc/litert_tensor_buffer.h" // from @litert #include "runtime/components/embedding_lookup/embedding_lookup.h" @@ -74,20 +72,19 @@ class EndOfMultiModalEmbedding : public EmbeddingLookup { litert::TensorBuffer* prefill_output, size_t byte_offset) override; + litert::Expected IsFullyAccelerated() override { + return is_fully_accelerated_; + } + protected: - EndOfMultiModalEmbedding(litert::Environment& env, - const litert::Model* absl_nonnull model, - int special_token) - : env_(env), model_(*model), special_token_(special_token) {} + EndOfMultiModalEmbedding(litert::Environment& env, int special_token) + : env_(env), special_token_(special_token) {} // Loads the provided model. This must be called before Lookup functions. - absl::Status Initialize(); + absl::Status Initialize(const litert::Model& model); // The environment for the embedding lookup. litert::Environment& env_; - // The model for the embedding lookup. The actual model instance is owned by - // the model resources. - const litert::Model& model_; // The layout of the output tensor from the embedding model. 
litert::Layout output_buffer_layout_; @@ -99,6 +96,8 @@ class EndOfMultiModalEmbedding : public EmbeddingLookup { // Contains the end of multi-modal embedding that was looked up from the // model. std::vector end_of_multi_modal_embedding_; + + bool is_fully_accelerated_ = false; }; } // namespace litert::lm diff --git a/runtime/components/embedding_lookup/embedding_lookup_manager.cc b/runtime/components/embedding_lookup/embedding_lookup_manager.cc index 7d90c6289..95f4fd7c3 100644 --- a/runtime/components/embedding_lookup/embedding_lookup_manager.cc +++ b/runtime/components/embedding_lookup/embedding_lookup_manager.cc @@ -29,7 +29,9 @@ #include "absl/status/statusor.h" // from @com_google_absl #include "absl/strings/string_view.h" // from @com_google_absl #include "absl/types/span.h" // from @com_google_absl +#include "litert/c/litert_common.h" // from @litert #include "litert/cc/litert_environment.h" // from @litert +#include "litert/cc/litert_expected.h" // from @litert #include "litert/cc/litert_macros.h" // from @litert #include "litert/cc/litert_model.h" // from @litert #include "litert/cc/litert_tensor_buffer.h" // from @litert @@ -265,4 +267,21 @@ absl::Status EmbeddingLookupManager::Initialize( return absl::OkStatus(); } +litert::Expected EmbeddingLookupManager::IsFullyAccelerated() const { + if (text_embedding_lookup_ == nullptr) { + return litert::Error(kLiteRtStatusErrorRuntimeFailure, + "Text embedding lookup has not been created."); + } + if (auto res = text_embedding_lookup_->IsFullyAccelerated(); + !res.HasValue() || !*res) { + return res; + } + for (const auto& lookup : end_of_multi_modal_embedding_lookups_) { + if (auto res = lookup->IsFullyAccelerated(); !res.HasValue() || !*res) { + return res; + } + } + return true; +} + } // namespace litert::lm diff --git a/runtime/components/embedding_lookup/embedding_lookup_manager.h b/runtime/components/embedding_lookup/embedding_lookup_manager.h index 1e63d4695..43da8f7bf 100644 --- 
a/runtime/components/embedding_lookup/embedding_lookup_manager.h +++ b/runtime/components/embedding_lookup/embedding_lookup_manager.h @@ -27,6 +27,7 @@ #include "absl/status/statusor.h" // from @com_google_absl #include "absl/types/span.h" // from @com_google_absl #include "litert/cc/litert_environment.h" // from @litert +#include "litert/cc/litert_expected.h" // from @litert #include "litert/cc/litert_model.h" // from @litert #include "litert/cc/litert_tensor_buffer.h" // from @litert #include "runtime/components/embedding_lookup/embedding_lookup_end_of_multi_modal.h" @@ -116,6 +117,8 @@ class EmbeddingLookupManager { return text_embedding_lookup_.get(); } + litert::Expected IsFullyAccelerated() const; + protected: absl::Status Initialize( litert::Environment& env, diff --git a/runtime/components/embedding_lookup/embedding_lookup_multi_modal.h b/runtime/components/embedding_lookup/embedding_lookup_multi_modal.h index 5d833c83e..89fa11075 100644 --- a/runtime/components/embedding_lookup/embedding_lookup_multi_modal.h +++ b/runtime/components/embedding_lookup/embedding_lookup_multi_modal.h @@ -91,6 +91,8 @@ class EmbeddingLookupMultiModal : public EmbeddingLookup { // Returns true if there are any embeddings left to be read. 
bool HasRemainingEmbeddings() const { return embedding_.size() > 0; } + litert::Expected<bool> IsFullyAccelerated() override { return true; } + protected: absl::Status Initialize(const ::litert::TensorBuffer* embedding_buffer, int special_token); diff --git a/runtime/components/embedding_lookup/embedding_lookup_text.cc b/runtime/components/embedding_lookup/embedding_lookup_text.cc index 580a28558..35c2ab2a2 100644 --- a/runtime/components/embedding_lookup/embedding_lookup_text.cc +++ b/runtime/components/embedding_lookup/embedding_lookup_text.cc @@ -31,10 +31,8 @@ #include "absl/strings/string_view.h" // from @com_google_absl #include "absl/types/span.h" // from @com_google_absl #include "litert/cc/litert_common.h" // from @litert -#include "litert/cc/litert_compiled_model.h" // from @litert -#include "litert/cc/litert_element_type.h" // from @litert -#include "litert/cc/litert_environment.h" // from @litert #include "litert/cc/litert_macros.h" // from @litert +#include "litert/cc/litert_expected.h" // from @litert #include "litert/cc/litert_model.h" // from @litert #include "litert/cc/litert_options.h" // from @litert #include "litert/cc/litert_tensor_buffer.h" // from @litert @@ -248,12 +245,12 @@ EmbeddingLookupText::Create(litert::Environment& env, const litert::Model* absl_nonnull model, std::optional signature_key) { auto handler = std::unique_ptr<EmbeddingLookupText>( - new EmbeddingLookupText(env, model, signature_key)); - RETURN_IF_ERROR(handler->Initialize()); + new EmbeddingLookupText(env, signature_key)); + RETURN_IF_ERROR(handler->Initialize(*model)); return handler; } -absl::Status EmbeddingLookupText::Initialize() { +absl::Status EmbeddingLookupText::Initialize(const litert::Model& model) { LITERT_ASSIGN_OR_RETURN(auto options, Options::Create()); #if defined(__ANDROID__) options.SetHardwareAccelerators(litert::HwAccelerators::kNpu | @@ -271,8 +268,8 @@ absl::Status EmbeddingLookupText::Initialize() { #endif LITERT_ASSIGN_OR_RETURN(compiled_model_, 
litert::CompiledModel::Create( - env_, model_.Get(), options)); - LITERT_ASSIGN_OR_RETURN(auto signatures, model_.GetSignatures()); + env_, model.Get(), options)); + LITERT_ASSIGN_OR_RETURN(auto signatures, model.GetSignatures()); if (signature_key_.has_value()) { bool found = false; @@ -354,4 +351,12 @@ absl::Status EmbeddingLookupText::Initialize() { return absl::OkStatus(); } +litert::Expected<bool> EmbeddingLookupText::IsFullyAccelerated() { + if (!compiled_model_.has_value()) { + return litert::Error(kLiteRtStatusErrorRuntimeFailure, + "Compiled model has not been created."); + } + return compiled_model_->IsFullyAccelerated(); + } + } // namespace litert::lm diff --git a/runtime/components/embedding_lookup/embedding_lookup_text.h b/runtime/components/embedding_lookup/embedding_lookup_text.h index d40af5712..a3243a5ba 100644 --- a/runtime/components/embedding_lookup/embedding_lookup_text.h +++ b/runtime/components/embedding_lookup/embedding_lookup_text.h @@ -22,7 +22,6 @@ #include #include #include -#include #include #include "absl/base/nullability.h" // from @com_google_absl @@ -31,6 +30,7 @@ #include "absl/types/span.h" // from @com_google_absl #include "litert/cc/litert_compiled_model.h" // from @litert #include "litert/cc/litert_environment.h" // from @litert +#include "litert/cc/litert_expected.h" // from @litert #include "litert/cc/litert_model.h" // from @litert #include "litert/cc/litert_options.h" // from @litert #include "litert/cc/litert_ranked_tensor_type.h" // from @litert @@ -103,6 +103,8 @@ class EmbeddingLookupText : public EmbeddingLookup { // Returns number of floats per token in the output tensor. size_t GetFloatsPerToken(); + litert::Expected<bool> IsFullyAccelerated() override; + // Returns the default embedding vector to use when a token is not found in // the lookup table. 
const std::vector& GetDefaultEmbeddingVector() const { @@ -116,12 +118,11 @@ class EmbeddingLookupText : public EmbeddingLookup { protected: EmbeddingLookupText(litert::Environment& env, - const litert::Model* absl_nonnull model, std::optional signature_key) - : env_(env), model_(*model), signature_key_(signature_key) {} + : env_(env), signature_key_(signature_key) {} // Loads the provided model. This must be called before Lookup. - absl::Status Initialize(); + absl::Status Initialize(const litert::Model& model); // Internal implementation of Lookup for both the single and multiple token // cases. @@ -129,9 +130,6 @@ class EmbeddingLookupText : public EmbeddingLookup { // The environment for the embedding lookup. litert::Environment& env_; - // The model for the embedding lookup. The actual model instance is owned by - // the model resources. - const litert::Model& model_; // The compiled model for the embedding model. std::optional compiled_model_; diff --git a/runtime/components/model_resources.h b/runtime/components/model_resources.h index bd5311276..24ba00083 100644 --- a/runtime/components/model_resources.h +++ b/runtime/components/model_resources.h @@ -185,6 +185,11 @@ class ModelResources { // Returns the llm metadata. virtual absl::StatusOr GetLlmMetadata() = 0; + + // Releases the TFLite model from RAM. This is used to reduce peak memory + // usage after the model has been compiled into a hardware-specific + // executable. 
+ virtual absl::Status ReleaseTFLiteModel(ModelType model_type) = 0; }; } // namespace litert::lm diff --git a/runtime/components/model_resources_litert_lm.cc b/runtime/components/model_resources_litert_lm.cc index a28ce0f2c..ee25c07d4 100644 --- a/runtime/components/model_resources_litert_lm.cc +++ b/runtime/components/model_resources_litert_lm.cc @@ -162,4 +162,14 @@ ModelResourcesLitertLm::GetWeightsSectionOffset(ModelType model_type) { BufferKey(schema::AnySectionDataType_TFLiteWeights, model_type)); } +absl::Status ModelResourcesLitertLm::ReleaseTFLiteModel(ModelType model_type) { + model_map_.erase(model_type); + RETURN_IF_ERROR(litert_lm_loader_->ReleaseSection( + BufferKey(schema::AnySectionDataType_TFLiteModel, model_type))); + RETURN_IF_ERROR(litert_lm_loader_->ReleaseSection( + BufferKey(schema::AnySectionDataType_TFLiteWeights, model_type))); + + return absl::OkStatus(); +} + } // namespace litert::lm diff --git a/runtime/components/model_resources_litert_lm.h b/runtime/components/model_resources_litert_lm.h index e0c513abb..e1ebcc132 100644 --- a/runtime/components/model_resources_litert_lm.h +++ b/runtime/components/model_resources_litert_lm.h @@ -23,6 +23,7 @@ #include #include "absl/container/flat_hash_map.h" // from @com_google_absl +#include "absl/status/status.h" // from @com_google_absl #include "absl/status/statusor.h" // from @com_google_absl #include "absl/strings/string_view.h" // from @com_google_absl #include "litert/cc/litert_model.h" // from @litert @@ -64,6 +65,8 @@ class ModelResourcesLitertLm : public ModelResources { absl::StatusOr> GetWeightsSectionOffset( ModelType model_type) override; + absl::Status ReleaseTFLiteModel(ModelType model_type) override; + protected: explicit ModelResourcesLitertLm( std::unique_ptr litert_lm_loader) diff --git a/runtime/components/model_resources_streaming.cc b/runtime/components/model_resources_streaming.cc index 44dc5154c..3a668afd0 100644 --- a/runtime/components/model_resources_streaming.cc +++ 
b/runtime/components/model_resources_streaming.cc @@ -73,4 +73,8 @@ ModelResourcesStreaming::GetLlmMetadata() { return absl::UnimplementedError("Not implemented."); } +absl::Status ModelResourcesStreaming::ReleaseTFLiteModel(ModelType model_type) { + return absl::UnimplementedError("Not implemented."); +} + } // namespace litert::lm diff --git a/runtime/components/model_resources_streaming.h b/runtime/components/model_resources_streaming.h index 2e5ba9272..c56a0208a 100644 --- a/runtime/components/model_resources_streaming.h +++ b/runtime/components/model_resources_streaming.h @@ -22,6 +22,7 @@ #include #include +#include "absl/status/status.h" // from @com_google_absl #include "absl/status/statusor.h" // from @com_google_absl #include "absl/strings/string_view.h" // from @com_google_absl #include "litert/cc/litert_model.h" // from @litert @@ -60,6 +61,8 @@ class ModelResourcesStreaming : public ModelResources { absl::StatusOr> GetTokenizer() override; absl::StatusOr GetLlmMetadata() override; + + absl::Status ReleaseTFLiteModel(ModelType model_type) override; }; } // namespace litert::lm diff --git a/runtime/components/model_resources_task.cc b/runtime/components/model_resources_task.cc index 715586645..44ead88cc 100644 --- a/runtime/components/model_resources_task.cc +++ b/runtime/components/model_resources_task.cc @@ -75,6 +75,11 @@ absl::StatusOr ModelResourcesTask::GetTFLiteModel( return model_map_[model_type].get(); } +absl::Status ModelResourcesTask::ReleaseTFLiteModel(ModelType model_type) { + model_map_.erase(model_type); + return absl::OkStatus(); +} + absl::StatusOr> ModelResourcesTask::GetTokenizer() { ASSIGN_OR_RETURN(auto string_view, model_asset_bundle_resources_->GetFile("TOKENIZER_MODEL")); diff --git a/runtime/components/model_resources_task.h b/runtime/components/model_resources_task.h index 2dea300ef..9c30f9c34 100644 --- a/runtime/components/model_resources_task.h +++ b/runtime/components/model_resources_task.h @@ -28,7 +28,6 @@ #include 
"absl/strings/string_view.h" // from @com_google_absl #include "litert/cc/litert_model.h" // from @litert #include "runtime/components/model_resources.h" -#include "runtime/components/sentencepiece_tokenizer.h" #include "runtime/components/tokenizer.h" #include "runtime/proto/llm_metadata.pb.h" #include "runtime/util/model_asset_bundle_resources.h" @@ -56,6 +55,7 @@ class ModelResourcesTask : public ModelResources { // Task model does not support prefer activation type. return std::nullopt; }; + absl::Status ReleaseTFLiteModel(ModelType model_type) override; absl::StatusOr> GetTokenizer() override; absl::StatusOr GetLlmMetadata() override; absl::StatusOr> GetScopedFile() override { @@ -74,7 +74,8 @@ class ModelResourcesTask : public ModelResources { : model_asset_bundle_resources_(std::move(model_asset_bundle_resources)) { } - absl::flat_hash_map> model_map_; + absl::flat_hash_map> model_map_; + std::unique_ptr llm_metadata_; // The model asset bundle resources produced by reading task bundle. Not null diff --git a/runtime/components/model_resources_test.cc b/runtime/components/model_resources_test.cc index dbae34481..8f8d1972b 100644 --- a/runtime/components/model_resources_test.cc +++ b/runtime/components/model_resources_test.cc @@ -65,6 +65,58 @@ TEST(ModelResourcesTest, InitializeWithValidLitertLmLoader) { ASSERT_NE(tokenizer.value(), nullptr); } +TEST(ModelResourcesTest, ReleaseTFLiteModel) { + const auto model_path = + std::filesystem::path(::testing::SrcDir()) / + "litert_lm/runtime/testdata/test_lm.litertlm"; + auto model_file = ScopedFile::Open(model_path.string()); + ASSERT_TRUE(model_file.ok()); + ASSERT_OK_AND_ASSIGN(auto loader, + LitertLmLoader::Create(std::move(model_file.value()))); + + auto model_resources = ModelResourcesLitertLm::Create(std::move(loader)); + ASSERT_OK(model_resources); + + // Load the model. 
+ auto tflite_model = + model_resources.value()->GetTFLiteModel(ModelType::kTfLitePrefillDecode); + ASSERT_OK(tflite_model); + + // Release the model. + ASSERT_OK(model_resources.value()->ReleaseTFLiteModel( + ModelType::kTfLitePrefillDecode)); + + // Subsequent GetTFLiteModelBuffer should return NotFound. + EXPECT_THAT(model_resources.value()->GetTFLiteModelBuffer( + ModelType::kTfLitePrefillDecode), + testing::status::StatusIs(absl::StatusCode::kNotFound)); +} + +TEST(ModelResourcesTest, ReleaseTFLiteModelDoesNotBreakSubsequentLoads) { + const auto model_path = + std::filesystem::path(::testing::SrcDir()) / + "litert_lm/runtime/testdata/test_lm.litertlm"; + auto model_file = ScopedFile::Open(model_path.string()); + ASSERT_TRUE(model_file.ok()); + ASSERT_OK_AND_ASSIGN(auto loader, + LitertLmLoader::Create(std::move(model_file.value()))); + + auto model_resources = ModelResourcesLitertLm::Create(std::move(loader)); + ASSERT_OK(model_resources); + + // Load one model and release it. + ASSERT_OK(model_resources.value() + ->GetTFLiteModel(ModelType::kTfLitePrefillDecode) + .status()); + ASSERT_OK(model_resources.value()->ReleaseTFLiteModel( + ModelType::kTfLitePrefillDecode)); + + // Subsequent loads should still succeed (e.g., Tokenizer or other models). + // test_lm.litertlm contains a tokenizer. 
+ auto tokenizer = model_resources.value()->GetTokenizer(); + EXPECT_OK(tokenizer.status()); +} + TEST(ModelResourcesTest, InitializeWithExternalWeights) { const auto model_path = std::filesystem::path(::testing::SrcDir()) / diff --git a/runtime/executor/BUILD b/runtime/executor/BUILD index 5d7ed3241..c9cf55dcb 100644 --- a/runtime/executor/BUILD +++ b/runtime/executor/BUILD @@ -279,6 +279,7 @@ cc_library( "@com_google_absl//absl/types:span", "@litert//litert/cc:litert_model_types", "@litert//litert/cc:litert_tensor_buffer_types", + "@litert//litert/cc/internal:litert_handle", "//runtime/components:model_resources", "//runtime/components:model_resources_litert_lm", "//runtime/components:model_resources_task", @@ -1069,6 +1070,7 @@ cc_test( ":llm_executor_settings", ":magic_number_configs_helper", "@com_google_googletest//:gtest_main", + "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings:string_view", "@litert//litert/cc:litert_macros", diff --git a/runtime/executor/audio_litert_compiled_model_executor.cc b/runtime/executor/audio_litert_compiled_model_executor.cc index e9e347e02..54a89fd34 100644 --- a/runtime/executor/audio_litert_compiled_model_executor.cc +++ b/runtime/executor/audio_litert_compiled_model_executor.cc @@ -62,9 +62,37 @@ #include "runtime/util/tensor_buffer_util.h" #include "tflite/types/half.h" // from @litert +#if !defined(_WIN32) && !defined(__EMSCRIPTEN__) +#include +#include +#endif + namespace litert::lm { namespace { +void MadviseMemoryBuffer(absl::string_view buffer) { +#if !defined(_WIN32) && !defined(__EMSCRIPTEN__) + if (buffer.empty()) return; + size_t page_size = getpagesize(); + uintptr_t addr = reinterpret_cast(buffer.data()); + size_t size = buffer.size(); + + // We perform proper system page alignment to guarantee that the advisor + // does not fail on non-aligned offsets. 
+ uintptr_t aligned_addr = (addr + page_size - 1) & ~(page_size - 1); + if (aligned_addr > addr) { + size_t gap = aligned_addr - addr; + if (gap >= size) return; + size -= gap; + } + size &= ~(page_size - 1); + + if (size > 0) { + (void)madvise(reinterpret_cast(aligned_addr), size, MADV_DONTNEED); + } +#endif +} + // Set the default GPU options for the model. absl::Status SetGpuOptions(const AudioExecutorSettings& executor_settings, litert::GpuOptions& gpu_options) { @@ -99,6 +127,7 @@ absl::Status SetGpuOptions(const AudioExecutorSettings& executor_settings, gpu_options.SetConvertWeightsOnGpu(true); gpu_options.SetHintFullyDelegatedToSingleDelegate(true); gpu_options.EnableInfiniteFloatCapping(true); + gpu_options.WaitForWeightsConversionComplete(true); return absl::OkStatus(); } @@ -179,13 +208,13 @@ AudioLiteRtCompiledModelExecutor::AudioStaticEncoder::Create( const AudioExecutorSettings& executor_settings, Environment& env, const Model* absl_nonnull model) { auto handler = std::unique_ptr( - new AudioStaticEncoder(executor_settings, env, model)); - RETURN_IF_ERROR(handler->Initialize()); + new AudioStaticEncoder(executor_settings, env)); + RETURN_IF_ERROR(handler->Initialize(*model)); return handler; } -absl::Status -AudioLiteRtCompiledModelExecutor::AudioStaticEncoder::Initialize() { +absl::Status AudioLiteRtCompiledModelExecutor::AudioStaticEncoder::Initialize( + const Model& model) { LITERT_ASSIGN_OR_RETURN(auto options, Options::Create()); auto weight_cache_file = executor_settings_.GetWeightCacheFile( absl::StrCat(AudioExecutorSettings::kStaticEncoderName, @@ -222,15 +251,15 @@ AudioLiteRtCompiledModelExecutor::AudioStaticEncoder::Initialize() { } LITERT_ASSIGN_OR_RETURN(compiled_model_, - CompiledModel::Create(env_, model_.Get(), options)); - LITERT_ASSIGN_OR_RETURN(auto signatures, model_.GetSignatures()); + CompiledModel::Create(env_, model.Get(), options)); + LITERT_ASSIGN_OR_RETURN(auto signatures, model.GetSignatures()); if (signatures.size() != 1) { 
return absl::InvalidArgumentError( absl::StrCat("The Audio Static Encoder model must have exactly one " "signature but got ", signatures.size())); } - LITERT_ASSIGN_OR_RETURN(auto signature, model_.GetSignature(0)); + LITERT_ASSIGN_OR_RETURN(auto signature, model.GetSignature(0)); // Initialize the input buffers. LITERT_ASSIGN_OR_RETURN(auto input_buffers, @@ -311,13 +340,14 @@ AudioLiteRtCompiledModelExecutor::AudioStreamingEncoder::Create( const AudioExecutorSettings& executor_settings, Environment& env, const Model* absl_nonnull model) { auto handler = std::unique_ptr( - new AudioStreamingEncoder(executor_settings, env, model)); - RETURN_IF_ERROR(handler->Initialize()); + new AudioStreamingEncoder(executor_settings, env)); + RETURN_IF_ERROR(handler->Initialize(*model)); return handler; } absl::Status -AudioLiteRtCompiledModelExecutor::AudioStreamingEncoder::Initialize() { +AudioLiteRtCompiledModelExecutor::AudioStreamingEncoder::Initialize( + const Model& model) { LITERT_ASSIGN_OR_RETURN(auto options, Options::Create()); auto weight_cache_file = executor_settings_.GetWeightCacheFile( absl::StrCat(AudioExecutorSettings::kStreamingEncoderName, @@ -356,15 +386,15 @@ AudioLiteRtCompiledModelExecutor::AudioStreamingEncoder::Initialize() { } LITERT_ASSIGN_OR_RETURN(compiled_model_, - CompiledModel::Create(env_, model_.Get(), options)); - LITERT_ASSIGN_OR_RETURN(auto signatures, model_.GetSignatures()); + CompiledModel::Create(env_, model.Get(), options)); + LITERT_ASSIGN_OR_RETURN(auto signatures, model.GetSignatures()); if (signatures.size() != 1) { return absl::InvalidArgumentError(absl::StrCat( "The Audio Encoder model must have exactly one signature but got ", signatures.size())); } - LITERT_ASSIGN_OR_RETURN(auto signature, model_.GetSignature(0)); + LITERT_ASSIGN_OR_RETURN(auto signature, model.GetSignature(0)); // Initialize the input buffers. 
LITERT_ASSIGN_OR_RETURN(auto input_buffers, @@ -504,13 +534,14 @@ absl::StatusOr> AudioLiteRtCompiledModelExecutor::AudioAdapter::Create( const AudioExecutorSettings& executor_settings, Environment& env, const Model* absl_nonnull model) { - auto handler = std::unique_ptr( - new AudioAdapter(executor_settings, env, model)); - RETURN_IF_ERROR(handler->Initialize()); + auto handler = + std::unique_ptr(new AudioAdapter(executor_settings, env)); + RETURN_IF_ERROR(handler->Initialize(*model)); return handler; } -absl::Status AudioLiteRtCompiledModelExecutor::AudioAdapter::Initialize() { +absl::Status AudioLiteRtCompiledModelExecutor::AudioAdapter::Initialize( + const Model& model) { LITERT_ASSIGN_OR_RETURN(auto options, Options::Create()); auto weight_cache_file = executor_settings_.GetWeightCacheFile( absl::StrCat(AudioExecutorSettings::kAdapterName, @@ -524,6 +555,7 @@ absl::Status AudioLiteRtCompiledModelExecutor::AudioAdapter::Initialize() { #if defined(LITERT_USE_WEBGPU_ACCELERATOR) gpu_options.SetBackend(GpuOptions::Backend::kWebGpu); #endif // defined(LITERT_USE_WEBGPU_ACCELERATOR) + gpu_options.WaitForWeightsConversionComplete(true); options.SetHardwareAccelerators(litert::HwAccelerators::kGpu); } else if (executor_settings_.GetBackend() == Backend::CPU) { LITERT_ASSIGN_OR_RETURN(auto& cpu_options, options.GetCpuOptions()); @@ -540,8 +572,8 @@ absl::Status AudioLiteRtCompiledModelExecutor::AudioAdapter::Initialize() { } LITERT_ASSIGN_OR_RETURN(compiled_model_, - CompiledModel::Create(env_, model_.Get(), options)); - LITERT_ASSIGN_OR_RETURN(auto signatures, model_.GetSignatures()); + CompiledModel::Create(env_, model.Get(), options)); + LITERT_ASSIGN_OR_RETURN(auto signatures, model.GetSignatures()); if (signatures.size() != 1) { return absl::InvalidArgumentError(absl::StrCat( "The Audio Adapter model must have exactly one signature but got ", @@ -565,7 +597,7 @@ absl::Status AudioLiteRtCompiledModelExecutor::AudioAdapter::Initialize() { output_buffers_.size())); 
} - LITERT_ASSIGN_OR_RETURN(auto signature, model_.GetSignature(0)); + LITERT_ASSIGN_OR_RETURN(auto signature, model.GetSignature(0)); for (int i = 0; i < signature.InputNames().size(); ++i) { if (absl::StrContains(signature.InputNames()[i], kFeaturesName)) { features_buffer_ = &input_buffers_[i]; @@ -599,6 +631,10 @@ AudioLiteRtCompiledModelExecutor::Create( resources->GetTFLiteModel(ModelType::kTfLiteAudioEncoderHw)); ASSIGN_OR_RETURN(auto audio_adapter_model, resources->GetTFLiteModel(ModelType::kTfLiteAudioAdapter)); + LITERT_ASSIGN_OR_RETURN( + auto executor_properties, + GetAudioExecutorPropertiesFromModelResources(*resources)); + const int encoder_shrinking_factor = executor_properties.audio_shrink_factor; std::unique_ptr audio_encoder; LITERT_ASSIGN_OR_RETURN(auto encoder_signature, audio_encoder_model->GetSignature(0)); @@ -613,9 +649,35 @@ AudioLiteRtCompiledModelExecutor::Create( AudioStaticEncoder::Create(executor_settings, env, audio_encoder_model)); } + + if (auto is_fully_accelerated = + audio_encoder->GetMutableCompiledModel().IsFullyAccelerated(); + is_fully_accelerated.HasValue() && *is_fully_accelerated) { + // Instead of destructively unmapping the model which invalidates underlying + // interpreter dependencies, we dynamically advise the kernel that the pages + // are no longer needed. This guarantees transparent, non-crashing, stable + // runtime while freeing full resident RAM pages immediately. 
+ auto buf_status = + resources->GetTFLiteModelBuffer(ModelType::kTfLiteAudioEncoderHw); + if (buf_status.ok()) { + MadviseMemoryBuffer(*buf_status); + } + } + LITERT_ASSIGN_OR_RETURN( auto audio_adapter, AudioAdapter::Create(executor_settings, env, audio_adapter_model)); + + if (auto is_fully_accelerated = + audio_adapter->GetMutableCompiledModel().IsFullyAccelerated(); + is_fully_accelerated.HasValue() && *is_fully_accelerated) { + auto buf_status = + resources->GetTFLiteModelBuffer(ModelType::kTfLiteAudioAdapter); + if (buf_status.ok()) { + MadviseMemoryBuffer(*buf_status); + } + } + const auto& tmp = audio_encoder->GetInputMaskBuffer(); LITERT_ASSIGN_OR_RETURN(auto mask_tensor_type, tmp.TensorType()); LITERT_ASSIGN_OR_RETURN(int sequence_length, @@ -629,10 +691,7 @@ AudioLiteRtCompiledModelExecutor::Create( audio_adapter->GetOutputBuffers()[0].TensorType()); const auto dims = adapter_output_tensor_type.Layout().Dimensions(); const int audio_embedding_dimensions = dims.back(); - LITERT_ASSIGN_OR_RETURN( - auto executor_properties, - GetAudioExecutorPropertiesFromModelResources(*resources)); - const int encoder_shrinking_factor = executor_properties.audio_shrink_factor; + if (!is_streaming_encoder) { if (audio_encoder->GetOutputBuffersMap().size() != audio_adapter->GetInputBuffers().size()) { @@ -832,7 +891,7 @@ absl::StatusOr AudioLiteRtCompiledModelExecutor::Encode( absl::StatusOr> AudioLiteRtCompiledModelExecutor::AudioStreamingEncoder::CreateNewContext() { absl::flat_hash_map state_buffers; - LITERT_ASSIGN_OR_RETURN(auto signature, compiled_model_.GetSignature(0)); + // Removed redundant GetSignature call that crashes post-release. 
for (auto& [name, buffer] : input_buffers_map_) { if (name == kSegmentValuesName || name == kSegmentMaskName) { // Skip the segment values and mask buffers as they are not part of the @@ -859,7 +918,7 @@ AudioLiteRtCompiledModelExecutor::AudioStreamingEncoder::CreateNewContext() { absl::StatusOr> AudioLiteRtCompiledModelExecutor::AudioStreamingEncoder::CloneContext() { absl::flat_hash_map state_buffers; - LITERT_ASSIGN_OR_RETURN(auto signature, compiled_model_.GetSignature(0)); + // Removed redundant GetSignature call that crashes post-release. for (const auto& [name, buffer] : input_buffers_map_) { if (name == kSegmentValuesName || name == kSegmentMaskName) { // Skip the segment values and mask buffers as they are not part of the diff --git a/runtime/executor/audio_litert_compiled_model_executor.h b/runtime/executor/audio_litert_compiled_model_executor.h index bc9ccc953..fb02e9c7c 100644 --- a/runtime/executor/audio_litert_compiled_model_executor.h +++ b/runtime/executor/audio_litert_compiled_model_executor.h @@ -75,7 +75,7 @@ class AudioLiteRtCompiledModelExecutor : public AudioExecutor { // A unique pointer to the AudioLiteRtCompiledModelExecutor if successful, // or an error status if failed. static absl::StatusOr> - Create(AudioExecutorSettings executor_settings, Environment& env); + Create(AudioExecutorSettings executor_settings, litert::Environment& env); // Run the audio encoder and audio adapter models to encode the spectrogram // tensor into audio embeddings. 
It is caller's responsibility to ensure the @@ -141,7 +141,7 @@ class AudioLiteRtCompiledModelExecutor : public AudioExecutor { public: virtual ~AudioEncoder() = default; - virtual absl::Status Initialize() = 0; + virtual absl::Status Initialize(const Model& model) = 0; virtual absl::Status ClearInputBuffers() = 0; @@ -241,7 +241,7 @@ class AudioLiteRtCompiledModelExecutor : public AudioExecutor { // Initialize the AudioStaticEncoder, which will create the input and output // buffers for the audio encoder model. - absl::Status Initialize() override; + absl::Status Initialize(const Model& model) override; absl::Status ClearInputBuffers() override; @@ -249,12 +249,11 @@ class AudioLiteRtCompiledModelExecutor : public AudioExecutor { private: AudioStaticEncoder(const AudioExecutorSettings& executor_settings, - Environment& env, const Model* absl_nonnull model) - : executor_settings_(executor_settings), env_(env), model_(*model) {} + Environment& env) + : executor_settings_(executor_settings), env_(env) {} - const AudioExecutorSettings& executor_settings_; + AudioExecutorSettings executor_settings_; Environment& env_; - const Model& model_; }; // Audio Encoder for streaming LiteRT model, where the audio is provided in @@ -310,7 +309,7 @@ class AudioLiteRtCompiledModelExecutor : public AudioExecutor { // Initialize the AudioStreamingEncoder, which will create the input and // output buffers for the audio encoder model. 
- absl::Status Initialize() override; + absl::Status Initialize(const Model& model); int GetOverlapSize() const { return overlap_size_; } @@ -331,12 +330,11 @@ class AudioLiteRtCompiledModelExecutor : public AudioExecutor { private: AudioStreamingEncoder(const AudioExecutorSettings& executor_settings, - Environment& env, const Model* absl_nonnull model) - : executor_settings_(executor_settings), env_(env), model_(*model) {} + Environment& env) + : executor_settings_(executor_settings), env_(env) {} - const AudioExecutorSettings& executor_settings_; + AudioExecutorSettings executor_settings_; Environment& env_; - const Model& model_; int overlap_size_; }; @@ -360,7 +358,7 @@ class AudioLiteRtCompiledModelExecutor : public AudioExecutor { // Initialize the AudioAdapter, which will create the input and output // buffers for the audio adapter model. - absl::Status Initialize(); + absl::Status Initialize(const Model& model); const CompiledModel& GetCompiledModel() const { return compiled_model_; } @@ -391,12 +389,11 @@ class AudioLiteRtCompiledModelExecutor : public AudioExecutor { private: AudioAdapter(const AudioExecutorSettings& executor_settings, - Environment& env, const Model* absl_nonnull model) - : executor_settings_(executor_settings), env_(env), model_(*model) {} + Environment& env) + : executor_settings_(executor_settings), env_(env) {} - const AudioExecutorSettings& executor_settings_; + AudioExecutorSettings executor_settings_; Environment& env_; - const Model& model_; CompiledModel compiled_model_; // The input buffers for the audio adapter model. 
std::vector input_buffers_; diff --git a/runtime/executor/llm_executor_settings_utils.cc b/runtime/executor/llm_executor_settings_utils.cc index bc87e77e9..38bf2f81e 100644 --- a/runtime/executor/llm_executor_settings_utils.cc +++ b/runtime/executor/llm_executor_settings_utils.cc @@ -203,6 +203,7 @@ absl::StatusOr CreateCompilationOptions( gpu_compilation_options.EnableAllowSrcQuantizedFcConvOps( !advanced_settings.allow_src_quantized_fc_conv_ops.has_value() || advanced_settings.allow_src_quantized_fc_conv_ops.value()); + gpu_compilation_options.WaitForWeightsConversionComplete(true); gpu_compilation_options.HintWaitingForCompletion( advanced_settings.hint_waiting_for_completion.has_value() && advanced_settings.hint_waiting_for_completion.value()); diff --git a/runtime/executor/llm_litert_compiled_model_executor.cc b/runtime/executor/llm_litert_compiled_model_executor.cc index b2d630a94..1e9c62129 100644 --- a/runtime/executor/llm_litert_compiled_model_executor.cc +++ b/runtime/executor/llm_litert_compiled_model_executor.cc @@ -35,6 +35,7 @@ #include "absl/strings/str_cat.h" // from @com_google_absl #include "absl/strings/string_view.h" // from @com_google_absl #include "absl/types/span.h" // from @com_google_absl +#include "litert/cc/internal/litert_handle.h" // from @litert #include "litert/cc/litert_common.h" // from @litert #include "litert/cc/litert_compiled_model.h" // from @litert #include "litert/cc/litert_element_type.h" // from @litert @@ -68,12 +69,40 @@ #include "runtime/util/scoped_file.h" #include "runtime/util/status_macros.h" // IWYU pragma: keep #include "runtime/util/tensor_buffer_util.h" + +#if !defined(_WIN32) && !defined(__EMSCRIPTEN__) +#include +#include +#endif + #include "tflite/delegates/xnnpack/xnnpack_delegate.h" // from @litert #include "tflite/types/half.h" // from @litert namespace litert::lm { namespace { +void MadviseMemoryBuffer(absl::string_view buffer) { +#if !defined(_WIN32) && !defined(__EMSCRIPTEN__) + if (buffer.empty()) 
return; + size_t page_size = getpagesize(); + uintptr_t addr = reinterpret_cast(buffer.data()); + size_t size = buffer.size(); + + // Ensure properly system page aligned boundaries for guaranteed safety. + uintptr_t aligned_addr = (addr + page_size - 1) & ~(page_size - 1); + if (aligned_addr > addr) { + size_t gap = aligned_addr - addr; + if (gap >= size) return; + size -= gap; + } + size &= ~(page_size - 1); + + if (size > 0) { + (void)madvise(reinterpret_cast(aligned_addr), size, MADV_DONTNEED); + } +#endif +} + using ::absl::Span; // Names of the signature runners, used to get the signature runners from the @@ -88,7 +117,7 @@ absl::Status InitializeEmbeddingLookups( std::unique_ptr& per_layer_embedding_lookup) { absl::flat_hash_map end_of_multi_modal_embedding_models; { - auto end_of_audio_model = + absl::StatusOr end_of_audio_model = resources.GetTFLiteModel(ModelType::kTfLiteEndOfAudio); if (end_of_audio_model.ok()) { end_of_multi_modal_embedding_models.insert( @@ -96,7 +125,7 @@ absl::Status InitializeEmbeddingLookups( } } { - auto end_of_vision_model = + absl::StatusOr end_of_vision_model = resources.GetTFLiteModel(ModelType::kTfLiteEndOfVision); if (end_of_vision_model.ok()) { end_of_multi_modal_embedding_models.insert( @@ -104,13 +133,24 @@ absl::Status InitializeEmbeddingLookups( } } - auto text_embedder_model = + absl::StatusOr text_embedder_model = resources.GetTFLiteModel(ModelType::kTfLiteEmbedder); if (text_embedder_model.ok()) { ASSIGN_OR_RETURN( embedding_lookup, EmbeddingLookupManager::Create(env, *text_embedder_model, end_of_multi_modal_embedding_models)); + + // TODO: Ideally we release the individual embedding models, but we don't + // have access to the individual embedding models here. 
+ if (auto is_fully_accelerated = embedding_lookup->IsFullyAccelerated(); + is_fully_accelerated.HasValue() && *is_fully_accelerated) { + RETURN_IF_ERROR( + resources.ReleaseTFLiteModel(ModelType::kTfLiteEndOfAudio)); + RETURN_IF_ERROR( + resources.ReleaseTFLiteModel(ModelType::kTfLiteEndOfVision)); + RETURN_IF_ERROR(resources.ReleaseTFLiteModel(ModelType::kTfLiteEmbedder)); + } } // Create per layer embedding lookups from the resources. @@ -122,14 +162,21 @@ absl::Status InitializeEmbeddingLookups( EmbeddingLookupManager::Create(env, *per_layer_embedder_model, /*fully_supports_multi_modal=*/false)); } + if (per_layer_embedding_lookup != nullptr) { + if (auto is_fully_accelerated = + per_layer_embedding_lookup->IsFullyAccelerated(); + is_fully_accelerated.HasValue() && *is_fully_accelerated) { + RETURN_IF_ERROR( + resources.ReleaseTFLiteModel(ModelType::kTfLitePerLayerEmbedder)); + } + } return absl::OkStatus(); } absl::Status CopyKvCacheBuffers( size_t decode_batch_size, int src_index_to_copy_on_prefill, - const absl::flat_hash_map& - src_kv_cache_buffers, - const absl::flat_hash_map& + const absl::flat_hash_map& src_kv_cache_buffers, + const absl::flat_hash_map& dst_kv_cache_buffers) { for (const auto& [name, src_buffer] : src_kv_cache_buffers) { if (!dst_kv_cache_buffers.contains(name)) { @@ -191,17 +238,62 @@ absl::StatusOr GetDynamicDimIndex(const Model& model, } return absl::InvalidArgumentError("No dynamic dimension found."); } +} // namespace -absl::StatusOr HasDynamicDim(const Model& model, - absl::string_view signature, - absl::string_view tensor_name) { - LITERT_ASSIGN_OR_RETURN(const SimpleSignature& sig, - model.FindSignature(signature)); - LITERT_ASSIGN_OR_RETURN(const SimpleTensor& tensor, - sig.InputTensor(tensor_name)); - LITERT_ASSIGN_OR_RETURN(const RankedTensorType ranked_tensor_type, - tensor.RankedTensorType()); - auto dimensions = ranked_tensor_type.Layout().Dimensions(); +absl::StatusOr 
+LlmLiteRtCompiledModelExecutorBase::CacheTensorMetadata(const Model& model) { + CachedMetadata cached_metadata; + for (int sig_idx = 0; sig_idx < model.GetNumSignatures(); ++sig_idx) { + LITERT_ASSIGN_OR_RETURN(auto sig, model.GetSignature(sig_idx)); + std::string sig_key(sig.Key()); + if (absl::StartsWith(sig_key, kPrefillSignatureRunner)) { + cached_metadata.prefill_signature_key = sig_key; + } + cached_metadata.signature_key_to_idx[sig_key] = sig_idx; + + auto input_names = sig.InputNames(); + std::vector in_names; + for (int in_idx = 0; in_idx < input_names.size(); ++in_idx) { + std::string input_name(input_names[in_idx]); + in_names.push_back(input_name); + + { + LITERT_ASSIGN_OR_RETURN(auto tensor, sig.InputTensor(in_idx)); + LITERT_ASSIGN_OR_RETURN(auto ranked_type, tensor.RankedTensorType()); + TensorMetadata metadata; + metadata.signature_index = sig_idx; + metadata.input_index = in_idx; + metadata.element_type = ranked_type.ElementType(); + auto dims = ranked_type.Layout().Dimensions(); + metadata.dimensions.assign(dims.begin(), dims.end()); + + cached_metadata + .input_tensor_metadata[absl::StrCat(sig_key, ":", input_name)] = + std::move(metadata); + } + } + cached_metadata.input_names_by_sig_idx[sig_idx] = std::move(in_names); + + auto output_names = sig.OutputNames(); + std::vector out_names; + out_names.reserve(output_names.size()); + for (int out_idx = 0; out_idx < output_names.size(); ++out_idx) { + out_names.push_back(std::string(output_names[out_idx])); + } + cached_metadata.output_names_by_sig_idx[sig_idx] = std::move(out_names); + } + return cached_metadata; +} + +absl::StatusOr LlmLiteRtCompiledModelExecutorBase::HasDynamicDim( + absl::string_view signature, absl::string_view tensor_name) { + std::string cache_key = absl::StrCat(signature, ":", tensor_name); + auto it = input_tensor_metadata_.find(cache_key); + if (it == input_tensor_metadata_.end()) { + ABSL_LOG(ERROR) << "Tensor metadata not found for key: " << cache_key; + return 
absl::NotFoundError("Tensor metadata not found"); + } + const auto& dimensions = it->second.dimensions; for (int i = 0; i < dimensions.size(); ++i) { if (dimensions[i] == kDynamicDimValue) { return true; @@ -210,17 +302,16 @@ absl::StatusOr HasDynamicDim(const Model& model, return false; } -absl::Status ResolveDynamicShape(const Model& model, - CompiledModel& compiled_model, - absl::string_view signature, - absl::string_view tensor_name, int new_value) { - LITERT_ASSIGN_OR_RETURN(const SimpleSignature& sig, - model.FindSignature(signature)); - LITERT_ASSIGN_OR_RETURN(const SimpleTensor& tensor, - sig.InputTensor(tensor_name)); - LITERT_ASSIGN_OR_RETURN(const RankedTensorType ranked_tensor_type, - tensor.RankedTensorType()); - auto dimensions = ranked_tensor_type.Layout().Dimensions(); +absl::Status LlmLiteRtCompiledModelExecutorBase::ResolveDynamicShape( + absl::string_view signature, absl::string_view tensor_name, int new_value) { + std::string cache_key = absl::StrCat(signature, ":", tensor_name); + auto it = input_tensor_metadata_.find(cache_key); + if (it == input_tensor_metadata_.end()) { + ABSL_LOG(ERROR) << "Tensor metadata not found for key: " << cache_key; + return absl::NotFoundError("Tensor metadata not found"); + } + const auto& metadata = it->second; + const auto& dimensions = metadata.dimensions; bool has_dynamic_dim = false; std::vector new_shape; @@ -235,13 +326,48 @@ absl::Status ResolveDynamicShape(const Model& model, } if (has_dynamic_dim) { - LITERT_RETURN_IF_ERROR( - compiled_model.ResizeInputTensor(signature, tensor_name, new_shape)); + LITERT_RETURN_IF_ERROR(compiled_model_->ResizeInputTensor( + metadata.signature_index, metadata.input_index, new_shape)); + auto layouts_or = compiled_model_->GetOutputTensorLayouts( + metadata.signature_index, /*update_allocation=*/true); + if (!layouts_or) { + return absl::InternalError( + "Failed to update allocation after resizing tensor."); + } } return absl::OkStatus(); } +absl::StatusOr 
+LlmLiteRtCompiledModelExecutorBase::CreateInputBuffer( + absl::string_view signature, absl::string_view tensor_name) const { + std::string cache_key = absl::StrCat(signature, ":", tensor_name); + auto it = input_tensor_metadata_.find(cache_key); + if (it == input_tensor_metadata_.end()) { + ABSL_LOG(ERROR) << "Tensor metadata not found for key: " << cache_key; + return absl::NotFoundError("Tensor metadata not found"); + } + const auto& metadata = it->second; + + LITERT_ASSIGN_OR_RETURN(litert::TensorBufferRequirements buffer_requirements, + compiled_model_->GetInputBufferRequirements( + metadata.signature_index, metadata.input_index)); + + LITERT_ASSIGN_OR_RETURN(litert::Layout runtime_layout, + compiled_model_->GetInputTensorLayout( + metadata.signature_index, metadata.input_index)); + + litert::RankedTensorType tensor_type(metadata.element_type, + std::move(runtime_layout)); + LITERT_ASSIGN_OR_RETURN(auto buf, + litert::TensorBuffer::CreateManagedFromRequirements( + env_, tensor_type, buffer_requirements)); + return std::move(buf); +} + +namespace { + absl::StatusOr ResizeKVCacheTensorBuffer( Environment& env, TensorBuffer& tensor_buffer, int dynamic_dim_index, int num_entries_to_insert) { @@ -363,18 +489,22 @@ absl::StatusOr CreateFP16OutputBuffer( absl::Status LlmLiteRtCompiledModelExecutorBase::CreatePrefillInputBuffers( absl::string_view prefill_signature, int sequence_length, int context_length, - absl::flat_hash_map& - prefill_input_buffers) { + absl::flat_hash_map& prefill_input_buffers) { auto dyn_shape_resolver = [&](absl::string_view tensor_name) -> absl::Status { - return ResolveDynamicShape(model_, *compiled_model_, prefill_signature, - tensor_name, sequence_length); + ASSIGN_OR_RETURN(bool has_dynamic_dim, + HasDynamicDim(prefill_signature, tensor_name)); + if (has_dynamic_dim) { + RETURN_IF_ERROR( + ResolveDynamicShape(prefill_signature, tensor_name, sequence_length)); + } + return absl::OkStatus(); }; // Create input_token, positions and attn_mask 
buffers after determining // the prefill length. if (!signatures_.input_tokens.empty()) { RETURN_IF_ERROR(dyn_shape_resolver(signatures_.input_tokens)); - auto tokens_buffer = compiled_model_->CreateInputBuffer( - prefill_signature, signatures_.input_tokens); + auto tokens_buffer = + CreateInputBuffer(prefill_signature, signatures_.input_tokens); prefill_input_buffers[signatures_.input_tokens] = std::move(*tokens_buffer); } else { // If input_tokens is empty, we must have input_embeddings. @@ -388,7 +518,7 @@ absl::Status LlmLiteRtCompiledModelExecutorBase::CreatePrefillInputBuffers( "model is not initialized."); } RETURN_IF_ERROR(dyn_shape_resolver(signatures_.input_embeddings.value())); - auto embeddings_buffer = compiled_model_->CreateInputBuffer( + auto embeddings_buffer = CreateInputBuffer( prefill_signature, signatures_.input_embeddings.value()); prefill_input_buffers[signatures_.input_embeddings.value()] = std::move(*embeddings_buffer); @@ -402,36 +532,50 @@ absl::Status LlmLiteRtCompiledModelExecutorBase::CreatePrefillInputBuffers( } RETURN_IF_ERROR( dyn_shape_resolver(signatures_.input_per_layer_embeddings.value())); - auto per_layer_embeddings_buffer = compiled_model_->CreateInputBuffer( + auto per_layer_embeddings_buffer = CreateInputBuffer( prefill_signature, signatures_.input_per_layer_embeddings.value()); prefill_input_buffers[signatures_.input_per_layer_embeddings.value()] = std::move(*per_layer_embeddings_buffer); } } RETURN_IF_ERROR(dyn_shape_resolver(signatures_.input_positions)); - auto positions_buffer = compiled_model_->CreateInputBuffer( - prefill_signature, signatures_.input_positions); + auto positions_buffer = + CreateInputBuffer(prefill_signature, signatures_.input_positions); prefill_input_buffers[signatures_.input_positions] = std::move(*positions_buffer); if (signatures_.input_attn_mask.has_value()) { - ASSIGN_OR_RETURN(bool is_attn_dyn, - HasDynamicDim(model_, prefill_signature, - signatures_.input_attn_mask.value())); + 
ASSIGN_OR_RETURN( + bool is_attn_dyn, + HasDynamicDim(prefill_signature, signatures_.input_attn_mask.value())); if (is_attn_dyn) { std::vector new_shape = {1, 1, sequence_length, context_length}; - LITERT_RETURN_IF_ERROR(compiled_model_->ResizeInputTensor( - prefill_signature, signatures_.input_attn_mask.value(), new_shape)); + auto cache_key = absl::StrCat(prefill_signature, ":", + signatures_.input_attn_mask.value()); + auto it = input_tensor_metadata_.find(cache_key); + if (it != input_tensor_metadata_.end()) { + LITERT_RETURN_IF_ERROR(compiled_model_->ResizeInputTensor( + it->second.signature_index, it->second.input_index, new_shape)); + auto layouts_or = compiled_model_->GetOutputTensorLayouts( + it->second.signature_index, /*update_allocation=*/true); + if (!layouts_or) { + return absl::InternalError( + "Failed to update allocation after resizing tensor."); + } + } else { + return absl::NotFoundError( + absl::StrCat("Tensor metadata not found: ", cache_key)); + } } - auto attn_mask_buffer = compiled_model_->CreateInputBuffer( + auto attn_mask_buffer = CreateInputBuffer( prefill_signature, signatures_.input_attn_mask.value()); prefill_input_buffers[signatures_.input_attn_mask.value()] = std::move(*attn_mask_buffer); } if (signatures_.input_int32_param.has_value()) { gpu_optimized_single_buffer_cache_ = true; - auto param_tensor_buffer = compiled_model_->CreateInputBuffer( + auto param_tensor_buffer = CreateInputBuffer( prefill_signature, signatures_.input_int32_param.value()); prefill_input_buffers[signatures_.input_int32_param.value()] = std::move(*param_tensor_buffer); @@ -546,7 +690,7 @@ absl::Status LlmLiteRtCompiledModelExecutorBase::PrepareFirstPrefillAfterDecode( absl::Status LlmLiteRtCompiledModelExecutorBase::PrefillInternal( absl::string_view prefill_signature, - absl::flat_hash_map& prefill_input_buffers, + absl::flat_hash_map& prefill_input_buffers, Span ids, bool async) { RETURN_IF_ERROR(RollBackProcessedTokens()); @@ -712,30 +856,53 @@ 
absl::Status LlmLiteRtCompiledModelExecutorBase::PrefillInternal( absl::Status LlmLiteRtCompiledModelExecutorBase::BindTensorsAndRunPrefill( absl::string_view prefill_signature, - absl::flat_hash_map& prefill_input_buffers, + absl::flat_hash_map& prefill_input_buffers, bool async) { - absl::flat_hash_map input_buffers; - for (const auto& [input_name, input_buffer] : prefill_input_buffers) { - LITERT_ASSIGN_OR_RETURN(auto input_buffer_dup, input_buffer.Duplicate()); - input_buffers[input_name] = std::move(input_buffer_dup); - } - for (const auto& [input_name, input_buffer] : *input_kv_cache_buffers_) { - LITERT_ASSIGN_OR_RETURN(auto input_buffer_dup, input_buffer.Duplicate()); - input_buffers[input_name] = std::move(input_buffer_dup); + auto sig_it = signature_key_to_idx_.find(prefill_signature); + if (sig_it == signature_key_to_idx_.end()) { + return absl::NotFoundError( + absl::StrCat("Signature key not found: ", prefill_signature)); + } + size_t current_prefill_idx = sig_it->second; + + std::vector vec_input_buffers; + const auto& in_names = input_names_by_sig_idx_[current_prefill_idx]; + vec_input_buffers.reserve(in_names.size()); + for (const auto& name : in_names) { + if (auto it = prefill_input_buffers.find(name); + it != prefill_input_buffers.end()) { + LITERT_ASSIGN_OR_RETURN(auto input_buffer_dup, it->second.Duplicate()); + vec_input_buffers.push_back(std::move(input_buffer_dup)); + } else if (auto it = input_kv_cache_buffers_->find(name); + it != input_kv_cache_buffers_->end()) { + LITERT_ASSIGN_OR_RETURN(auto input_buffer_dup, it->second.Duplicate()); + vec_input_buffers.push_back(std::move(input_buffer_dup)); + } else { + vec_input_buffers.push_back(litert::TensorBuffer::WrapCObject( + env_.GetHolder(), nullptr, litert::OwnHandle::kNo)); + } } - absl::flat_hash_map output_buffers; - for (const auto& [output_name, output_buffer] : *output_kv_cache_buffers_) { - LITERT_ASSIGN_OR_RETURN(auto output_buffer_dup, output_buffer.Duplicate()); - 
output_buffer_dup.ClearEvent(); - output_buffers[output_name] = std::move(output_buffer_dup); + + std::vector vec_output_buffers; + const auto& out_names = output_names_by_sig_idx_[current_prefill_idx]; + vec_output_buffers.reserve(out_names.size()); + for (const auto& name : out_names) { + if (auto it = output_kv_cache_buffers_->find(name); + it != output_kv_cache_buffers_->end()) { + LITERT_ASSIGN_OR_RETURN(auto output_buffer_dup, it->second.Duplicate()); + output_buffer_dup.ClearEvent(); + vec_output_buffers.push_back(std::move(output_buffer_dup)); + } else { + return absl::NotFoundError(absl::StrCat("Missing output buffer: ", name)); + } } if (async) { LITERT_RETURN_IF_ERROR(compiled_model_->RunAsync( - prefill_signature, input_buffers, output_buffers, async)); + current_prefill_idx, vec_input_buffers, vec_output_buffers, async)); } else { - LITERT_RETURN_IF_ERROR( - compiled_model_->Run(prefill_signature, input_buffers, output_buffers)); + LITERT_RETURN_IF_ERROR(compiled_model_->Run( + current_prefill_idx, vec_input_buffers, vec_output_buffers)); } if (!gpu_optimized_single_buffer_cache_) { @@ -916,36 +1083,52 @@ absl::Status LlmLiteRtCompiledModelExecutorBase::DecodeInternal( absl::Status LlmLiteRtCompiledModelExecutorBase::BindTensorsAndRunDecode( TensorBuffer* output_logits) { - absl::flat_hash_map decode_input_buffers; - for (const auto& [input_name, input_buffer] : decode_input_buffers_) { - LITERT_ASSIGN_OR_RETURN(auto input_buffer_dup, input_buffer.Duplicate()); - decode_input_buffers[input_name] = std::move(input_buffer_dup); - } - for (const auto& [input_name, input_buffer] : *input_kv_cache_buffers_) { - LITERT_ASSIGN_OR_RETURN(auto input_buffer_dup, input_buffer.Duplicate()); - decode_input_buffers[input_name] = std::move(input_buffer_dup); - } - absl::flat_hash_map decode_output_buffers; - for (const auto& [output_name, output_buffer] : decode_output_buffers_) { - // LITERT_ASSIGN_OR_RETURN() causes a compilation error on windows. 
- auto output_buffer_dup = - output_logits && output_name == signatures_.output_logits - ? output_logits->Duplicate() - : output_buffer.Duplicate(); - RET_CHECK(output_buffer_dup) << "Failed to duplicate output buffer."; - output_buffer_dup->ClearEvent(); - decode_output_buffers[output_name] = std::move(*output_buffer_dup); - } - for (const auto& [output_name, output_buffer] : *output_kv_cache_buffers_) { - LITERT_ASSIGN_OR_RETURN(auto output_buffer_dup, output_buffer.Duplicate()); - output_buffer_dup.ClearEvent(); - decode_output_buffers[output_name] = std::move(output_buffer_dup); + + std::vector vec_input_buffers; + const auto& in_names = input_names_by_sig_idx_[decode_signature_idx_]; + vec_input_buffers.reserve(in_names.size()); + for (const auto& name : in_names) { + if (auto it = decode_input_buffers_.find(name); + it != decode_input_buffers_.end()) { + LITERT_ASSIGN_OR_RETURN(auto input_buffer_dup, it->second.Duplicate()); + vec_input_buffers.push_back(std::move(input_buffer_dup)); + } else if (auto it = input_kv_cache_buffers_->find(name); + it != input_kv_cache_buffers_->end()) { + LITERT_ASSIGN_OR_RETURN(auto input_buffer_dup, it->second.Duplicate()); + vec_input_buffers.push_back(std::move(input_buffer_dup)); + } else { + vec_input_buffers.push_back(litert::TensorBuffer::WrapCObject( + env_.GetHolder(), nullptr, litert::OwnHandle::kNo)); + } + } + + std::vector vec_output_buffers; + const auto& out_names = output_names_by_sig_idx_[decode_signature_idx_]; + vec_output_buffers.reserve(out_names.size()); + for (const auto& name : out_names) { + if (output_logits && name == signatures_.output_logits) { + LITERT_ASSIGN_OR_RETURN(auto output_buffer_dup, + output_logits->Duplicate()); + output_buffer_dup.ClearEvent(); + vec_output_buffers.push_back(std::move(output_buffer_dup)); + } else if (auto it = decode_output_buffers_.find(name); + it != decode_output_buffers_.end()) { + LITERT_ASSIGN_OR_RETURN(auto output_buffer_dup, it->second.Duplicate()); + 
output_buffer_dup.ClearEvent(); + vec_output_buffers.push_back(std::move(output_buffer_dup)); + } else if (auto it = output_kv_cache_buffers_->find(name); + it != output_kv_cache_buffers_->end()) { + LITERT_ASSIGN_OR_RETURN(auto output_buffer_dup, it->second.Duplicate()); + output_buffer_dup.ClearEvent(); + vec_output_buffers.push_back(std::move(output_buffer_dup)); + } else { + return absl::NotFoundError(absl::StrCat("Missing output buffer: ", name)); + } } bool async = true; - LITERT_RETURN_IF_ERROR( - compiled_model_->RunAsync(kDecodeSignatureRunner, decode_input_buffers, - decode_output_buffers, async)); + LITERT_RETURN_IF_ERROR(compiled_model_->RunAsync( + decode_signature_idx_, vec_input_buffers, vec_output_buffers, async)); if (!gpu_optimized_single_buffer_cache_) { std::swap(input_kv_cache_buffers_, output_kv_cache_buffers_); @@ -1263,16 +1446,14 @@ absl::Status LlmLiteRtCompiledModelExecutorBase::InitializeSampler( if (sampler_handles_input_) { ABSL_LOG(INFO) << "Sampler will handle decode input tensors."; if (!decode_prev_input_pos_) { - LITERT_ASSIGN_OR_RETURN( - decode_prev_input_pos_, - compiled_model_->CreateInputBuffer(kDecodeSignatureRunner, - signatures_.input_positions)); + LITERT_ASSIGN_OR_RETURN(decode_prev_input_pos_, + CreateInputBuffer(kDecodeSignatureRunner, + signatures_.input_positions)); } if (!decode_prev_mask_ && signatures_.input_attn_mask.has_value()) { - LITERT_ASSIGN_OR_RETURN( - decode_prev_mask_, - compiled_model_->CreateInputBuffer(kDecodeSignatureRunner, - *signatures_.input_attn_mask)); + LITERT_ASSIGN_OR_RETURN(decode_prev_mask_, + CreateInputBuffer(kDecodeSignatureRunner, + *signatures_.input_attn_mask)); } // Set, then reset the input handling to get the underlying model ready, but // not to bind the input tensors. 
@@ -1547,10 +1728,10 @@ LlmLiteRtCompiledModelExecutorStatic::Create( compiled_model = std::make_unique(std::move(compiled_model_tmp)); } - absl::flat_hash_map decode_input_buffers; - absl::flat_hash_map decode_output_buffers; - absl::flat_hash_map input_kv_cache_buffers; - absl::flat_hash_map output_kv_cache_buffers; + absl::flat_hash_map decode_input_buffers; + absl::flat_hash_map decode_output_buffers; + absl::flat_hash_map input_kv_cache_buffers; + absl::flat_hash_map output_kv_cache_buffers; bool clear_kv_cache_before_prefill = !executor_settings.GetAdvancedSettings() || @@ -1653,17 +1834,17 @@ LlmLiteRtCompiledModelExecutorStatic::Create( << "Output logits must be (batch, seq, vocab)"; int batch_size = output_logits_buffer_tensor_type.Layout().Dimensions()[0]; - std::optional> + std::optional> decode_input_kv_cache_buffers; - std::optional> + std::optional> decode_output_kv_cache_buffers; if (batch_size > 1) { ABSL_LOG(INFO) << "Decode batch size is larger than 1. Allocate decode " << "only KV cache buffers."; decode_input_kv_cache_buffers = - absl::flat_hash_map(); + absl::flat_hash_map(); decode_output_kv_cache_buffers = - absl::flat_hash_map(); + absl::flat_hash_map(); for (auto input_name : decode_signature.InputNames()) { if (absl::StartsWith(input_name, kv_cache_k_root_name) || absl::StartsWith(input_name, kv_cache_v_root_name)) { @@ -1712,9 +1893,24 @@ LlmLiteRtCompiledModelExecutorStatic::Create( } } - return absl::WrapUnique(new LlmLiteRtCompiledModelExecutorStatic( - std::move(executor_settings), lrt_env, litert_model, - std::move(compiled_model), std::move(decode_input_buffers), + Expected is_fully_accelerated = compiled_model->IsFullyAccelerated(); + ASSIGN_OR_RETURN(auto cached_metadata, CacheTensorMetadata(*litert_model)); + + if (is_fully_accelerated.HasValue() && *is_fully_accelerated) { + // Dynamically advise kernel to recycle physical RAM pages instead of + // destructively releasing the underlying memory map, maximizing system + // 
resource returns while enforcing safe, uninterrupted TFLite execution + // contexts globally. + auto buf_status = + resources.GetTFLiteModelBuffer(ModelType::kTfLitePrefillDecode); + if (buf_status.ok()) { + MadviseMemoryBuffer(*buf_status); + } + } + + auto executor = absl::WrapUnique(new LlmLiteRtCompiledModelExecutorStatic( + std::move(executor_settings), lrt_env, std::move(compiled_model), + std::move(cached_metadata), std::move(decode_input_buffers), std::move(decode_output_buffers), std::move(input_kv_cache_buffers), std::move(output_kv_cache_buffers), std::move(decode_input_kv_cache_buffers), @@ -1722,6 +1918,8 @@ LlmLiteRtCompiledModelExecutorStatic::Create( signatures, batch_size, std::move(cache_path), std::move(embedding_lookup), std::move(per_layer_embedding_lookup), use_fp16_precision, activation_data_type, std::move(mtp_drafter))); + + return executor; } /* ===========================================================================*/ @@ -1785,22 +1983,20 @@ absl::Status LlmLiteRtCompiledModelExecutorDynamic::PrefillInternal( !executor_settings_.GetAdvancedSettings() || executor_settings_.GetAdvancedSettings()->clear_kv_cache_before_prefill; for (const auto& k_cache_input_name : key_cache_input_names_) { - RETURN_IF_ERROR(ResolveDynamicShape(model_, *compiled_model_, "prefill", - k_cache_input_name, prefill_length)); - LITERT_ASSIGN_OR_RETURN( - auto input_buffer, - compiled_model_->CreateInputBuffer("prefill", k_cache_input_name)); + RETURN_IF_ERROR( + ResolveDynamicShape("prefill", k_cache_input_name, prefill_length)); + LITERT_ASSIGN_OR_RETURN(auto input_buffer, + CreateInputBuffer("prefill", k_cache_input_name)); if (clear_kv_cache_before_prefill) { LITERT_RETURN_IF_ERROR(input_buffer.Clear()); } kv_cache_buffers_1_[k_cache_input_name] = std::move(input_buffer); } for (const auto& v_cache_input_name : value_cache_input_names_) { - RETURN_IF_ERROR(ResolveDynamicShape(model_, *compiled_model_, "prefill", - v_cache_input_name, prefill_length)); - 
LITERT_ASSIGN_OR_RETURN( - auto input_buffer, - compiled_model_->CreateInputBuffer("prefill", v_cache_input_name)); + RETURN_IF_ERROR( + ResolveDynamicShape("prefill", v_cache_input_name, prefill_length)); + LITERT_ASSIGN_OR_RETURN(auto input_buffer, + CreateInputBuffer("prefill", v_cache_input_name)); if (clear_kv_cache_before_prefill) { LITERT_RETURN_IF_ERROR(input_buffer.Clear()); } @@ -1822,18 +2018,16 @@ absl::Status LlmLiteRtCompiledModelExecutorDynamic::PrefillInternal( int new_kv_seq_len = kv_length + prefill_length; int entries_to_add = new_kv_seq_len - kv_length; for (const auto& k_cache_input_name : key_cache_input_names_) { - RETURN_IF_ERROR(ResolveDynamicShape(model_, *compiled_model_, "prefill", - k_cache_input_name, - new_kv_seq_len)); + RETURN_IF_ERROR( + ResolveDynamicShape("prefill", k_cache_input_name, new_kv_seq_len)); ASSIGN_OR_RETURN(kv_cache_buffers_1_[k_cache_input_name], ResizeKVCacheTensorBuffer( env_, kv_cache_buffers_1_[k_cache_input_name], key_dynamic_dim_index_, entries_to_add)); } for (const auto& v_cache_input_name : value_cache_input_names_) { - RETURN_IF_ERROR(ResolveDynamicShape(model_, *compiled_model_, "prefill", - v_cache_input_name, - new_kv_seq_len)); + RETURN_IF_ERROR( + ResolveDynamicShape("prefill", v_cache_input_name, new_kv_seq_len)); ASSIGN_OR_RETURN(kv_cache_buffers_1_[v_cache_input_name], ResizeKVCacheTensorBuffer( env_, kv_cache_buffers_1_[v_cache_input_name], @@ -1843,7 +2037,7 @@ absl::Status LlmLiteRtCompiledModelExecutorDynamic::PrefillInternal( } } - absl::flat_hash_map prefill_input_buffers; + absl::flat_hash_map prefill_input_buffers; RETURN_IF_ERROR(CreatePrefillInputBuffers("prefill", prefill_length, kv_length, prefill_input_buffers)); @@ -1873,16 +2067,16 @@ absl::Status LlmLiteRtCompiledModelExecutorDynamic::DecodeInternal( int entries_to_add = kv_increament_size_; int new_kv_len = current_kv_len + entries_to_add; for (const auto& k_cache_input_name : key_cache_input_names_) { - 
RETURN_IF_ERROR(ResolveDynamicShape(model_, *compiled_model_, "decode", - k_cache_input_name, new_kv_len)); + RETURN_IF_ERROR( + ResolveDynamicShape("decode", k_cache_input_name, new_kv_len)); ASSIGN_OR_RETURN(kv_cache_buffers_1_[k_cache_input_name], ResizeKVCacheTensorBuffer( env_, kv_cache_buffers_1_[k_cache_input_name], key_dynamic_dim_index_, entries_to_add)); } for (const auto& v_cache_input_name : value_cache_input_names_) { - RETURN_IF_ERROR(ResolveDynamicShape(model_, *compiled_model_, "decode", - v_cache_input_name, new_kv_len)); + RETURN_IF_ERROR( + ResolveDynamicShape("decode", v_cache_input_name, new_kv_len)); ASSIGN_OR_RETURN(kv_cache_buffers_1_[v_cache_input_name], ResizeKVCacheTensorBuffer( env_, kv_cache_buffers_1_[v_cache_input_name], @@ -1891,13 +2085,11 @@ absl::Status LlmLiteRtCompiledModelExecutorDynamic::DecodeInternal( current_kv_len = new_kv_len; } - RETURN_IF_ERROR(ResolveDynamicShape(model_, *compiled_model_, "decode", - signatures_.input_attn_mask.value(), - current_kv_len)); + RETURN_IF_ERROR(ResolveDynamicShape( + "decode", signatures_.input_attn_mask.value(), current_kv_len)); LITERT_ASSIGN_OR_RETURN( decode_input_buffers_[signatures_.input_attn_mask.value()], - compiled_model_->CreateInputBuffer("decode", - signatures_.input_attn_mask.value())); + CreateInputBuffer("decode", signatures_.input_attn_mask.value())); return LlmLiteRtCompiledModelExecutorBase::DecodeInternal(token, output_logits); @@ -1967,8 +2159,8 @@ LlmLiteRtCompiledModelExecutorDynamic::Create( std::make_unique(std::move(compiled_model_tmp)); } - absl::flat_hash_map decode_input_buffers; - absl::flat_hash_map decode_output_buffers; + absl::flat_hash_map decode_input_buffers; + absl::flat_hash_map decode_output_buffers; LITERT_ASSIGN_OR_RETURN(auto decode_signature, litert_model->FindSignature(kDecodeSignatureRunner)); @@ -2038,15 +2230,34 @@ LlmLiteRtCompiledModelExecutorDynamic::Create( std::unique_ptr per_layer_embedding_lookup; 
RETURN_IF_ERROR(InitializeEmbeddingLookups( lrt_env, resources, embedding_lookup, per_layer_embedding_lookup)); - return absl::WrapUnique(new LlmLiteRtCompiledModelExecutorDynamic( - std::move(executor_settings), lrt_env, litert_model, - std::move(compiled_model), std::move(decode_input_buffers), + + Expected is_fully_accelerated = compiled_model->IsFullyAccelerated(); + ASSIGN_OR_RETURN(auto cached_metadata, CacheTensorMetadata(*litert_model)); + + if (is_fully_accelerated.HasValue() && *is_fully_accelerated) { + // Dynamically advise kernel to recycle physical RAM pages instead of + // destructively releasing the underlying memory map, maximizing system + // resource returns while enforcing safe, uninterrupted TFLite execution + // contexts globally. + auto buf_status = + resources.GetTFLiteModelBuffer(ModelType::kTfLitePrefillDecode); + if (buf_status.ok()) { + MadviseMemoryBuffer(*buf_status); + } + } + + auto executor = absl::WrapUnique(new LlmLiteRtCompiledModelExecutorDynamic( + std::move(executor_settings), lrt_env, std::move(compiled_model), + std::move(cached_metadata), std::move(decode_input_buffers), std::move(decode_output_buffers), prefill_chunk_size, k_dynamic_dim, v_dynamic_dim, kv_increament_size, std::move(key_cache_input_names), std::move(value_cache_input_names), signatures, batch_size, std::move(weight_cache_path), std::move(embedding_lookup), - std::move(per_layer_embedding_lookup), /*use_fp16_precision=*/false, + std::move(per_layer_embedding_lookup), + /*use_fp16_precision=*/false, /*logits_data_type=*/LogitsDataType::FLOAT32)); + + return executor; } } // namespace litert::lm diff --git a/runtime/executor/llm_litert_compiled_model_executor.h b/runtime/executor/llm_litert_compiled_model_executor.h index 0c94ee889..cb999b36e 100644 --- a/runtime/executor/llm_litert_compiled_model_executor.h +++ b/runtime/executor/llm_litert_compiled_model_executor.h @@ -23,7 +23,6 @@ #include #include -#include "absl/base/nullability.h" // from 
@com_google_absl #include "absl/container/flat_hash_map.h" // from @com_google_absl #include "absl/status/status.h" // from @com_google_absl #include "absl/status/statusor.h" // from @com_google_absl @@ -56,6 +55,25 @@ class LlmLiteRtCompiledModelExecutorBase : public LlmExecutor { public: using LlmExecutor::Prefill; + struct TensorMetadata { + size_t signature_index; + size_t input_index; + litert::ElementType element_type; + std::vector dimensions; + }; + + struct CachedMetadata { + std::string prefill_signature_key; + absl::flat_hash_map signature_key_to_idx; + absl::flat_hash_map> + input_names_by_sig_idx; + absl::flat_hash_map> + output_names_by_sig_idx; + absl::flat_hash_map input_tensor_metadata; + }; + + static absl::StatusOr CacheTensorMetadata(const Model& model); + // Input APIs: // Basic API to trigger the "prefill" or "prefix" process. // Input is token ids with shape `[batch, sequence_length]` @@ -131,18 +149,15 @@ class LlmLiteRtCompiledModelExecutorBase : public LlmExecutor { protected: LlmLiteRtCompiledModelExecutorBase( LlmExecutorSettings executor_settings, Environment& env, - const Model* absl_nonnull model, std::unique_ptr compiled_model, - absl::flat_hash_map decode_input_buffers, - absl::flat_hash_map - decode_output_buffers, - absl::flat_hash_map - input_kv_cache_buffers, - absl::flat_hash_map - output_kv_cache_buffers, - std::optional> + CachedMetadata cached_metadata, + absl::flat_hash_map decode_input_buffers, + absl::flat_hash_map decode_output_buffers, + absl::flat_hash_map input_kv_cache_buffers, + absl::flat_hash_map output_kv_cache_buffers, + std::optional> decode_input_kv_cache_buffers, - std::optional> + std::optional> decode_output_kv_cache_buffers, ModelSignatures signatures, int output_batch_size, std::string weight_cache_path, @@ -152,8 +167,16 @@ class LlmLiteRtCompiledModelExecutorBase : public LlmExecutor { std::unique_ptr mtp_drafter) : executor_settings_(std::move(executor_settings)), env_(env), - model_(*model), 
compiled_model_(std::move(compiled_model)), + prefill_signature_key_( + std::move(cached_metadata.prefill_signature_key)), + signature_key_to_idx_(std::move(cached_metadata.signature_key_to_idx)), + input_names_by_sig_idx_( + std::move(cached_metadata.input_names_by_sig_idx)), + output_names_by_sig_idx_( + std::move(cached_metadata.output_names_by_sig_idx)), + input_tensor_metadata_( + std::move(cached_metadata.input_tensor_metadata)), decode_input_buffers_(std::move(decode_input_buffers)), decode_output_buffers_(std::move(decode_output_buffers)), kv_cache_buffers_1_(std::move(input_kv_cache_buffers)), @@ -170,7 +193,7 @@ class LlmLiteRtCompiledModelExecutorBase : public LlmExecutor { logits_data_type_(logits_data_type), mtp_drafter_(std::move(mtp_drafter)) { auto processed_context = std::make_unique( - std::nullopt, absl::flat_hash_map(), + std::nullopt, absl::flat_hash_map(), ProcessedTokens()); auto runtime_config = std::make_unique(); runtime_config->output_heads = output_batch_size; @@ -208,7 +231,7 @@ class LlmLiteRtCompiledModelExecutorBase : public LlmExecutor { // with a certain length synchronously or asynchronously. absl::Status PrefillInternal( absl::string_view prefill_signature, - absl::flat_hash_map& + absl::flat_hash_map& prefill_input_buffers, absl::Span ids, bool async); @@ -216,7 +239,7 @@ class LlmLiteRtCompiledModelExecutorBase : public LlmExecutor { // and run prefill signature. absl::Status BindTensorsAndRunPrefill( absl::string_view prefill_signature, - absl::flat_hash_map& + absl::flat_hash_map& prefill_input_buffers, bool async); @@ -239,8 +262,7 @@ class LlmLiteRtCompiledModelExecutorBase : public LlmExecutor { absl::Status CreatePrefillInputBuffers( absl::string_view prefill_signature, int sequence_length, int context_length, - absl::flat_hash_map& - prefill_input_buffers); + absl::flat_hash_map& prefill_input_buffers); // Fills the input buffer from the unprocessed token. 
absl::Status FillInputBufferWithToken( @@ -270,22 +292,37 @@ class LlmLiteRtCompiledModelExecutorBase : public LlmExecutor { LlmExecutorSettings executor_settings_; Environment& env_; - const Model& model_; std::unique_ptr compiled_model_; - absl::flat_hash_map decode_input_buffers_; - absl::flat_hash_map decode_output_buffers_; + std::string prefill_signature_key_; + size_t decode_signature_idx_ = 0; + + absl::flat_hash_map signature_key_to_idx_; + absl::flat_hash_map> input_names_by_sig_idx_; + absl::flat_hash_map> + output_names_by_sig_idx_; + + absl::flat_hash_map input_tensor_metadata_; + absl::Status ResolveDynamicShape(absl::string_view signature, + absl::string_view tensor_name, + int new_value); + absl::StatusOr HasDynamicDim(absl::string_view signature, + absl::string_view tensor_name); + absl::StatusOr CreateInputBuffer( + absl::string_view signature, absl::string_view tensor_name) const; + + absl::flat_hash_map decode_input_buffers_; + absl::flat_hash_map decode_output_buffers_; // KV cache double buffers because some GPU backends can't allocate one buffer // for both read and write at the same time. - absl::flat_hash_map kv_cache_buffers_1_; - absl::flat_hash_map kv_cache_buffers_2_; - absl::flat_hash_map* input_kv_cache_buffers_; - absl::flat_hash_map* - output_kv_cache_buffers_; + absl::flat_hash_map kv_cache_buffers_1_; + absl::flat_hash_map kv_cache_buffers_2_; + absl::flat_hash_map* input_kv_cache_buffers_; + absl::flat_hash_map* output_kv_cache_buffers_; // KV cache (double) buffers used during decode when output_batch_size_ > 1. - std::optional> + std::optional> decode_kv_cache_buffers_1_; - std::optional> + std::optional> decode_kv_cache_buffers_2_; // The signatures of the model. 
@@ -349,18 +386,15 @@ class LlmLiteRtCompiledModelExecutorStatic private: LlmLiteRtCompiledModelExecutorStatic( LlmExecutorSettings executor_settings, Environment& env, - const Model* absl_nonnull model, std::unique_ptr compiled_model, - absl::flat_hash_map decode_input_buffers, - absl::flat_hash_map - decode_output_buffers, - absl::flat_hash_map - input_kv_cache_buffers, - absl::flat_hash_map - output_kv_cache_buffers, - std::optional> + CachedMetadata cached_metadata, + absl::flat_hash_map decode_input_buffers, + absl::flat_hash_map decode_output_buffers, + absl::flat_hash_map input_kv_cache_buffers, + absl::flat_hash_map output_kv_cache_buffers, + std::optional> decode_input_kv_cache_buffers, - std::optional> + std::optional> decode_output_kv_cache_buffers, SortedPrefillSignatureMap prefill_signature_map, ModelSignatures signatures, int output_batch_size, @@ -372,9 +406,9 @@ class LlmLiteRtCompiledModelExecutorStatic LogitsDataType logits_data_type = LogitsDataType::FLOAT32, std::unique_ptr mtp_drafter = nullptr) : LlmLiteRtCompiledModelExecutorBase( - std::move(executor_settings), env, model, std::move(compiled_model), - std::move(decode_input_buffers), std::move(decode_output_buffers), - std::move(input_kv_cache_buffers), + std::move(executor_settings), env, std::move(compiled_model), + std::move(cached_metadata), std::move(decode_input_buffers), + std::move(decode_output_buffers), std::move(input_kv_cache_buffers), std::move(output_kv_cache_buffers), std::move(decode_input_kv_cache_buffers), std::move(decode_output_kv_cache_buffers), signatures, @@ -388,7 +422,7 @@ class LlmLiteRtCompiledModelExecutorStatic // to refer to them by just their unique name. 
absl::flat_hash_map< std::string /*prefill_signature_name*/, - absl::flat_hash_map> + absl::flat_hash_map> prefill_input_buffers_; std::optional do_prefill_sync_; }; @@ -411,11 +445,10 @@ class LlmLiteRtCompiledModelExecutorDynamic private: LlmLiteRtCompiledModelExecutorDynamic( LlmExecutorSettings executor_settings, Environment& env, - const Model* absl_nonnull model, std::unique_ptr compiled_model, - absl::flat_hash_map decode_input_buffers, - absl::flat_hash_map - decode_output_buffers, + CachedMetadata cached_metadata, + absl::flat_hash_map decode_input_buffers, + absl::flat_hash_map decode_output_buffers, int prefill_chunk_size, int key_dynamic_dim_index, int value_dynamic_dim_index, int kv_increament_size, std::vector key_cache_input_names, @@ -429,8 +462,9 @@ class LlmLiteRtCompiledModelExecutorDynamic LogitsDataType logits_data_type = LogitsDataType::FLOAT32, std::unique_ptr mtp_drafter = nullptr) : LlmLiteRtCompiledModelExecutorBase( - std::move(executor_settings), env, model, std::move(compiled_model), - std::move(decode_input_buffers), std::move(decode_output_buffers), + std::move(executor_settings), env, std::move(compiled_model), + std::move(cached_metadata), std::move(decode_input_buffers), + std::move(decode_output_buffers), /*input_kv_cache_buffers=*/{}, /*output_kv_cache_buffers=*/{}, /*decode_input_kv_cache_buffers=*/std::nullopt, diff --git a/runtime/executor/llm_litert_compiled_model_executor_test.cc b/runtime/executor/llm_litert_compiled_model_executor_test.cc index 69635091a..acb7f281e 100644 --- a/runtime/executor/llm_litert_compiled_model_executor_test.cc +++ b/runtime/executor/llm_litert_compiled_model_executor_test.cc @@ -462,6 +462,10 @@ class TfLiteModelResources : public ModelResources { return absl::UnimplementedError("Unsupported model type"); } + absl::Status ReleaseTFLiteModel(ModelType model_type) override { + return absl::UnimplementedError("ReleaseTFLiteModel not implemented"); + } + absl::StatusOr GetTFLiteModelBuffer( ModelType 
model_type) override { return absl::UnimplementedError("GetTFLiteModelBuffer not implemented."); diff --git a/runtime/executor/llm_litert_mtp_drafter.cc b/runtime/executor/llm_litert_mtp_drafter.cc index d5ba22973..3b8e0ffe4 100644 --- a/runtime/executor/llm_litert_mtp_drafter.cc +++ b/runtime/executor/llm_litert_mtp_drafter.cc @@ -180,10 +180,8 @@ LlmLiteRtMtpDrafter::Create(Environment& env, ModelResources& resources, auto compiled_model, CompiledModel::Create(env, model->Get(), compilation_options)); - absl::flat_hash_map - mtp_drafter_input_buffers; - absl::flat_hash_map - mtp_drafter_output_buffers; + absl::flat_hash_map mtp_drafter_input_buffers; + absl::flat_hash_map mtp_drafter_output_buffers; std::vector kv_cache_input_names; LITERT_ASSIGN_OR_RETURN(SimpleSignature drafter_signature, compiled_model.GetSignature(/*signature_index=*/0)); @@ -210,8 +208,8 @@ LlmLiteRtMtpDrafter::Create(Environment& env, ModelResources& resources, LITERT_ASSIGN_OR_RETURN(SimpleSignature verify_signature, base_model.FindSignature(kVerifySignatureRunner)); - absl::flat_hash_map verifier_input_buffers; - absl::flat_hash_map verifier_output_buffers; + absl::flat_hash_map verifier_input_buffers; + absl::flat_hash_map verifier_output_buffers; int num_draft_steps; { for (absl::string_view input_name : verify_signature.InputNames()) { @@ -277,8 +275,8 @@ LlmLiteRtMtpDrafter::Create(Environment& env, ModelResources& resources, } absl::Status LlmLiteRtMtpDrafter::PrepareDrafterInputBuffers( - int position, absl::flat_hash_map& - output_kv_cache_buffers) { + int position, + absl::flat_hash_map& output_kv_cache_buffers) { for (const auto& kv_cache_input_name : kv_cache_input_names_) { LITERT_ASSIGN_OR_RETURN( auto kv_cache_buffer_dup, @@ -337,9 +335,20 @@ absl::StatusOr> LlmLiteRtMtpDrafter::RunDraftingLoop( } bool async = true; + absl::flat_hash_map draft_input_buffers; + for (const auto& [name, buffer] : active_drafter_input_buffers_) { + LITERT_ASSIGN_OR_RETURN(auto buffer_dup, 
buffer.Duplicate()); + draft_input_buffers[name] = std::move(buffer_dup); + } + absl::flat_hash_map draft_output_buffers; + for (const auto& [name, buffer] : active_drafter_output_buffers_) { + LITERT_ASSIGN_OR_RETURN(auto buffer_dup, buffer.Duplicate()); + draft_output_buffers[name] = std::move(buffer_dup); + } + LITERT_RETURN_IF_ERROR(mtp_drafter_model_.RunAsync( - drafter_signature_.Key(), active_drafter_input_buffers_, - active_drafter_output_buffers_, async)); + drafter_signature_.Key(), draft_input_buffers, draft_output_buffers, + async)); RETURN_IF_ERROR(drafter_sampler_->SampleToIdAndScoreBuffer( active_drafter_output_buffers_["logits"], drafter_id_tensor_, @@ -357,8 +366,7 @@ absl::StatusOr> LlmLiteRtMtpDrafter::RunDraftingLoop( absl::Status LlmLiteRtMtpDrafter::PrepareVerifierInputBuffers( int position, int token_id, const std::vector& drafted_tokens, - absl::flat_hash_map& - input_kv_cache_buffers) { + absl::flat_hash_map& input_kv_cache_buffers) { { LITERT_ASSIGN_OR_RETURN( auto verifier_input_pos_lock_and_addr, @@ -404,8 +412,7 @@ absl::Status LlmLiteRtMtpDrafter::PrepareVerifierInputBuffers( } absl::Status LlmLiteRtMtpDrafter::PrepareVerifierOutputBuffers( - absl::flat_hash_map& - output_kv_cache_buffers) { + absl::flat_hash_map& output_kv_cache_buffers) { for (const auto& [output_name, output_buffer] : output_kv_cache_buffers) { LITERT_ASSIGN_OR_RETURN(auto output_buffer_dup, output_buffer.Duplicate()); active_verifier_output_buffers_[output_name] = std::move(output_buffer_dup); @@ -418,9 +425,20 @@ absl::Status LlmLiteRtMtpDrafter::PrepareVerifierOutputBuffers( absl::StatusOr> LlmLiteRtMtpDrafter::RunVerification() { bool async = true; - LITERT_RETURN_IF_ERROR(base_model_.RunAsync( - verify_signature_.Key(), active_verifier_input_buffers_, - active_verifier_output_buffers_, async)); + absl::flat_hash_map verify_input_buffers; + for (const auto& [name, buffer] : active_verifier_input_buffers_) { + LITERT_ASSIGN_OR_RETURN(auto buffer_dup, 
buffer.Duplicate()); + verify_input_buffers[name] = std::move(buffer_dup); + } + absl::flat_hash_map verify_output_buffers; + for (const auto& [name, buffer] : active_verifier_output_buffers_) { + LITERT_ASSIGN_OR_RETURN(auto buffer_dup, buffer.Duplicate()); + verify_output_buffers[name] = std::move(buffer_dup); + } + + LITERT_RETURN_IF_ERROR(base_model_.RunAsync(verify_signature_.Key(), + verify_input_buffers, + verify_output_buffers, async)); RETURN_IF_ERROR(verifier_sampler_->SampleToIdAndScoreBuffer( active_verifier_output_buffers_.at("logits"), verifier_id_tensor_, @@ -434,10 +452,8 @@ absl::StatusOr> LlmLiteRtMtpDrafter::RunVerification() { absl::StatusOr>> LlmLiteRtMtpDrafter::Draft( int position, int token_id, std::optional activations, - absl::flat_hash_map& - input_kv_cache_buffers, - absl::flat_hash_map& - output_kv_cache_buffers) { + absl::flat_hash_map& input_kv_cache_buffers, + absl::flat_hash_map& output_kv_cache_buffers) { RETURN_IF_ERROR( PrepareDrafterInputBuffers(position - 1, output_kv_cache_buffers)); diff --git a/runtime/executor/llm_litert_mtp_drafter.h b/runtime/executor/llm_litert_mtp_drafter.h index 166d84a80..2428f35fb 100644 --- a/runtime/executor/llm_litert_mtp_drafter.h +++ b/runtime/executor/llm_litert_mtp_drafter.h @@ -68,30 +68,23 @@ class LlmLiteRtMtpDrafter { // [batch_size, num_tokens]. 
absl::StatusOr>> Draft( int position, int token_id, std::optional activations, - absl::flat_hash_map& - input_kv_cache_buffers, - absl::flat_hash_map& - output_kv_cache_buffers); + absl::flat_hash_map& input_kv_cache_buffers, + absl::flat_hash_map& output_kv_cache_buffers); private: - LlmLiteRtMtpDrafter(CompiledModel mtp_drafter_model, - SimpleSignature drafter_signature, - CompiledModel& base_model, - SimpleSignature verify_signature, - EmbeddingLookupManager& embedding_manager, - EmbeddingLookupManager& ple_manager, - std::unique_ptr drafter_sampler, - std::unique_ptr verifier_sampler, - std::vector kv_cache_input_names, - absl::flat_hash_map - drafter_input_buffers, - absl::flat_hash_map - drafter_output_buffers, - absl::flat_hash_map - verifier_input_buffers, - absl::flat_hash_map - verifier_output_buffers, - int num_draft_steps) + LlmLiteRtMtpDrafter( + CompiledModel mtp_drafter_model, SimpleSignature drafter_signature, + CompiledModel& base_model, SimpleSignature verify_signature, + EmbeddingLookupManager& embedding_manager, + EmbeddingLookupManager& ple_manager, + std::unique_ptr drafter_sampler, + std::unique_ptr verifier_sampler, + std::vector kv_cache_input_names, + absl::flat_hash_map drafter_input_buffers, + absl::flat_hash_map drafter_output_buffers, + absl::flat_hash_map verifier_input_buffers, + absl::flat_hash_map verifier_output_buffers, + int num_draft_steps) : mtp_drafter_model_(std::move(mtp_drafter_model)), drafter_signature_(std::move(drafter_signature)), base_model_(base_model), @@ -125,8 +118,8 @@ class LlmLiteRtMtpDrafter { } absl::Status PrepareDrafterInputBuffers( - int position, absl::flat_hash_map& - output_kv_cache_buffers); + int position, + absl::flat_hash_map& output_kv_cache_buffers); absl::Status PrepareDrafterOutputBuffers(); @@ -135,12 +128,10 @@ class LlmLiteRtMtpDrafter { absl::Status PrepareVerifierInputBuffers( int position, int token_id, const std::vector& drafted_tokens, - absl::flat_hash_map& - input_kv_cache_buffers); + 
absl::flat_hash_map& input_kv_cache_buffers); absl::Status PrepareVerifierOutputBuffers( - absl::flat_hash_map& - output_kv_cache_buffers); + absl::flat_hash_map& output_kv_cache_buffers); absl::StatusOr> RunVerification(); @@ -168,29 +159,26 @@ class LlmLiteRtMtpDrafter { // - input_position [batch, sequence_length] // - mask [batch, 1, sequence_length = 1, context] // - activations [batch, sequence_length = 1, hidden_size * 2] - absl::flat_hash_map drafter_input_buffers_; + absl::flat_hash_map drafter_input_buffers_; // - logits [batch, sequence_length, vocab_size] // - projected_logits [batch, sequence_length, hidden_size] - absl::flat_hash_map drafter_output_buffers_; + absl::flat_hash_map drafter_output_buffers_; // Verifier owned buffers. // - input_position [batch, draft_steps + 1] // - mask [batch, 1, draft_steps + 1, context] // - embeddings [batch, draft_steps + 1, hidden_size] // - per_layer_embeddings [batch, draft_steps + 1, ...] - absl::flat_hash_map verifier_input_buffers_; + absl::flat_hash_map verifier_input_buffers_; // - logits [batch, draft_steps + 1, vocab_size] // - activations [batch, draft_steps + 1, hidden_size] - absl::flat_hash_map verifier_output_buffers_; + absl::flat_hash_map verifier_output_buffers_; // Cached maps for Run to avoid map creation overhead. - absl::flat_hash_map - active_drafter_input_buffers_; - absl::flat_hash_map - active_drafter_output_buffers_; - absl::flat_hash_map - active_verifier_input_buffers_; - absl::flat_hash_map + absl::flat_hash_map active_drafter_input_buffers_; + absl::flat_hash_map active_drafter_output_buffers_; + absl::flat_hash_map active_verifier_input_buffers_; + absl::flat_hash_map active_verifier_output_buffers_; // Pre-allocated temporary tensors for sampling. 
diff --git a/runtime/executor/llm_litert_npu_compiled_model_executor.cc b/runtime/executor/llm_litert_npu_compiled_model_executor.cc index 26b1ab66f..8bc5bbfab 100644 --- a/runtime/executor/llm_litert_npu_compiled_model_executor.cc +++ b/runtime/executor/llm_litert_npu_compiled_model_executor.cc @@ -830,8 +830,7 @@ LlmLiteRtNpuCompiledModelExecutor::CreateRopeContextWithBufferSharing( } absl::Status LlmLiteRtNpuCompiledModelExecutor::AllocateTransformerBuffers( - litert::Environment& env, const litert::Model* transformer_model, - CompiledModel& llm_compiled_model, + litert::Environment& env, CompiledModel& llm_compiled_model, absl::flat_hash_map& gemma_prefill_input_buffers, absl::flat_hash_map& @@ -846,10 +845,15 @@ absl::Status LlmLiteRtNpuCompiledModelExecutor::AllocateTransformerBuffers( decode_output_kv_cache_slice_buffers, absl::flat_hash_map& verify_output_kv_cache_slice_buffers) { - auto prefill_signature = transformer_model->FindSignature(kPrefillSignature); + LITERT_ASSIGN_OR_RETURN( + auto prefill_input_names, + llm_compiled_model.GetSignatureInputNames(kPrefillSignature)); + LITERT_ASSIGN_OR_RETURN( + auto prefill_output_names, + llm_compiled_model.GetSignatureOutputNames(kPrefillSignature)); // Create input buffers for prefill signature. - for (auto input_name : prefill_signature->InputNames()) { + for (auto input_name : prefill_input_names) { if (absl::StartsWith(input_name, kv_cache_k_root_name) || absl::StartsWith(input_name, kv_cache_v_root_name)) { LITERT_ASSIGN_OR_RETURN( @@ -865,8 +869,13 @@ absl::Status LlmLiteRtNpuCompiledModelExecutor::AllocateTransformerBuffers( } // Create input buffers for decode signature. Skip kv cache input buffers as // they are already created in the prefill signature. 
- auto decode_signature = transformer_model->FindSignature(kDecodeSignature); - for (auto input_name : decode_signature->InputNames()) { + LITERT_ASSIGN_OR_RETURN( + auto decode_input_names, + llm_compiled_model.GetSignatureInputNames(kDecodeSignature)); + LITERT_ASSIGN_OR_RETURN( + auto decode_output_names, + llm_compiled_model.GetSignatureOutputNames(kDecodeSignature)); + for (auto input_name : decode_input_names) { if (absl::StartsWith(input_name, kv_cache_k_root_name) || absl::StartsWith(input_name, kv_cache_v_root_name)) { // Create the input kv cache buffer for the decode signature if it is not @@ -886,7 +895,7 @@ absl::Status LlmLiteRtNpuCompiledModelExecutor::AllocateTransformerBuffers( } // Create output buffers for prefill signature. - for (auto output_name : prefill_signature->OutputNames()) { + for (auto output_name : prefill_output_names) { if (absl::StartsWith(output_name, kv_cache_slice_k_root_name) || absl::StartsWith(output_name, kv_cache_slice_v_root_name)) { LITERT_ASSIGN_OR_RETURN( @@ -896,7 +905,7 @@ absl::Status LlmLiteRtNpuCompiledModelExecutor::AllocateTransformerBuffers( } } // Create output buffers for decode signature. - for (auto output_name : decode_signature->OutputNames()) { + for (auto output_name : decode_output_names) { if (absl::StartsWith(output_name, kv_cache_slice_k_root_name) || absl::StartsWith(output_name, kv_cache_slice_v_root_name)) { LITERT_ASSIGN_OR_RETURN( @@ -906,16 +915,21 @@ absl::Status LlmLiteRtNpuCompiledModelExecutor::AllocateTransformerBuffers( } // Create input/output buffers for verify signature if it exists. - auto verify_signature = - transformer_model->FindSignature(LlmSignatures::kVerifyLlm); - if (verify_signature) { - for (auto input_name : verify_signature->InputNames()) { + if (auto verify_input_names_res = llm_compiled_model.GetSignatureInputNames( + LlmSignatures::kVerifyLlm)) { + ABSL_LOG(INFO) << "Verify signature found. 
Inputs:"; + for (auto input_name : *verify_input_names_res) { + ABSL_LOG(INFO) << " - " << input_name; LITERT_ASSIGN_OR_RETURN(gemma_verify_input_buffers[input_name], llm_compiled_model.CreateInputBuffer( LlmSignatures::kVerifyLlm, input_name)); gemma_verify_input_buffers[input_name].Clear(); } - for (auto output_name : verify_signature->OutputNames()) { + + LITERT_ASSIGN_OR_RETURN( + auto verify_output_names, + llm_compiled_model.GetSignatureOutputNames(LlmSignatures::kVerifyLlm)); + for (auto output_name : verify_output_names) { if (absl::StartsWith(output_name, kv_cache_slice_k_root_name) || absl::StartsWith(output_name, kv_cache_slice_v_root_name)) { LITERT_ASSIGN_OR_RETURN( @@ -2742,8 +2756,14 @@ LlmLiteRtNpuCompiledModelExecutor::CreateForModelHasPerLayerEmbedding( absl::flat_hash_map verify_output_kv_cache_slice_buffers; + if (auto is_fully_accelerated = llm_compiled_model.IsFullyAccelerated(); + is_fully_accelerated.HasValue() && *is_fully_accelerated) { + RETURN_IF_ERROR( + resources.ReleaseTFLiteModel(ModelType::kTfLitePrefillDecode)); + } + RETURN_IF_ERROR(AllocateTransformerBuffers( - env, transformer_model, llm_compiled_model, gemma_prefill_input_buffers, + env, llm_compiled_model, gemma_prefill_input_buffers, gemma_decode_input_buffers, gemma_verify_input_buffers, input_kv_cache_buffers, prefill_output_kv_cache_slice_buffers, decode_output_kv_cache_slice_buffers, @@ -2772,6 +2792,12 @@ LlmLiteRtNpuCompiledModelExecutor::CreateForModelHasPerLayerEmbedding( verify_output_kv_cache_slice_buffers, gemma_prefill_input_buffers, gemma_decode_input_buffers, gemma_verify_input_buffers)); + if (auto is_fully_accelerated = llm_compiled_model.IsFullyAccelerated(); + is_fully_accelerated.HasValue() && *is_fully_accelerated) { + RETURN_IF_ERROR( + resources.ReleaseTFLiteModel(ModelType::kTfLitePrefillDecode)); + } + LITERT_ASSIGN_OR_RETURN(auto npu_auxiliary_lrt_model, resources.GetTFLiteModel(ModelType::kTfLiteAux)); @@ -2780,6 +2806,11 @@ 
LlmLiteRtNpuCompiledModelExecutor::CreateForModelHasPerLayerEmbedding( CreateNpuAuxiliaryContext(env, *npu_auxiliary_lrt_model, executor_settings)); + if (auto is_fully_accelerated = llm_compiled_model.IsFullyAccelerated(); + is_fully_accelerated.HasValue() && *is_fully_accelerated) { + RETURN_IF_ERROR(resources.ReleaseTFLiteModel(ModelType::kTfLiteAux)); + } + LITERT_ASSIGN_OR_RETURN( auto mask_context, CreateMaskContextWithBufferSharing( @@ -2855,10 +2886,15 @@ LlmLiteRtNpuCompiledModelExecutor::CreateForModelHasPerLayerEmbedding( add_multi_modal_end_model(ModelType::kTfLiteEndOfVision, litert::lm::ExecutorVisionData::kEndToken); + absl::flat_hash_map raw_end_of_multi_modal_models; + for (const auto& [token, model] : end_of_multi_modal_embedding_models) { + raw_end_of_multi_modal_models[token] = model; + } + LITERT_ASSIGN_OR_RETURN( std::unique_ptr embedding_lookup_manager, EmbeddingLookupManager::Create(env, embedder_lrt_model, - end_of_multi_modal_embedding_models, true, + raw_end_of_multi_modal_models, true, "decode_embedder")); bool use_hw_ple_for_npu = false; @@ -2867,6 +2903,10 @@ LlmLiteRtNpuCompiledModelExecutor::CreateForModelHasPerLayerEmbedding( use_hw_ple_for_npu = npu_config_status->use_hw_ple_for_npu; } + if (auto is_fully_accelerated = llm_compiled_model.IsFullyAccelerated(); + is_fully_accelerated.HasValue() && *is_fully_accelerated) { + RETURN_IF_ERROR(resources.ReleaseTFLiteModel(ModelType::kTfLiteEmbedder)); + } std::optional embedder_per_layer_context = std::nullopt; @@ -2965,6 +3005,14 @@ LlmLiteRtNpuCompiledModelExecutor::CreateForModelHasPerLayerEmbedding( RETURN_IF_ERROR(WarmupDrafterInference(drafter_context.value(), drafter_aux_context.value())); + + if (auto is_fully_accelerated = llm_compiled_model.IsFullyAccelerated(); + is_fully_accelerated.HasValue() && *is_fully_accelerated) { + RETURN_IF_ERROR( + resources.ReleaseTFLiteModel(ModelType::kTfLiteMtpDrafter)); + + 
RETURN_IF_ERROR(resources.ReleaseTFLiteModel(ModelType::kTfLiteMtpAux)); + } } } @@ -3013,8 +3061,14 @@ LlmLiteRtNpuCompiledModelExecutor::CreateForModelWithoutPerLayerEmbedding( absl::flat_hash_map verify_output_kv_cache_slice_buffers; + if (auto is_fully_accelerated = llm_compiled_model.IsFullyAccelerated(); + is_fully_accelerated.HasValue() && *is_fully_accelerated) { + RETURN_IF_ERROR( + resources.ReleaseTFLiteModel(ModelType::kTfLitePrefillDecode)); + } + RETURN_IF_ERROR(AllocateTransformerBuffers( - env, transformer_model, llm_compiled_model, gemma_prefill_input_buffers, + env, llm_compiled_model, gemma_prefill_input_buffers, gemma_decode_input_buffers, gemma_verify_input_buffers, input_kv_cache_buffers, prefill_output_kv_cache_slice_buffers, decode_output_kv_cache_slice_buffers, @@ -3063,6 +3117,12 @@ LlmLiteRtNpuCompiledModelExecutor::CreateForModelWithoutPerLayerEmbedding( llm_inference_context.decode_input_buffers[cache_v17] = std::move(buffer_v); } + if (auto is_fully_accelerated = llm_compiled_model.IsFullyAccelerated(); + is_fully_accelerated.HasValue() && *is_fully_accelerated) { + RETURN_IF_ERROR( + resources.ReleaseTFLiteModel(ModelType::kTfLitePrefillDecode)); + } + LITERT_ASSIGN_OR_RETURN(auto npu_auxiliary_lrt_model, resources.GetTFLiteModel(ModelType::kTfLiteAux)); @@ -3071,6 +3131,11 @@ LlmLiteRtNpuCompiledModelExecutor::CreateForModelWithoutPerLayerEmbedding( CreateNpuAuxiliaryContext(env, *npu_auxiliary_lrt_model, executor_settings)); + if (auto is_fully_accelerated = llm_compiled_model.IsFullyAccelerated(); + is_fully_accelerated.HasValue() && *is_fully_accelerated) { + RETURN_IF_ERROR(resources.ReleaseTFLiteModel(ModelType::kTfLiteAux)); + } + LITERT_ASSIGN_OR_RETURN( auto mask_context, CreateMaskContextWithBufferSharing( @@ -3154,6 +3219,17 @@ LlmLiteRtNpuCompiledModelExecutor::CreateForModelWithoutPerLayerEmbedding( EmbeddingLookupManager::Create(env, embedder_lrt_model, end_of_multi_modal_embedding_models, true, "decode_embedder")); + + 
if (auto is_fully_accelerated = llm_compiled_model.IsFullyAccelerated(); + is_fully_accelerated.HasValue() && *is_fully_accelerated) { + RETURN_IF_ERROR(resources.ReleaseTFLiteModel(ModelType::kTfLiteEmbedder)); + + RETURN_IF_ERROR( + resources.ReleaseTFLiteModel(ModelType::kTfLiteEndOfAudio)); + RETURN_IF_ERROR( + resources.ReleaseTFLiteModel(ModelType::kTfLiteEndOfVision)); + end_of_multi_modal_embedding_models.clear(); + } } SpeculativeDecodingType speculative_decoding_type = diff --git a/runtime/executor/llm_litert_npu_compiled_model_executor.h b/runtime/executor/llm_litert_npu_compiled_model_executor.h index 7c58ceed3..2c1c2923e 100644 --- a/runtime/executor/llm_litert_npu_compiled_model_executor.h +++ b/runtime/executor/llm_litert_npu_compiled_model_executor.h @@ -560,8 +560,7 @@ class LlmLiteRtNpuCompiledModelExecutor : public LlmExecutor { drafter_aux_output_buffers); static absl::Status AllocateTransformerBuffers( - litert::Environment& env, const litert::Model* transformer_model, - CompiledModel& llm_compiled_model, + litert::Environment& env, CompiledModel& llm_compiled_model, absl::flat_hash_map& gemma_prefill_input_buffers, absl::flat_hash_map& diff --git a/runtime/executor/llm_processed_context.h b/runtime/executor/llm_processed_context.h index f2cb79484..4e6c0869d 100644 --- a/runtime/executor/llm_processed_context.h +++ b/runtime/executor/llm_processed_context.h @@ -17,10 +17,10 @@ #include #include +#include #include #include "absl/container/flat_hash_map.h" // from @com_google_absl -#include "absl/strings/string_view.h" // from @com_google_absl #include "litert/cc/litert_tensor_buffer.h" // from @litert #include "runtime/executor/llm_executor_io_types.h" #include "runtime/executor/llm_executor_processed_tokens.h" @@ -34,8 +34,7 @@ class LlmProcessedContext : public ProcessedContext { public: explicit LlmProcessedContext( std::optional lora_id, - absl::flat_hash_map - kv_cache_buffers, + absl::flat_hash_map kv_cache_buffers, 
::litert::lm::ProcessedTokens processed_tokens = {}) : lora_id_(lora_id), processed_tokens_(std::move(processed_tokens)), @@ -47,16 +46,14 @@ class LlmProcessedContext : public ProcessedContext { } ProcessedTokens& processed_tokens() override { return processed_tokens_; } - absl::flat_hash_map& - kv_cache_buffers() { + absl::flat_hash_map& kv_cache_buffers() { return kv_cache_buffers_; } private: std::optional lora_id_; ProcessedTokens processed_tokens_; - absl::flat_hash_map - kv_cache_buffers_; + absl::flat_hash_map kv_cache_buffers_; }; } // namespace litert::lm diff --git a/runtime/executor/magic_number_configs_helper_test.cc b/runtime/executor/magic_number_configs_helper_test.cc index f66679e06..def0845ad 100644 --- a/runtime/executor/magic_number_configs_helper_test.cc +++ b/runtime/executor/magic_number_configs_helper_test.cc @@ -24,6 +24,7 @@ #include #include +#include "absl/status/status.h" // from @com_google_absl #include "absl/status/statusor.h" // from @com_google_absl #include "absl/strings/string_view.h" // from @com_google_absl #include "litert/cc/litert_macros.h" // from @litert @@ -112,6 +113,8 @@ class ModelResourcesMock : public ModelResources { (), (override)); MOCK_METHOD((absl::StatusOr>), GetWeightsSectionOffset, (ModelType model_type), (override)); + MOCK_METHOD(absl::Status, ReleaseTFLiteModel, (ModelType model_type), + (override)); absl::StatusOr GetTFLiteModel( ModelType model_type) override { diff --git a/runtime/executor/vision_litert_compiled_model_executor.cc b/runtime/executor/vision_litert_compiled_model_executor.cc index e121ecf42..3cc3c7bcc 100644 --- a/runtime/executor/vision_litert_compiled_model_executor.cc +++ b/runtime/executor/vision_litert_compiled_model_executor.cc @@ -192,11 +192,12 @@ VisionLiteRtCompiledModelExecutor::VisionEncoder::Create( const VisionExecutorProperties& vision_executor_properties) { auto handler = std::unique_ptr(new VisionEncoder( env, model, vision_executor_settings, vision_executor_properties)); 
- RETURN_IF_ERROR(handler->Initialize()); + RETURN_IF_ERROR(handler->Initialize(*model)); return handler; } -absl::Status VisionLiteRtCompiledModelExecutor::VisionEncoder::Initialize() { +absl::Status VisionLiteRtCompiledModelExecutor::VisionEncoder::Initialize( + const Model& model) { // TODO(b/405424188): - Add support for NPU backends. LITERT_ASSIGN_OR_RETURN(auto options, Options::Create()); auto weight_cache_file = vision_executor_settings_.GetWeightCacheFile( @@ -269,8 +270,13 @@ absl::Status VisionLiteRtCompiledModelExecutor::VisionEncoder::Initialize() { } LITERT_ASSIGN_OR_RETURN(compiled_model_, - CompiledModel::Create(env_, model_.Get(), options)); + CompiledModel::Create(env_, model.Get(), options)); if (!vision_executor_properties_.patch_num_shrink_factor.has_value()) { + if (auto num_signatures = model.GetNumSignatures(); num_signatures != 1) { + return absl::InvalidArgumentError(absl::StrCat( + "The Vision Encoder model must have exactly one signature but got ", + num_signatures)); + } // Only create input buffer at initialization for non-VIT models. LITERT_ASSIGN_OR_RETURN(input_buffers_, compiled_model_.CreateInputBuffers(0)); @@ -288,11 +294,12 @@ VisionLiteRtCompiledModelExecutor::VisionAdapter::Create( const VisionExecutorProperties& vision_executor_properties) { auto handler = std::unique_ptr(new VisionAdapter( env, model, vision_executor_settings, vision_executor_properties)); - RETURN_IF_ERROR(handler->Initialize()); + RETURN_IF_ERROR(handler->Initialize(*model)); return handler; } -absl::Status VisionLiteRtCompiledModelExecutor::VisionAdapter::Initialize() { +absl::Status VisionLiteRtCompiledModelExecutor::VisionAdapter::Initialize( + const Model& model) { // TODO(b/405424188): - Add support for NPU backends. 
LITERT_ASSIGN_OR_RETURN(auto options, Options::Create()); auto weight_cache_file = vision_executor_settings_.GetWeightCacheFile( @@ -329,7 +336,7 @@ absl::Status VisionLiteRtCompiledModelExecutor::VisionAdapter::Initialize() { } LITERT_ASSIGN_OR_RETURN(compiled_model_, - CompiledModel::Create(env_, model_.Get(), options)); + CompiledModel::Create(env_, model.Get(), options)); // This check verifies if signature 0 of the adapter model contains any // inputs. This is used to infer whether input buffers should be created at // initialization time (for single-signature models that use signature 0 by @@ -337,7 +344,7 @@ absl::Status VisionLiteRtCompiledModelExecutor::VisionAdapter::Initialize() { // input buffers on-demand in `Encode` for a specific signature). This is a // more direct check than relying on `patch_num_shrink_factor` which was // previously used to detect multi-signature models. - auto signature_or = model_.GetSignature(0); + auto signature_or = model.GetSignature(0); if (signature_or.HasValue() && !signature_or->InputNames().empty()) { LITERT_ASSIGN_OR_RETURN(input_buffers_, compiled_model_.CreateInputBuffers(0)); @@ -373,16 +380,6 @@ litert::lm::VisionLiteRtCompiledModelExecutor::Create( auto vision_executor_properties, GetVisionExecutorPropertiesFromModelResources(*resources.get())); - ASSIGN_OR_RETURN( - auto vision_encoder, - VisionEncoder::Create(env, vision_encoder_model, vision_executor_settings, - vision_executor_properties)); - - ASSIGN_OR_RETURN( - auto vision_adapter, - VisionAdapter::Create(env, vision_adapter_model, vision_executor_settings, - vision_executor_properties)); - LITERT_ASSIGN_OR_RETURN(auto tensor_type, vision_encoder_model->GetInputTensorType(0, 0)); const auto& dimensions = tensor_type.Layout().Dimensions(); @@ -400,6 +397,16 @@ litert::lm::VisionLiteRtCompiledModelExecutor::Create( auto expected_input_dimension = std::vector(dimensions.begin(), dimensions.end()); + ASSIGN_OR_RETURN( + auto vision_encoder, + 
VisionEncoder::Create(env, vision_encoder_model, vision_executor_settings, + vision_executor_properties)); + + ASSIGN_OR_RETURN( + auto vision_adapter, + VisionAdapter::Create(env, vision_adapter_model, vision_executor_settings, + vision_executor_properties)); + return absl::WrapUnique(new VisionLiteRtCompiledModelExecutor( vision_executor_settings, env, std::move(resources), std::move(vision_encoder), std::move(vision_adapter), diff --git a/runtime/executor/vision_litert_compiled_model_executor.h b/runtime/executor/vision_litert_compiled_model_executor.h index 928e4fabd..5d40d8827 100644 --- a/runtime/executor/vision_litert_compiled_model_executor.h +++ b/runtime/executor/vision_litert_compiled_model_executor.h @@ -90,7 +90,7 @@ class VisionLiteRtCompiledModelExecutor : public VisionExecutor { // Initialize the VisionEncoder, which will create the input and output // buffers for the vision encoder model. - absl::Status Initialize(); + absl::Status Initialize(const Model& model); // Returns the CompiledModel for the vision encoder model. const CompiledModel& GetCompiledModel() const { return compiled_model_; } @@ -149,7 +149,6 @@ class VisionLiteRtCompiledModelExecutor : public VisionExecutor { // The vision executor properties. const VisionExecutorProperties& vision_executor_properties_; - // The vision encoder compiled model. CompiledModel compiled_model_; @@ -180,7 +179,7 @@ class VisionLiteRtCompiledModelExecutor : public VisionExecutor { const VisionExecutorProperties& vision_executor_properties); // Initialize the VisionAdapter. - absl::Status Initialize(); + absl::Status Initialize(const Model& model); // Returns the CompiledModel for the vision adapter model. const CompiledModel& GetCompiledModel() const { return compiled_model_; } @@ -226,7 +225,6 @@ class VisionLiteRtCompiledModelExecutor : public VisionExecutor { // The vision executor properties. const VisionExecutorProperties& vision_executor_properties_; - // The vision adapter compiled model. 
CompiledModel compiled_model_; diff --git a/runtime/util/litert_lm_loader.cc b/runtime/util/litert_lm_loader.cc index abed74c7e..f9a2f7d0e 100644 --- a/runtime/util/litert_lm_loader.cc +++ b/runtime/util/litert_lm_loader.cc @@ -14,6 +14,10 @@ #include "runtime/util/litert_lm_loader.h" +#if defined(__linux__) || defined(__ANDROID__) +#include +#endif + #include #include #include @@ -296,6 +300,18 @@ absl::StatusOr> LitertLmLoader::GetSectionLocation( return section_location_it->second; } +absl::Status LitertLmLoader::ReleaseSection(BufferKey buffer_key) { + absl::MutexLock lock(section_buffers_mutex_); + auto it = section_buffers_.find(buffer_key); + if (it != section_buffers_.end()) { + section_buffers_.erase(it); + } + + section_memory_mapped_files_.erase(buffer_key); + section_locations_.erase(buffer_key); + return absl::OkStatus(); +} + std::optional> LitertLmLoader::GetHuggingFaceTokenizer() { auto optional_section_buffer = diff --git a/runtime/util/litert_lm_loader.h b/runtime/util/litert_lm_loader.h index ad7beaa87..776141e63 100644 --- a/runtime/util/litert_lm_loader.h +++ b/runtime/util/litert_lm_loader.h @@ -190,6 +190,11 @@ class LitertLmLoader { absl::StatusOr> GetSectionLocation( BufferKey buffer_key) const; + // Releases the section buffer and the memory mapped file associated with the + // given buffer key. 
+ absl::Status ReleaseSection(BufferKey buffer_key) + ABSL_LOCKS_EXCLUDED(section_buffers_mutex_); + absl::StatusOr> GetScopedFile(); private: diff --git a/runtime/util/memory_mapped_file_posix.cc b/runtime/util/memory_mapped_file_posix.cc index 11202284d..4184d16e9 100644 --- a/runtime/util/memory_mapped_file_posix.cc +++ b/runtime/util/memory_mapped_file_posix.cc @@ -39,6 +39,7 @@ class MemoryMappedFilePosix : public MemoryMappedFile { : length_(length), data_(data) {} ~MemoryMappedFilePosix() override { if (data_) { + ABSL_LOG(INFO) << "munmap address " << data_ << " length " << length_; munmap(data_, length_); } } @@ -112,6 +113,7 @@ absl::StatusOr> MemoryMappedFile::Create( void* data = mmap(nullptr, length, PROT_READ | PROT_WRITE, MAP_PRIVATE, file, offset); + ABSL_LOG(INFO) << "mmap address " << data << " length " << length; RET_CHECK_NE(data, MAP_FAILED) << "Failed to map, error: " << strerror(errno); RET_CHECK_NE(data, nullptr) << "Failed to map."; #ifdef __APPLE__