diff --git a/runtime/components/embedding_lookup/BUILD b/runtime/components/embedding_lookup/BUILD index ba5f480f6..a0360c267 100644 --- a/runtime/components/embedding_lookup/BUILD +++ b/runtime/components/embedding_lookup/BUILD @@ -28,6 +28,7 @@ cc_library( deps = [ "@com_google_absl//absl/status", "@com_google_absl//absl/types:span", + "@litert//litert/cc:litert_expected", ] + select({ "@litert//litert:litert_link_capi_so": [ "@litert//litert/cc:litert_api_with_dynamic_runtime", @@ -51,6 +52,8 @@ cc_library( "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:string_view", "@com_google_absl//absl/types:span", + "@litert//litert/c:litert_common", + "@litert//litert/cc:litert_expected", "//runtime/util:litert_status_util", ] + select({ "@litert//litert:litert_link_capi_so": [ @@ -152,6 +155,8 @@ cc_library( "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings:string_view", "@com_google_absl//absl/types:span", + "@litert//litert/c:litert_common", + "@litert//litert/cc:litert_expected", "@litert//litert/cc:litert_macros", "//runtime/executor:llm_executor_io_types", "//runtime/util:litert_status_util", diff --git a/runtime/components/embedding_lookup/embedding_lookup.h b/runtime/components/embedding_lookup/embedding_lookup.h index 1116caa5e..6d726960c 100644 --- a/runtime/components/embedding_lookup/embedding_lookup.h +++ b/runtime/components/embedding_lookup/embedding_lookup.h @@ -22,6 +22,7 @@ #include "absl/status/status.h" // from @com_google_absl #include "absl/types/span.h" // from @com_google_absl +#include "litert/cc/litert_expected.h" // from @litert #include "litert/cc/litert_tensor_buffer.h" // from @litert namespace litert::lm { @@ -69,6 +70,9 @@ class EmbeddingLookup { virtual absl::Status LookupPrefill(absl::Span tokens, litert::TensorBuffer* output_tensor, size_t byte_offset) = 0; + + // Returns whether the embedding lookup compiled model is fully accelerated. 
+ virtual litert::Expected<bool> IsFullyAccelerated() = 0; }; } // namespace litert::lm diff --git a/runtime/components/embedding_lookup/embedding_lookup_end_of_multi_modal.cc b/runtime/components/embedding_lookup/embedding_lookup_end_of_multi_modal.cc index 308d542e4..43be20c24 100644 --- a/runtime/components/embedding_lookup/embedding_lookup_end_of_multi_modal.cc +++ b/runtime/components/embedding_lookup/embedding_lookup_end_of_multi_modal.cc @@ -154,13 +154,13 @@ EndOfMultiModalEmbedding::Create(litert::Environment& env, const litert::Model* absl_nonnull model, int special_token) { auto handler = std::unique_ptr<EndOfMultiModalEmbedding>( - new EndOfMultiModalEmbedding(env, model, special_token)); + new EndOfMultiModalEmbedding(env, special_token)); RETURN_IF_ERROR( // IWYU pragma: keep as is included by status_macros.h - handler->Initialize()); + handler->Initialize(*model)); return handler; } -absl::Status EndOfMultiModalEmbedding::Initialize() { +absl::Status EndOfMultiModalEmbedding::Initialize(const Model& model) { LITERT_ASSIGN_OR_RETURN(auto options, Options::Create()); #if defined(__ANDROID__) options.SetHardwareAccelerators(litert::HwAccelerators::kNpu | @@ -179,8 +179,8 @@ absl::Status EndOfMultiModalEmbedding::Initialize() { LITERT_ASSIGN_OR_RETURN( litert::CompiledModel compiled_model, - litert::CompiledModel::Create(env_, model_.Get(), options)); - if (auto num_signatures = model_.GetNumSignatures(); num_signatures != 1) { + litert::CompiledModel::Create(env_, model.Get(), options)); + if (auto num_signatures = model.GetNumSignatures(); num_signatures != 1) { return absl::InvalidArgumentError(absl::StrCat( "The Embedding model must have exactly one signature but got ", num_signatures)); @@ -237,6 +237,10 @@ absl::Status EndOfMultiModalEmbedding::Initialize() { size_t bytes = end_of_multi_modal_embedding_.size() * sizeof(float); output_buffers[0].Read(absl::MakeSpan(data_ptr, bytes)); + if (auto res = compiled_model.IsFullyAccelerated(); res.HasValue()) { + is_fully_accelerated_ = 
*res; + } + return absl::OkStatus(); } diff --git a/runtime/components/embedding_lookup/embedding_lookup_end_of_multi_modal.h b/runtime/components/embedding_lookup/embedding_lookup_end_of_multi_modal.h index 09d957c86..4f8b8e390 100644 --- a/runtime/components/embedding_lookup/embedding_lookup_end_of_multi_modal.h +++ b/runtime/components/embedding_lookup/embedding_lookup_end_of_multi_modal.h @@ -19,7 +19,6 @@ #include #include -#include #include #include "absl/base/nullability.h" // from @com_google_absl @@ -29,7 +28,6 @@ #include "litert/cc/litert_environment.h" // from @litert #include "litert/cc/litert_layout.h" // from @litert #include "litert/cc/litert_model.h" // from @litert -#include "litert/cc/litert_options.h" // from @litert #include "litert/cc/litert_tensor_buffer.h" // from @litert #include "runtime/components/embedding_lookup/embedding_lookup.h" @@ -74,20 +72,19 @@ class EndOfMultiModalEmbedding : public EmbeddingLookup { litert::TensorBuffer* prefill_output, size_t byte_offset) override; + litert::Expected IsFullyAccelerated() override { + return is_fully_accelerated_; + } + protected: - EndOfMultiModalEmbedding(litert::Environment& env, - const litert::Model* absl_nonnull model, - int special_token) - : env_(env), model_(*model), special_token_(special_token) {} + EndOfMultiModalEmbedding(litert::Environment& env, int special_token) + : env_(env), special_token_(special_token) {} // Loads the provided model. This must be called before Lookup functions. - absl::Status Initialize(); + absl::Status Initialize(const litert::Model& model); // The environment for the embedding lookup. litert::Environment& env_; - // The model for the embedding lookup. The actual model instance is owned by - // the model resources. - const litert::Model& model_; // The layout of the output tensor from the embedding model. 
litert::Layout output_buffer_layout_; @@ -99,6 +96,8 @@ class EndOfMultiModalEmbedding : public EmbeddingLookup { // Contains the end of multi-modal embedding that was looked up from the // model. std::vector end_of_multi_modal_embedding_; + + bool is_fully_accelerated_ = false; }; } // namespace litert::lm diff --git a/runtime/components/embedding_lookup/embedding_lookup_manager.cc b/runtime/components/embedding_lookup/embedding_lookup_manager.cc index 7d90c6289..95f4fd7c3 100644 --- a/runtime/components/embedding_lookup/embedding_lookup_manager.cc +++ b/runtime/components/embedding_lookup/embedding_lookup_manager.cc @@ -29,7 +29,9 @@ #include "absl/status/statusor.h" // from @com_google_absl #include "absl/strings/string_view.h" // from @com_google_absl #include "absl/types/span.h" // from @com_google_absl +#include "litert/c/litert_common.h" // from @litert #include "litert/cc/litert_environment.h" // from @litert +#include "litert/cc/litert_expected.h" // from @litert #include "litert/cc/litert_macros.h" // from @litert #include "litert/cc/litert_model.h" // from @litert #include "litert/cc/litert_tensor_buffer.h" // from @litert @@ -265,4 +267,21 @@ absl::Status EmbeddingLookupManager::Initialize( return absl::OkStatus(); } +litert::Expected EmbeddingLookupManager::IsFullyAccelerated() const { + if (text_embedding_lookup_ == nullptr) { + return litert::Error(kLiteRtStatusErrorRuntimeFailure, + "Text embedding lookup has not been created."); + } + if (auto res = text_embedding_lookup_->IsFullyAccelerated(); + !res.HasValue() || !*res) { + return res; + } + for (const auto& lookup : end_of_multi_modal_embedding_lookups_) { + if (auto res = lookup->IsFullyAccelerated(); !res.HasValue() || !*res) { + return res; + } + } + return true; +} + } // namespace litert::lm diff --git a/runtime/components/embedding_lookup/embedding_lookup_manager.h b/runtime/components/embedding_lookup/embedding_lookup_manager.h index 1e63d4695..43da8f7bf 100644 --- 
a/runtime/components/embedding_lookup/embedding_lookup_manager.h +++ b/runtime/components/embedding_lookup/embedding_lookup_manager.h @@ -27,6 +27,7 @@ #include "absl/status/statusor.h" // from @com_google_absl #include "absl/types/span.h" // from @com_google_absl #include "litert/cc/litert_environment.h" // from @litert +#include "litert/cc/litert_expected.h" // from @litert #include "litert/cc/litert_model.h" // from @litert #include "litert/cc/litert_tensor_buffer.h" // from @litert #include "runtime/components/embedding_lookup/embedding_lookup_end_of_multi_modal.h" @@ -116,6 +117,8 @@ class EmbeddingLookupManager { return text_embedding_lookup_.get(); } + litert::Expected IsFullyAccelerated() const; + protected: absl::Status Initialize( litert::Environment& env, diff --git a/runtime/components/embedding_lookup/embedding_lookup_multi_modal.h b/runtime/components/embedding_lookup/embedding_lookup_multi_modal.h index 5d833c83e..89fa11075 100644 --- a/runtime/components/embedding_lookup/embedding_lookup_multi_modal.h +++ b/runtime/components/embedding_lookup/embedding_lookup_multi_modal.h @@ -91,6 +91,8 @@ class EmbeddingLookupMultiModal : public EmbeddingLookup { // Returns true if there are any embeddings left to be read. 
bool HasRemainingEmbeddings() const { return embedding_.size() > 0; } + litert::Expected<bool> IsFullyAccelerated() override { return true; } + protected: absl::Status Initialize(const ::litert::TensorBuffer* embedding_buffer, int special_token); diff --git a/runtime/components/embedding_lookup/embedding_lookup_text.cc b/runtime/components/embedding_lookup/embedding_lookup_text.cc index 580a28558..35c2ab2a2 100644 --- a/runtime/components/embedding_lookup/embedding_lookup_text.cc +++ b/runtime/components/embedding_lookup/embedding_lookup_text.cc @@ -31,10 +31,8 @@ #include "absl/strings/string_view.h" // from @com_google_absl #include "absl/types/span.h" // from @com_google_absl #include "litert/cc/litert_common.h" // from @litert -#include "litert/cc/litert_compiled_model.h" // from @litert -#include "litert/cc/litert_element_type.h" // from @litert -#include "litert/cc/litert_environment.h" // from @litert #include "litert/cc/litert_macros.h" // from @litert +#include "litert/cc/litert_expected.h" // from @litert #include "litert/cc/litert_model.h" // from @litert #include "litert/cc/litert_options.h" // from @litert #include "litert/cc/litert_tensor_buffer.h" // from @litert @@ -248,12 +245,12 @@ EmbeddingLookupText::Create(litert::Environment& env, const litert::Model* absl_nonnull model, std::optional signature_key) { auto handler = std::unique_ptr<EmbeddingLookupText>( - new EmbeddingLookupText(env, model, signature_key)); - RETURN_IF_ERROR(handler->Initialize()); + new EmbeddingLookupText(env, signature_key)); + RETURN_IF_ERROR(handler->Initialize(*model)); return handler; } -absl::Status EmbeddingLookupText::Initialize() { +absl::Status EmbeddingLookupText::Initialize(const litert::Model& model) { LITERT_ASSIGN_OR_RETURN(auto options, Options::Create()); #if defined(__ANDROID__) options.SetHardwareAccelerators(litert::HwAccelerators::kNpu | @@ -271,8 +268,8 @@ absl::Status EmbeddingLookupText::Initialize() { #endif LITERT_ASSIGN_OR_RETURN(compiled_model_, 
litert::CompiledModel::Create( - env_, model_.Get(), options)); - LITERT_ASSIGN_OR_RETURN(auto signatures, model_.GetSignatures()); + env_, model.Get(), options)); + LITERT_ASSIGN_OR_RETURN(auto signatures, model.GetSignatures()); if (signature_key_.has_value()) { bool found = false; @@ -354,4 +351,12 @@ absl::Status EmbeddingLookupText::Initialize() { return absl::OkStatus(); } +litert::Expected<bool> EmbeddingLookupText::IsFullyAccelerated() { + if (!compiled_model_.has_value()) { + return litert::Error(kLiteRtStatusErrorRuntimeFailure, + "Compiled model has not been created."); + } + return compiled_model_->IsFullyAccelerated(); + } + } // namespace litert::lm diff --git a/runtime/components/embedding_lookup/embedding_lookup_text.h b/runtime/components/embedding_lookup/embedding_lookup_text.h index d40af5712..a3243a5ba 100644 --- a/runtime/components/embedding_lookup/embedding_lookup_text.h +++ b/runtime/components/embedding_lookup/embedding_lookup_text.h @@ -22,7 +22,6 @@ #include #include #include -#include #include #include "absl/base/nullability.h" // from @com_google_absl @@ -31,6 +30,7 @@ #include "absl/types/span.h" // from @com_google_absl #include "litert/cc/litert_compiled_model.h" // from @litert #include "litert/cc/litert_environment.h" // from @litert +#include "litert/cc/litert_expected.h" // from @litert #include "litert/cc/litert_model.h" // from @litert #include "litert/cc/litert_options.h" // from @litert #include "litert/cc/litert_ranked_tensor_type.h" // from @litert @@ -103,6 +103,8 @@ class EmbeddingLookupText : public EmbeddingLookup { // Returns number of floats per token in the output tensor. size_t GetFloatsPerToken(); + litert::Expected<bool> IsFullyAccelerated() override; + // Returns the default embedding vector to use when a token is not found in // the lookup table. 
const std::vector& GetDefaultEmbeddingVector() const { @@ -116,12 +118,11 @@ class EmbeddingLookupText : public EmbeddingLookup { protected: EmbeddingLookupText(litert::Environment& env, - const litert::Model* absl_nonnull model, std::optional signature_key) - : env_(env), model_(*model), signature_key_(signature_key) {} + : env_(env), signature_key_(signature_key) {} // Loads the provided model. This must be called before Lookup. - absl::Status Initialize(); + absl::Status Initialize(const litert::Model& model); // Internal implementation of Lookup for both the single and multiple token // cases. @@ -129,9 +130,6 @@ class EmbeddingLookupText : public EmbeddingLookup { // The environment for the embedding lookup. litert::Environment& env_; - // The model for the embedding lookup. The actual model instance is owned by - // the model resources. - const litert::Model& model_; // The compiled model for the embedding model. std::optional compiled_model_; diff --git a/runtime/components/model_resources.h b/runtime/components/model_resources.h index bd5311276..24ba00083 100644 --- a/runtime/components/model_resources.h +++ b/runtime/components/model_resources.h @@ -185,6 +185,11 @@ class ModelResources { // Returns the llm metadata. virtual absl::StatusOr GetLlmMetadata() = 0; + + // Releases the TFLite model from RAM. This is used to reduce peak memory + // usage after the model has been compiled into a hardware-specific + // executable. 
+ virtual absl::Status ReleaseTFLiteModel(ModelType model_type) = 0; }; } // namespace litert::lm diff --git a/runtime/components/model_resources_litert_lm.cc b/runtime/components/model_resources_litert_lm.cc index a28ce0f2c..ee25c07d4 100644 --- a/runtime/components/model_resources_litert_lm.cc +++ b/runtime/components/model_resources_litert_lm.cc @@ -162,4 +162,14 @@ ModelResourcesLitertLm::GetWeightsSectionOffset(ModelType model_type) { BufferKey(schema::AnySectionDataType_TFLiteWeights, model_type)); } +absl::Status ModelResourcesLitertLm::ReleaseTFLiteModel(ModelType model_type) { + model_map_.erase(model_type); + RETURN_IF_ERROR(litert_lm_loader_->ReleaseSection( + BufferKey(schema::AnySectionDataType_TFLiteModel, model_type))); + RETURN_IF_ERROR(litert_lm_loader_->ReleaseSection( + BufferKey(schema::AnySectionDataType_TFLiteWeights, model_type))); + + return absl::OkStatus(); +} + } // namespace litert::lm diff --git a/runtime/components/model_resources_litert_lm.h b/runtime/components/model_resources_litert_lm.h index e0c513abb..e1ebcc132 100644 --- a/runtime/components/model_resources_litert_lm.h +++ b/runtime/components/model_resources_litert_lm.h @@ -23,6 +23,7 @@ #include #include "absl/container/flat_hash_map.h" // from @com_google_absl +#include "absl/status/status.h" // from @com_google_absl #include "absl/status/statusor.h" // from @com_google_absl #include "absl/strings/string_view.h" // from @com_google_absl #include "litert/cc/litert_model.h" // from @litert @@ -64,6 +65,8 @@ class ModelResourcesLitertLm : public ModelResources { absl::StatusOr> GetWeightsSectionOffset( ModelType model_type) override; + absl::Status ReleaseTFLiteModel(ModelType model_type) override; + protected: explicit ModelResourcesLitertLm( std::unique_ptr litert_lm_loader) diff --git a/runtime/components/model_resources_streaming.cc b/runtime/components/model_resources_streaming.cc index 44dc5154c..3a668afd0 100644 --- a/runtime/components/model_resources_streaming.cc +++ 
b/runtime/components/model_resources_streaming.cc @@ -73,4 +73,8 @@ ModelResourcesStreaming::GetLlmMetadata() { return absl::UnimplementedError("Not implemented."); } +absl::Status ModelResourcesStreaming::ReleaseTFLiteModel(ModelType model_type) { + return absl::UnimplementedError("Not implemented."); +} + } // namespace litert::lm diff --git a/runtime/components/model_resources_streaming.h b/runtime/components/model_resources_streaming.h index 2e5ba9272..c56a0208a 100644 --- a/runtime/components/model_resources_streaming.h +++ b/runtime/components/model_resources_streaming.h @@ -22,6 +22,7 @@ #include #include +#include "absl/status/status.h" // from @com_google_absl #include "absl/status/statusor.h" // from @com_google_absl #include "absl/strings/string_view.h" // from @com_google_absl #include "litert/cc/litert_model.h" // from @litert @@ -60,6 +61,8 @@ class ModelResourcesStreaming : public ModelResources { absl::StatusOr> GetTokenizer() override; absl::StatusOr GetLlmMetadata() override; + + absl::Status ReleaseTFLiteModel(ModelType model_type) override; }; } // namespace litert::lm diff --git a/runtime/components/model_resources_task.cc b/runtime/components/model_resources_task.cc index 715586645..44ead88cc 100644 --- a/runtime/components/model_resources_task.cc +++ b/runtime/components/model_resources_task.cc @@ -75,6 +75,11 @@ absl::StatusOr ModelResourcesTask::GetTFLiteModel( return model_map_[model_type].get(); } +absl::Status ModelResourcesTask::ReleaseTFLiteModel(ModelType model_type) { + model_map_.erase(model_type); + return absl::OkStatus(); +} + absl::StatusOr> ModelResourcesTask::GetTokenizer() { ASSIGN_OR_RETURN(auto string_view, model_asset_bundle_resources_->GetFile("TOKENIZER_MODEL")); diff --git a/runtime/components/model_resources_task.h b/runtime/components/model_resources_task.h index 2dea300ef..9c30f9c34 100644 --- a/runtime/components/model_resources_task.h +++ b/runtime/components/model_resources_task.h @@ -28,7 +28,6 @@ #include 
"absl/strings/string_view.h" // from @com_google_absl #include "litert/cc/litert_model.h" // from @litert #include "runtime/components/model_resources.h" -#include "runtime/components/sentencepiece_tokenizer.h" #include "runtime/components/tokenizer.h" #include "runtime/proto/llm_metadata.pb.h" #include "runtime/util/model_asset_bundle_resources.h" @@ -56,6 +55,7 @@ class ModelResourcesTask : public ModelResources { // Task model does not support prefer activation type. return std::nullopt; }; + absl::Status ReleaseTFLiteModel(ModelType model_type) override; absl::StatusOr> GetTokenizer() override; absl::StatusOr GetLlmMetadata() override; absl::StatusOr> GetScopedFile() override { @@ -74,7 +74,8 @@ class ModelResourcesTask : public ModelResources { : model_asset_bundle_resources_(std::move(model_asset_bundle_resources)) { } - absl::flat_hash_map> model_map_; + absl::flat_hash_map> model_map_; + std::unique_ptr llm_metadata_; // The model asset bundle resources produced by reading task bundle. Not null diff --git a/runtime/components/model_resources_test.cc b/runtime/components/model_resources_test.cc index dbae34481..8f8d1972b 100644 --- a/runtime/components/model_resources_test.cc +++ b/runtime/components/model_resources_test.cc @@ -65,6 +65,58 @@ TEST(ModelResourcesTest, InitializeWithValidLitertLmLoader) { ASSERT_NE(tokenizer.value(), nullptr); } +TEST(ModelResourcesTest, ReleaseTFLiteModel) { + const auto model_path = + std::filesystem::path(::testing::SrcDir()) / + "litert_lm/runtime/testdata/test_lm.litertlm"; + auto model_file = ScopedFile::Open(model_path.string()); + ASSERT_TRUE(model_file.ok()); + ASSERT_OK_AND_ASSIGN(auto loader, + LitertLmLoader::Create(std::move(model_file.value()))); + + auto model_resources = ModelResourcesLitertLm::Create(std::move(loader)); + ASSERT_OK(model_resources); + + // Load the model. 
+ auto tflite_model = + model_resources.value()->GetTFLiteModel(ModelType::kTfLitePrefillDecode); + ASSERT_OK(tflite_model); + + // Release the model. + ASSERT_OK(model_resources.value()->ReleaseTFLiteModel( + ModelType::kTfLitePrefillDecode)); + + // Subsequent GetTFLiteModelBuffer should return NotFound. + EXPECT_THAT(model_resources.value()->GetTFLiteModelBuffer( + ModelType::kTfLitePrefillDecode), + testing::status::StatusIs(absl::StatusCode::kNotFound)); +} + +TEST(ModelResourcesTest, ReleaseTFLiteModelDoesNotBreakSubsequentLoads) { + const auto model_path = + std::filesystem::path(::testing::SrcDir()) / + "litert_lm/runtime/testdata/test_lm.litertlm"; + auto model_file = ScopedFile::Open(model_path.string()); + ASSERT_TRUE(model_file.ok()); + ASSERT_OK_AND_ASSIGN(auto loader, + LitertLmLoader::Create(std::move(model_file.value()))); + + auto model_resources = ModelResourcesLitertLm::Create(std::move(loader)); + ASSERT_OK(model_resources); + + // Load one model and release it. + ASSERT_OK(model_resources.value() + ->GetTFLiteModel(ModelType::kTfLitePrefillDecode) + .status()); + ASSERT_OK(model_resources.value()->ReleaseTFLiteModel( + ModelType::kTfLitePrefillDecode)); + + // Subsequent loads should still succeed (e.g., Tokenizer or other models). + // test_lm.litertlm contains a tokenizer. 
+ auto tokenizer = model_resources.value()->GetTokenizer(); + EXPECT_OK(tokenizer.status()); +} + TEST(ModelResourcesTest, InitializeWithExternalWeights) { const auto model_path = std::filesystem::path(::testing::SrcDir()) / diff --git a/runtime/executor/BUILD b/runtime/executor/BUILD index 5d7ed3241..c9cf55dcb 100644 --- a/runtime/executor/BUILD +++ b/runtime/executor/BUILD @@ -279,6 +279,7 @@ cc_library( "@com_google_absl//absl/types:span", "@litert//litert/cc:litert_model_types", "@litert//litert/cc:litert_tensor_buffer_types", + "@litert//litert/cc/internal:litert_handle", "//runtime/components:model_resources", "//runtime/components:model_resources_litert_lm", "//runtime/components:model_resources_task", @@ -1069,6 +1070,7 @@ cc_test( ":llm_executor_settings", ":magic_number_configs_helper", "@com_google_googletest//:gtest_main", + "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings:string_view", "@litert//litert/cc:litert_macros", diff --git a/runtime/executor/audio_litert_compiled_model_executor.cc b/runtime/executor/audio_litert_compiled_model_executor.cc index e9e347e02..54a89fd34 100644 --- a/runtime/executor/audio_litert_compiled_model_executor.cc +++ b/runtime/executor/audio_litert_compiled_model_executor.cc @@ -62,9 +62,37 @@ #include "runtime/util/tensor_buffer_util.h" #include "tflite/types/half.h" // from @litert +#if !defined(_WIN32) && !defined(__EMSCRIPTEN__) +#include +#include +#endif + namespace litert::lm { namespace { +void MadviseMemoryBuffer(absl::string_view buffer) { +#if !defined(_WIN32) && !defined(__EMSCRIPTEN__) + if (buffer.empty()) return; + size_t page_size = getpagesize(); + uintptr_t addr = reinterpret_cast(buffer.data()); + size_t size = buffer.size(); + + // We perform proper system page alignment to guarantee that the advisor + // does not fail on non-aligned offsets. 
+ uintptr_t aligned_addr = (addr + page_size - 1) & ~(page_size - 1); + if (aligned_addr > addr) { + size_t gap = aligned_addr - addr; + if (gap >= size) return; + size -= gap; + } + size &= ~(page_size - 1); + + if (size > 0) { + (void)madvise(reinterpret_cast(aligned_addr), size, MADV_DONTNEED); + } +#endif +} + // Set the default GPU options for the model. absl::Status SetGpuOptions(const AudioExecutorSettings& executor_settings, litert::GpuOptions& gpu_options) { @@ -99,6 +127,7 @@ absl::Status SetGpuOptions(const AudioExecutorSettings& executor_settings, gpu_options.SetConvertWeightsOnGpu(true); gpu_options.SetHintFullyDelegatedToSingleDelegate(true); gpu_options.EnableInfiniteFloatCapping(true); + gpu_options.WaitForWeightsConversionComplete(true); return absl::OkStatus(); } @@ -179,13 +208,13 @@ AudioLiteRtCompiledModelExecutor::AudioStaticEncoder::Create( const AudioExecutorSettings& executor_settings, Environment& env, const Model* absl_nonnull model) { auto handler = std::unique_ptr( - new AudioStaticEncoder(executor_settings, env, model)); - RETURN_IF_ERROR(handler->Initialize()); + new AudioStaticEncoder(executor_settings, env)); + RETURN_IF_ERROR(handler->Initialize(*model)); return handler; } -absl::Status -AudioLiteRtCompiledModelExecutor::AudioStaticEncoder::Initialize() { +absl::Status AudioLiteRtCompiledModelExecutor::AudioStaticEncoder::Initialize( + const Model& model) { LITERT_ASSIGN_OR_RETURN(auto options, Options::Create()); auto weight_cache_file = executor_settings_.GetWeightCacheFile( absl::StrCat(AudioExecutorSettings::kStaticEncoderName, @@ -222,15 +251,15 @@ AudioLiteRtCompiledModelExecutor::AudioStaticEncoder::Initialize() { } LITERT_ASSIGN_OR_RETURN(compiled_model_, - CompiledModel::Create(env_, model_.Get(), options)); - LITERT_ASSIGN_OR_RETURN(auto signatures, model_.GetSignatures()); + CompiledModel::Create(env_, model.Get(), options)); + LITERT_ASSIGN_OR_RETURN(auto signatures, model.GetSignatures()); if (signatures.size() != 1) { 
return absl::InvalidArgumentError( absl::StrCat("The Audio Static Encoder model must have exactly one " "signature but got ", signatures.size())); } - LITERT_ASSIGN_OR_RETURN(auto signature, model_.GetSignature(0)); + LITERT_ASSIGN_OR_RETURN(auto signature, model.GetSignature(0)); // Initialize the input buffers. LITERT_ASSIGN_OR_RETURN(auto input_buffers, @@ -311,13 +340,14 @@ AudioLiteRtCompiledModelExecutor::AudioStreamingEncoder::Create( const AudioExecutorSettings& executor_settings, Environment& env, const Model* absl_nonnull model) { auto handler = std::unique_ptr( - new AudioStreamingEncoder(executor_settings, env, model)); - RETURN_IF_ERROR(handler->Initialize()); + new AudioStreamingEncoder(executor_settings, env)); + RETURN_IF_ERROR(handler->Initialize(*model)); return handler; } absl::Status -AudioLiteRtCompiledModelExecutor::AudioStreamingEncoder::Initialize() { +AudioLiteRtCompiledModelExecutor::AudioStreamingEncoder::Initialize( + const Model& model) { LITERT_ASSIGN_OR_RETURN(auto options, Options::Create()); auto weight_cache_file = executor_settings_.GetWeightCacheFile( absl::StrCat(AudioExecutorSettings::kStreamingEncoderName, @@ -356,15 +386,15 @@ AudioLiteRtCompiledModelExecutor::AudioStreamingEncoder::Initialize() { } LITERT_ASSIGN_OR_RETURN(compiled_model_, - CompiledModel::Create(env_, model_.Get(), options)); - LITERT_ASSIGN_OR_RETURN(auto signatures, model_.GetSignatures()); + CompiledModel::Create(env_, model.Get(), options)); + LITERT_ASSIGN_OR_RETURN(auto signatures, model.GetSignatures()); if (signatures.size() != 1) { return absl::InvalidArgumentError(absl::StrCat( "The Audio Encoder model must have exactly one signature but got ", signatures.size())); } - LITERT_ASSIGN_OR_RETURN(auto signature, model_.GetSignature(0)); + LITERT_ASSIGN_OR_RETURN(auto signature, model.GetSignature(0)); // Initialize the input buffers. 
LITERT_ASSIGN_OR_RETURN(auto input_buffers, @@ -504,13 +534,14 @@ absl::StatusOr> AudioLiteRtCompiledModelExecutor::AudioAdapter::Create( const AudioExecutorSettings& executor_settings, Environment& env, const Model* absl_nonnull model) { - auto handler = std::unique_ptr( - new AudioAdapter(executor_settings, env, model)); - RETURN_IF_ERROR(handler->Initialize()); + auto handler = + std::unique_ptr(new AudioAdapter(executor_settings, env)); + RETURN_IF_ERROR(handler->Initialize(*model)); return handler; } -absl::Status AudioLiteRtCompiledModelExecutor::AudioAdapter::Initialize() { +absl::Status AudioLiteRtCompiledModelExecutor::AudioAdapter::Initialize( + const Model& model) { LITERT_ASSIGN_OR_RETURN(auto options, Options::Create()); auto weight_cache_file = executor_settings_.GetWeightCacheFile( absl::StrCat(AudioExecutorSettings::kAdapterName, @@ -524,6 +555,7 @@ absl::Status AudioLiteRtCompiledModelExecutor::AudioAdapter::Initialize() { #if defined(LITERT_USE_WEBGPU_ACCELERATOR) gpu_options.SetBackend(GpuOptions::Backend::kWebGpu); #endif // defined(LITERT_USE_WEBGPU_ACCELERATOR) + gpu_options.WaitForWeightsConversionComplete(true); options.SetHardwareAccelerators(litert::HwAccelerators::kGpu); } else if (executor_settings_.GetBackend() == Backend::CPU) { LITERT_ASSIGN_OR_RETURN(auto& cpu_options, options.GetCpuOptions()); @@ -540,8 +572,8 @@ absl::Status AudioLiteRtCompiledModelExecutor::AudioAdapter::Initialize() { } LITERT_ASSIGN_OR_RETURN(compiled_model_, - CompiledModel::Create(env_, model_.Get(), options)); - LITERT_ASSIGN_OR_RETURN(auto signatures, model_.GetSignatures()); + CompiledModel::Create(env_, model.Get(), options)); + LITERT_ASSIGN_OR_RETURN(auto signatures, model.GetSignatures()); if (signatures.size() != 1) { return absl::InvalidArgumentError(absl::StrCat( "The Audio Adapter model must have exactly one signature but got ", @@ -565,7 +597,7 @@ absl::Status AudioLiteRtCompiledModelExecutor::AudioAdapter::Initialize() { output_buffers_.size())); 
} - LITERT_ASSIGN_OR_RETURN(auto signature, model_.GetSignature(0)); + LITERT_ASSIGN_OR_RETURN(auto signature, model.GetSignature(0)); for (int i = 0; i < signature.InputNames().size(); ++i) { if (absl::StrContains(signature.InputNames()[i], kFeaturesName)) { features_buffer_ = &input_buffers_[i]; @@ -599,6 +631,10 @@ AudioLiteRtCompiledModelExecutor::Create( resources->GetTFLiteModel(ModelType::kTfLiteAudioEncoderHw)); ASSIGN_OR_RETURN(auto audio_adapter_model, resources->GetTFLiteModel(ModelType::kTfLiteAudioAdapter)); + LITERT_ASSIGN_OR_RETURN( + auto executor_properties, + GetAudioExecutorPropertiesFromModelResources(*resources)); + const int encoder_shrinking_factor = executor_properties.audio_shrink_factor; std::unique_ptr audio_encoder; LITERT_ASSIGN_OR_RETURN(auto encoder_signature, audio_encoder_model->GetSignature(0)); @@ -613,9 +649,35 @@ AudioLiteRtCompiledModelExecutor::Create( AudioStaticEncoder::Create(executor_settings, env, audio_encoder_model)); } + + if (auto is_fully_accelerated = + audio_encoder->GetMutableCompiledModel().IsFullyAccelerated(); + is_fully_accelerated.HasValue() && *is_fully_accelerated) { + // Instead of destructively unmapping the model which invalidates underlying + // interpreter dependencies, we dynamically advise the kernel that the pages + // are no longer needed. This guarantees transparent, non-crashing, stable + // runtime while freeing full resident RAM pages immediately. 
+ auto buf_status = + resources->GetTFLiteModelBuffer(ModelType::kTfLiteAudioEncoderHw); + if (buf_status.ok()) { + MadviseMemoryBuffer(*buf_status); + } + } + LITERT_ASSIGN_OR_RETURN( auto audio_adapter, AudioAdapter::Create(executor_settings, env, audio_adapter_model)); + + if (auto is_fully_accelerated = + audio_adapter->GetMutableCompiledModel().IsFullyAccelerated(); + is_fully_accelerated.HasValue() && *is_fully_accelerated) { + auto buf_status = + resources->GetTFLiteModelBuffer(ModelType::kTfLiteAudioAdapter); + if (buf_status.ok()) { + MadviseMemoryBuffer(*buf_status); + } + } + const auto& tmp = audio_encoder->GetInputMaskBuffer(); LITERT_ASSIGN_OR_RETURN(auto mask_tensor_type, tmp.TensorType()); LITERT_ASSIGN_OR_RETURN(int sequence_length, @@ -629,10 +691,7 @@ AudioLiteRtCompiledModelExecutor::Create( audio_adapter->GetOutputBuffers()[0].TensorType()); const auto dims = adapter_output_tensor_type.Layout().Dimensions(); const int audio_embedding_dimensions = dims.back(); - LITERT_ASSIGN_OR_RETURN( - auto executor_properties, - GetAudioExecutorPropertiesFromModelResources(*resources)); - const int encoder_shrinking_factor = executor_properties.audio_shrink_factor; + if (!is_streaming_encoder) { if (audio_encoder->GetOutputBuffersMap().size() != audio_adapter->GetInputBuffers().size()) { @@ -832,7 +891,7 @@ absl::StatusOr AudioLiteRtCompiledModelExecutor::Encode( absl::StatusOr> AudioLiteRtCompiledModelExecutor::AudioStreamingEncoder::CreateNewContext() { absl::flat_hash_map state_buffers; - LITERT_ASSIGN_OR_RETURN(auto signature, compiled_model_.GetSignature(0)); + // Removed redundant GetSignature call that crashes post-release. 
for (auto& [name, buffer] : input_buffers_map_) { if (name == kSegmentValuesName || name == kSegmentMaskName) { // Skip the segment values and mask buffers as they are not part of the @@ -859,7 +918,7 @@ AudioLiteRtCompiledModelExecutor::AudioStreamingEncoder::CreateNewContext() { absl::StatusOr> AudioLiteRtCompiledModelExecutor::AudioStreamingEncoder::CloneContext() { absl::flat_hash_map state_buffers; - LITERT_ASSIGN_OR_RETURN(auto signature, compiled_model_.GetSignature(0)); + // Removed redundant GetSignature call that crashes post-release. for (const auto& [name, buffer] : input_buffers_map_) { if (name == kSegmentValuesName || name == kSegmentMaskName) { // Skip the segment values and mask buffers as they are not part of the diff --git a/runtime/executor/audio_litert_compiled_model_executor.h b/runtime/executor/audio_litert_compiled_model_executor.h index bc9ccc953..fb02e9c7c 100644 --- a/runtime/executor/audio_litert_compiled_model_executor.h +++ b/runtime/executor/audio_litert_compiled_model_executor.h @@ -75,7 +75,7 @@ class AudioLiteRtCompiledModelExecutor : public AudioExecutor { // A unique pointer to the AudioLiteRtCompiledModelExecutor if successful, // or an error status if failed. static absl::StatusOr> - Create(AudioExecutorSettings executor_settings, Environment& env); + Create(AudioExecutorSettings executor_settings, litert::Environment& env); // Run the audio encoder and audio adapter models to encode the spectrogram // tensor into audio embeddings. 
It is caller's responsibility to ensure the @@ -141,7 +141,7 @@ class AudioLiteRtCompiledModelExecutor : public AudioExecutor { public: virtual ~AudioEncoder() = default; - virtual absl::Status Initialize() = 0; + virtual absl::Status Initialize(const Model& model) = 0; virtual absl::Status ClearInputBuffers() = 0; @@ -241,7 +241,7 @@ class AudioLiteRtCompiledModelExecutor : public AudioExecutor { // Initialize the AudioStaticEncoder, which will create the input and output // buffers for the audio encoder model. - absl::Status Initialize() override; + absl::Status Initialize(const Model& model) override; absl::Status ClearInputBuffers() override; @@ -249,12 +249,11 @@ class AudioLiteRtCompiledModelExecutor : public AudioExecutor { private: AudioStaticEncoder(const AudioExecutorSettings& executor_settings, - Environment& env, const Model* absl_nonnull model) - : executor_settings_(executor_settings), env_(env), model_(*model) {} + Environment& env) + : executor_settings_(executor_settings), env_(env) {} - const AudioExecutorSettings& executor_settings_; + AudioExecutorSettings executor_settings_; Environment& env_; - const Model& model_; }; // Audio Encoder for streaming LiteRT model, where the audio is provided in @@ -310,7 +309,7 @@ class AudioLiteRtCompiledModelExecutor : public AudioExecutor { // Initialize the AudioStreamingEncoder, which will create the input and // output buffers for the audio encoder model. 
- absl::Status Initialize() override; + absl::Status Initialize(const Model& model); int GetOverlapSize() const { return overlap_size_; } @@ -331,12 +330,11 @@ class AudioLiteRtCompiledModelExecutor : public AudioExecutor { private: AudioStreamingEncoder(const AudioExecutorSettings& executor_settings, - Environment& env, const Model* absl_nonnull model) - : executor_settings_(executor_settings), env_(env), model_(*model) {} + Environment& env) + : executor_settings_(executor_settings), env_(env) {} - const AudioExecutorSettings& executor_settings_; + AudioExecutorSettings executor_settings_; Environment& env_; - const Model& model_; int overlap_size_; }; @@ -360,7 +358,7 @@ class AudioLiteRtCompiledModelExecutor : public AudioExecutor { // Initialize the AudioAdapter, which will create the input and output // buffers for the audio adapter model. - absl::Status Initialize(); + absl::Status Initialize(const Model& model); const CompiledModel& GetCompiledModel() const { return compiled_model_; } @@ -391,12 +389,11 @@ class AudioLiteRtCompiledModelExecutor : public AudioExecutor { private: AudioAdapter(const AudioExecutorSettings& executor_settings, - Environment& env, const Model* absl_nonnull model) - : executor_settings_(executor_settings), env_(env), model_(*model) {} + Environment& env) + : executor_settings_(executor_settings), env_(env) {} - const AudioExecutorSettings& executor_settings_; + AudioExecutorSettings executor_settings_; Environment& env_; - const Model& model_; CompiledModel compiled_model_; // The input buffers for the audio adapter model. 
std::vector input_buffers_; diff --git a/runtime/executor/llm_executor_settings_utils.cc b/runtime/executor/llm_executor_settings_utils.cc index bc87e77e9..38bf2f81e 100644 --- a/runtime/executor/llm_executor_settings_utils.cc +++ b/runtime/executor/llm_executor_settings_utils.cc @@ -203,6 +203,7 @@ absl::StatusOr CreateCompilationOptions( gpu_compilation_options.EnableAllowSrcQuantizedFcConvOps( !advanced_settings.allow_src_quantized_fc_conv_ops.has_value() || advanced_settings.allow_src_quantized_fc_conv_ops.value()); + gpu_compilation_options.WaitForWeightsConversionComplete(true); gpu_compilation_options.HintWaitingForCompletion( advanced_settings.hint_waiting_for_completion.has_value() && advanced_settings.hint_waiting_for_completion.value()); diff --git a/runtime/executor/llm_litert_compiled_model_executor.cc b/runtime/executor/llm_litert_compiled_model_executor.cc index b2d630a94..1e9c62129 100644 --- a/runtime/executor/llm_litert_compiled_model_executor.cc +++ b/runtime/executor/llm_litert_compiled_model_executor.cc @@ -35,6 +35,7 @@ #include "absl/strings/str_cat.h" // from @com_google_absl #include "absl/strings/string_view.h" // from @com_google_absl #include "absl/types/span.h" // from @com_google_absl +#include "litert/cc/internal/litert_handle.h" // from @litert #include "litert/cc/litert_common.h" // from @litert #include "litert/cc/litert_compiled_model.h" // from @litert #include "litert/cc/litert_element_type.h" // from @litert @@ -68,12 +69,40 @@ #include "runtime/util/scoped_file.h" #include "runtime/util/status_macros.h" // IWYU pragma: keep #include "runtime/util/tensor_buffer_util.h" + +#if !defined(_WIN32) && !defined(__EMSCRIPTEN__) +#include +#include +#endif + #include "tflite/delegates/xnnpack/xnnpack_delegate.h" // from @litert #include "tflite/types/half.h" // from @litert namespace litert::lm { namespace { +void MadviseMemoryBuffer(absl::string_view buffer) { +#if !defined(_WIN32) && !defined(__EMSCRIPTEN__) + if (buffer.empty()) 
return; + size_t page_size = getpagesize(); + uintptr_t addr = reinterpret_cast(buffer.data()); + size_t size = buffer.size(); + + // Ensure properly system page aligned boundaries for guaranteed safety. + uintptr_t aligned_addr = (addr + page_size - 1) & ~(page_size - 1); + if (aligned_addr > addr) { + size_t gap = aligned_addr - addr; + if (gap >= size) return; + size -= gap; + } + size &= ~(page_size - 1); + + if (size > 0) { + (void)madvise(reinterpret_cast(aligned_addr), size, MADV_DONTNEED); + } +#endif +} + using ::absl::Span; // Names of the signature runners, used to get the signature runners from the @@ -88,7 +117,7 @@ absl::Status InitializeEmbeddingLookups( std::unique_ptr& per_layer_embedding_lookup) { absl::flat_hash_map end_of_multi_modal_embedding_models; { - auto end_of_audio_model = + absl::StatusOr end_of_audio_model = resources.GetTFLiteModel(ModelType::kTfLiteEndOfAudio); if (end_of_audio_model.ok()) { end_of_multi_modal_embedding_models.insert( @@ -96,7 +125,7 @@ absl::Status InitializeEmbeddingLookups( } } { - auto end_of_vision_model = + absl::StatusOr end_of_vision_model = resources.GetTFLiteModel(ModelType::kTfLiteEndOfVision); if (end_of_vision_model.ok()) { end_of_multi_modal_embedding_models.insert( @@ -104,13 +133,24 @@ absl::Status InitializeEmbeddingLookups( } } - auto text_embedder_model = + absl::StatusOr text_embedder_model = resources.GetTFLiteModel(ModelType::kTfLiteEmbedder); if (text_embedder_model.ok()) { ASSIGN_OR_RETURN( embedding_lookup, EmbeddingLookupManager::Create(env, *text_embedder_model, end_of_multi_modal_embedding_models)); + + // TODO: Ideally we release the individual embedding models, but we don't + // have access to the individual embedding models here. 
+ if (auto is_fully_accelerated = embedding_lookup->IsFullyAccelerated(); + is_fully_accelerated.HasValue() && *is_fully_accelerated) { + RETURN_IF_ERROR( + resources.ReleaseTFLiteModel(ModelType::kTfLiteEndOfAudio)); + RETURN_IF_ERROR( + resources.ReleaseTFLiteModel(ModelType::kTfLiteEndOfVision)); + RETURN_IF_ERROR(resources.ReleaseTFLiteModel(ModelType::kTfLiteEmbedder)); + } } // Create per layer embedding lookups from the resources. @@ -122,14 +162,21 @@ absl::Status InitializeEmbeddingLookups( EmbeddingLookupManager::Create(env, *per_layer_embedder_model, /*fully_supports_multi_modal=*/false)); } + if (per_layer_embedding_lookup != nullptr) { + if (auto is_fully_accelerated = + per_layer_embedding_lookup->IsFullyAccelerated(); + is_fully_accelerated.HasValue() && *is_fully_accelerated) { + RETURN_IF_ERROR( + resources.ReleaseTFLiteModel(ModelType::kTfLitePerLayerEmbedder)); + } + } return absl::OkStatus(); } absl::Status CopyKvCacheBuffers( size_t decode_batch_size, int src_index_to_copy_on_prefill, - const absl::flat_hash_map& - src_kv_cache_buffers, - const absl::flat_hash_map& + const absl::flat_hash_map& src_kv_cache_buffers, + const absl::flat_hash_map& dst_kv_cache_buffers) { for (const auto& [name, src_buffer] : src_kv_cache_buffers) { if (!dst_kv_cache_buffers.contains(name)) { @@ -191,17 +238,62 @@ absl::StatusOr GetDynamicDimIndex(const Model& model, } return absl::InvalidArgumentError("No dynamic dimension found."); } +} // namespace -absl::StatusOr HasDynamicDim(const Model& model, - absl::string_view signature, - absl::string_view tensor_name) { - LITERT_ASSIGN_OR_RETURN(const SimpleSignature& sig, - model.FindSignature(signature)); - LITERT_ASSIGN_OR_RETURN(const SimpleTensor& tensor, - sig.InputTensor(tensor_name)); - LITERT_ASSIGN_OR_RETURN(const RankedTensorType ranked_tensor_type, - tensor.RankedTensorType()); - auto dimensions = ranked_tensor_type.Layout().Dimensions(); +absl::StatusOr 
+LlmLiteRtCompiledModelExecutorBase::CacheTensorMetadata(const Model& model) { + CachedMetadata cached_metadata; + for (int sig_idx = 0; sig_idx < model.GetNumSignatures(); ++sig_idx) { + LITERT_ASSIGN_OR_RETURN(auto sig, model.GetSignature(sig_idx)); + std::string sig_key(sig.Key()); + if (absl::StartsWith(sig_key, kPrefillSignatureRunner)) { + cached_metadata.prefill_signature_key = sig_key; + } + cached_metadata.signature_key_to_idx[sig_key] = sig_idx; + + auto input_names = sig.InputNames(); + std::vector in_names; + for (int in_idx = 0; in_idx < input_names.size(); ++in_idx) { + std::string input_name(input_names[in_idx]); + in_names.push_back(input_name); + + { + LITERT_ASSIGN_OR_RETURN(auto tensor, sig.InputTensor(in_idx)); + LITERT_ASSIGN_OR_RETURN(auto ranked_type, tensor.RankedTensorType()); + TensorMetadata metadata; + metadata.signature_index = sig_idx; + metadata.input_index = in_idx; + metadata.element_type = ranked_type.ElementType(); + auto dims = ranked_type.Layout().Dimensions(); + metadata.dimensions.assign(dims.begin(), dims.end()); + + cached_metadata + .input_tensor_metadata[absl::StrCat(sig_key, ":", input_name)] = + std::move(metadata); + } + } + cached_metadata.input_names_by_sig_idx[sig_idx] = std::move(in_names); + + auto output_names = sig.OutputNames(); + std::vector out_names; + out_names.reserve(output_names.size()); + for (int out_idx = 0; out_idx < output_names.size(); ++out_idx) { + out_names.push_back(std::string(output_names[out_idx])); + } + cached_metadata.output_names_by_sig_idx[sig_idx] = std::move(out_names); + } + return cached_metadata; +} + +absl::StatusOr LlmLiteRtCompiledModelExecutorBase::HasDynamicDim( + absl::string_view signature, absl::string_view tensor_name) { + std::string cache_key = absl::StrCat(signature, ":", tensor_name); + auto it = input_tensor_metadata_.find(cache_key); + if (it == input_tensor_metadata_.end()) { + ABSL_LOG(ERROR) << "Tensor metadata not found for key: " << cache_key; + return 
absl::NotFoundError("Tensor metadata not found"); + } + const auto& dimensions = it->second.dimensions; for (int i = 0; i < dimensions.size(); ++i) { if (dimensions[i] == kDynamicDimValue) { return true; @@ -210,17 +302,16 @@ absl::StatusOr HasDynamicDim(const Model& model, return false; } -absl::Status ResolveDynamicShape(const Model& model, - CompiledModel& compiled_model, - absl::string_view signature, - absl::string_view tensor_name, int new_value) { - LITERT_ASSIGN_OR_RETURN(const SimpleSignature& sig, - model.FindSignature(signature)); - LITERT_ASSIGN_OR_RETURN(const SimpleTensor& tensor, - sig.InputTensor(tensor_name)); - LITERT_ASSIGN_OR_RETURN(const RankedTensorType ranked_tensor_type, - tensor.RankedTensorType()); - auto dimensions = ranked_tensor_type.Layout().Dimensions(); +absl::Status LlmLiteRtCompiledModelExecutorBase::ResolveDynamicShape( + absl::string_view signature, absl::string_view tensor_name, int new_value) { + std::string cache_key = absl::StrCat(signature, ":", tensor_name); + auto it = input_tensor_metadata_.find(cache_key); + if (it == input_tensor_metadata_.end()) { + ABSL_LOG(ERROR) << "Tensor metadata not found for key: " << cache_key; + return absl::NotFoundError("Tensor metadata not found"); + } + const auto& metadata = it->second; + const auto& dimensions = metadata.dimensions; bool has_dynamic_dim = false; std::vector new_shape; @@ -235,13 +326,48 @@ absl::Status ResolveDynamicShape(const Model& model, } if (has_dynamic_dim) { - LITERT_RETURN_IF_ERROR( - compiled_model.ResizeInputTensor(signature, tensor_name, new_shape)); + LITERT_RETURN_IF_ERROR(compiled_model_->ResizeInputTensor( + metadata.signature_index, metadata.input_index, new_shape)); + auto layouts_or = compiled_model_->GetOutputTensorLayouts( + metadata.signature_index, /*update_allocation=*/true); + if (!layouts_or) { + return absl::InternalError( + "Failed to update allocation after resizing tensor."); + } } return absl::OkStatus(); } +absl::StatusOr 
+LlmLiteRtCompiledModelExecutorBase::CreateInputBuffer( + absl::string_view signature, absl::string_view tensor_name) const { + std::string cache_key = absl::StrCat(signature, ":", tensor_name); + auto it = input_tensor_metadata_.find(cache_key); + if (it == input_tensor_metadata_.end()) { + ABSL_LOG(ERROR) << "Tensor metadata not found for key: " << cache_key; + return absl::NotFoundError("Tensor metadata not found"); + } + const auto& metadata = it->second; + + LITERT_ASSIGN_OR_RETURN(litert::TensorBufferRequirements buffer_requirements, + compiled_model_->GetInputBufferRequirements( + metadata.signature_index, metadata.input_index)); + + LITERT_ASSIGN_OR_RETURN(litert::Layout runtime_layout, + compiled_model_->GetInputTensorLayout( + metadata.signature_index, metadata.input_index)); + + litert::RankedTensorType tensor_type(metadata.element_type, + std::move(runtime_layout)); + LITERT_ASSIGN_OR_RETURN(auto buf, + litert::TensorBuffer::CreateManagedFromRequirements( + env_, tensor_type, buffer_requirements)); + return std::move(buf); +} + +namespace { + absl::StatusOr ResizeKVCacheTensorBuffer( Environment& env, TensorBuffer& tensor_buffer, int dynamic_dim_index, int num_entries_to_insert) { @@ -363,18 +489,22 @@ absl::StatusOr CreateFP16OutputBuffer( absl::Status LlmLiteRtCompiledModelExecutorBase::CreatePrefillInputBuffers( absl::string_view prefill_signature, int sequence_length, int context_length, - absl::flat_hash_map& - prefill_input_buffers) { + absl::flat_hash_map& prefill_input_buffers) { auto dyn_shape_resolver = [&](absl::string_view tensor_name) -> absl::Status { - return ResolveDynamicShape(model_, *compiled_model_, prefill_signature, - tensor_name, sequence_length); + ASSIGN_OR_RETURN(bool has_dynamic_dim, + HasDynamicDim(prefill_signature, tensor_name)); + if (has_dynamic_dim) { + RETURN_IF_ERROR( + ResolveDynamicShape(prefill_signature, tensor_name, sequence_length)); + } + return absl::OkStatus(); }; // Create input_token, positions and attn_mask 
buffers after determining // the prefill length. if (!signatures_.input_tokens.empty()) { RETURN_IF_ERROR(dyn_shape_resolver(signatures_.input_tokens)); - auto tokens_buffer = compiled_model_->CreateInputBuffer( - prefill_signature, signatures_.input_tokens); + auto tokens_buffer = + CreateInputBuffer(prefill_signature, signatures_.input_tokens); prefill_input_buffers[signatures_.input_tokens] = std::move(*tokens_buffer); } else { // If input_tokens is empty, we must have input_embeddings. @@ -388,7 +518,7 @@ absl::Status LlmLiteRtCompiledModelExecutorBase::CreatePrefillInputBuffers( "model is not initialized."); } RETURN_IF_ERROR(dyn_shape_resolver(signatures_.input_embeddings.value())); - auto embeddings_buffer = compiled_model_->CreateInputBuffer( + auto embeddings_buffer = CreateInputBuffer( prefill_signature, signatures_.input_embeddings.value()); prefill_input_buffers[signatures_.input_embeddings.value()] = std::move(*embeddings_buffer); @@ -402,36 +532,50 @@ absl::Status LlmLiteRtCompiledModelExecutorBase::CreatePrefillInputBuffers( } RETURN_IF_ERROR( dyn_shape_resolver(signatures_.input_per_layer_embeddings.value())); - auto per_layer_embeddings_buffer = compiled_model_->CreateInputBuffer( + auto per_layer_embeddings_buffer = CreateInputBuffer( prefill_signature, signatures_.input_per_layer_embeddings.value()); prefill_input_buffers[signatures_.input_per_layer_embeddings.value()] = std::move(*per_layer_embeddings_buffer); } } RETURN_IF_ERROR(dyn_shape_resolver(signatures_.input_positions)); - auto positions_buffer = compiled_model_->CreateInputBuffer( - prefill_signature, signatures_.input_positions); + auto positions_buffer = + CreateInputBuffer(prefill_signature, signatures_.input_positions); prefill_input_buffers[signatures_.input_positions] = std::move(*positions_buffer); if (signatures_.input_attn_mask.has_value()) { - ASSIGN_OR_RETURN(bool is_attn_dyn, - HasDynamicDim(model_, prefill_signature, - signatures_.input_attn_mask.value())); + 
ASSIGN_OR_RETURN( + bool is_attn_dyn, + HasDynamicDim(prefill_signature, signatures_.input_attn_mask.value())); if (is_attn_dyn) { std::vector new_shape = {1, 1, sequence_length, context_length}; - LITERT_RETURN_IF_ERROR(compiled_model_->ResizeInputTensor( - prefill_signature, signatures_.input_attn_mask.value(), new_shape)); + auto cache_key = absl::StrCat(prefill_signature, ":", + signatures_.input_attn_mask.value()); + auto it = input_tensor_metadata_.find(cache_key); + if (it != input_tensor_metadata_.end()) { + LITERT_RETURN_IF_ERROR(compiled_model_->ResizeInputTensor( + it->second.signature_index, it->second.input_index, new_shape)); + auto layouts_or = compiled_model_->GetOutputTensorLayouts( + it->second.signature_index, /*update_allocation=*/true); + if (!layouts_or) { + return absl::InternalError( + "Failed to update allocation after resizing tensor."); + } + } else { + return absl::NotFoundError( + absl::StrCat("Tensor metadata not found: ", cache_key)); + } } - auto attn_mask_buffer = compiled_model_->CreateInputBuffer( + auto attn_mask_buffer = CreateInputBuffer( prefill_signature, signatures_.input_attn_mask.value()); prefill_input_buffers[signatures_.input_attn_mask.value()] = std::move(*attn_mask_buffer); } if (signatures_.input_int32_param.has_value()) { gpu_optimized_single_buffer_cache_ = true; - auto param_tensor_buffer = compiled_model_->CreateInputBuffer( + auto param_tensor_buffer = CreateInputBuffer( prefill_signature, signatures_.input_int32_param.value()); prefill_input_buffers[signatures_.input_int32_param.value()] = std::move(*param_tensor_buffer); @@ -546,7 +690,7 @@ absl::Status LlmLiteRtCompiledModelExecutorBase::PrepareFirstPrefillAfterDecode( absl::Status LlmLiteRtCompiledModelExecutorBase::PrefillInternal( absl::string_view prefill_signature, - absl::flat_hash_map& prefill_input_buffers, + absl::flat_hash_map& prefill_input_buffers, Span ids, bool async) { RETURN_IF_ERROR(RollBackProcessedTokens()); @@ -712,30 +856,53 @@ 
absl::Status LlmLiteRtCompiledModelExecutorBase::PrefillInternal( absl::Status LlmLiteRtCompiledModelExecutorBase::BindTensorsAndRunPrefill( absl::string_view prefill_signature, - absl::flat_hash_map& prefill_input_buffers, + absl::flat_hash_map& prefill_input_buffers, bool async) { - absl::flat_hash_map input_buffers; - for (const auto& [input_name, input_buffer] : prefill_input_buffers) { - LITERT_ASSIGN_OR_RETURN(auto input_buffer_dup, input_buffer.Duplicate()); - input_buffers[input_name] = std::move(input_buffer_dup); - } - for (const auto& [input_name, input_buffer] : *input_kv_cache_buffers_) { - LITERT_ASSIGN_OR_RETURN(auto input_buffer_dup, input_buffer.Duplicate()); - input_buffers[input_name] = std::move(input_buffer_dup); + auto sig_it = signature_key_to_idx_.find(prefill_signature); + if (sig_it == signature_key_to_idx_.end()) { + return absl::NotFoundError( + absl::StrCat("Signature key not found: ", prefill_signature)); + } + size_t current_prefill_idx = sig_it->second; + + std::vector vec_input_buffers; + const auto& in_names = input_names_by_sig_idx_[current_prefill_idx]; + vec_input_buffers.reserve(in_names.size()); + for (const auto& name : in_names) { + if (auto it = prefill_input_buffers.find(name); + it != prefill_input_buffers.end()) { + LITERT_ASSIGN_OR_RETURN(auto input_buffer_dup, it->second.Duplicate()); + vec_input_buffers.push_back(std::move(input_buffer_dup)); + } else if (auto it = input_kv_cache_buffers_->find(name); + it != input_kv_cache_buffers_->end()) { + LITERT_ASSIGN_OR_RETURN(auto input_buffer_dup, it->second.Duplicate()); + vec_input_buffers.push_back(std::move(input_buffer_dup)); + } else { + vec_input_buffers.push_back(litert::TensorBuffer::WrapCObject( + env_.GetHolder(), nullptr, litert::OwnHandle::kNo)); + } } - absl::flat_hash_map output_buffers; - for (const auto& [output_name, output_buffer] : *output_kv_cache_buffers_) { - LITERT_ASSIGN_OR_RETURN(auto output_buffer_dup, output_buffer.Duplicate()); - 
output_buffer_dup.ClearEvent(); - output_buffers[output_name] = std::move(output_buffer_dup); + + std::vector vec_output_buffers; + const auto& out_names = output_names_by_sig_idx_[current_prefill_idx]; + vec_output_buffers.reserve(out_names.size()); + for (const auto& name : out_names) { + if (auto it = output_kv_cache_buffers_->find(name); + it != output_kv_cache_buffers_->end()) { + LITERT_ASSIGN_OR_RETURN(auto output_buffer_dup, it->second.Duplicate()); + output_buffer_dup.ClearEvent(); + vec_output_buffers.push_back(std::move(output_buffer_dup)); + } else { + return absl::NotFoundError(absl::StrCat("Missing output buffer: ", name)); + } } if (async) { LITERT_RETURN_IF_ERROR(compiled_model_->RunAsync( - prefill_signature, input_buffers, output_buffers, async)); + current_prefill_idx, vec_input_buffers, vec_output_buffers, async)); } else { - LITERT_RETURN_IF_ERROR( - compiled_model_->Run(prefill_signature, input_buffers, output_buffers)); + LITERT_RETURN_IF_ERROR(compiled_model_->Run( + current_prefill_idx, vec_input_buffers, vec_output_buffers)); } if (!gpu_optimized_single_buffer_cache_) { @@ -916,36 +1083,52 @@ absl::Status LlmLiteRtCompiledModelExecutorBase::DecodeInternal( absl::Status LlmLiteRtCompiledModelExecutorBase::BindTensorsAndRunDecode( TensorBuffer* output_logits) { - absl::flat_hash_map decode_input_buffers; - for (const auto& [input_name, input_buffer] : decode_input_buffers_) { - LITERT_ASSIGN_OR_RETURN(auto input_buffer_dup, input_buffer.Duplicate()); - decode_input_buffers[input_name] = std::move(input_buffer_dup); - } - for (const auto& [input_name, input_buffer] : *input_kv_cache_buffers_) { - LITERT_ASSIGN_OR_RETURN(auto input_buffer_dup, input_buffer.Duplicate()); - decode_input_buffers[input_name] = std::move(input_buffer_dup); - } - absl::flat_hash_map decode_output_buffers; - for (const auto& [output_name, output_buffer] : decode_output_buffers_) { - // LITERT_ASSIGN_OR_RETURN() causes a compilation error on windows. 
- auto output_buffer_dup = - output_logits && output_name == signatures_.output_logits - ? output_logits->Duplicate() - : output_buffer.Duplicate(); - RET_CHECK(output_buffer_dup) << "Failed to duplicate output buffer."; - output_buffer_dup->ClearEvent(); - decode_output_buffers[output_name] = std::move(*output_buffer_dup); - } - for (const auto& [output_name, output_buffer] : *output_kv_cache_buffers_) { - LITERT_ASSIGN_OR_RETURN(auto output_buffer_dup, output_buffer.Duplicate()); - output_buffer_dup.ClearEvent(); - decode_output_buffers[output_name] = std::move(output_buffer_dup); + + std::vector vec_input_buffers; + const auto& in_names = input_names_by_sig_idx_[decode_signature_idx_]; + vec_input_buffers.reserve(in_names.size()); + for (const auto& name : in_names) { + if (auto it = decode_input_buffers_.find(name); + it != decode_input_buffers_.end()) { + LITERT_ASSIGN_OR_RETURN(auto input_buffer_dup, it->second.Duplicate()); + vec_input_buffers.push_back(std::move(input_buffer_dup)); + } else if (auto it = input_kv_cache_buffers_->find(name); + it != input_kv_cache_buffers_->end()) { + LITERT_ASSIGN_OR_RETURN(auto input_buffer_dup, it->second.Duplicate()); + vec_input_buffers.push_back(std::move(input_buffer_dup)); + } else { + vec_input_buffers.push_back(litert::TensorBuffer::WrapCObject( + env_.GetHolder(), nullptr, litert::OwnHandle::kNo)); + } + } + + std::vector vec_output_buffers; + const auto& out_names = output_names_by_sig_idx_[decode_signature_idx_]; + vec_output_buffers.reserve(out_names.size()); + for (const auto& name : out_names) { + if (output_logits && name == signatures_.output_logits) { + LITERT_ASSIGN_OR_RETURN(auto output_buffer_dup, + output_logits->Duplicate()); + output_buffer_dup.ClearEvent(); + vec_output_buffers.push_back(std::move(output_buffer_dup)); + } else if (auto it = decode_output_buffers_.find(name); + it != decode_output_buffers_.end()) { + LITERT_ASSIGN_OR_RETURN(auto output_buffer_dup, it->second.Duplicate()); + 
output_buffer_dup.ClearEvent(); + vec_output_buffers.push_back(std::move(output_buffer_dup)); + } else if (auto it = output_kv_cache_buffers_->find(name); + it != output_kv_cache_buffers_->end()) { + LITERT_ASSIGN_OR_RETURN(auto output_buffer_dup, it->second.Duplicate()); + output_buffer_dup.ClearEvent(); + vec_output_buffers.push_back(std::move(output_buffer_dup)); + } else { + return absl::NotFoundError(absl::StrCat("Missing output buffer: ", name)); + } } bool async = true; - LITERT_RETURN_IF_ERROR( - compiled_model_->RunAsync(kDecodeSignatureRunner, decode_input_buffers, - decode_output_buffers, async)); + LITERT_RETURN_IF_ERROR(compiled_model_->RunAsync( + decode_signature_idx_, vec_input_buffers, vec_output_buffers, async)); if (!gpu_optimized_single_buffer_cache_) { std::swap(input_kv_cache_buffers_, output_kv_cache_buffers_); @@ -1263,16 +1446,14 @@ absl::Status LlmLiteRtCompiledModelExecutorBase::InitializeSampler( if (sampler_handles_input_) { ABSL_LOG(INFO) << "Sampler will handle decode input tensors."; if (!decode_prev_input_pos_) { - LITERT_ASSIGN_OR_RETURN( - decode_prev_input_pos_, - compiled_model_->CreateInputBuffer(kDecodeSignatureRunner, - signatures_.input_positions)); + LITERT_ASSIGN_OR_RETURN(decode_prev_input_pos_, + CreateInputBuffer(kDecodeSignatureRunner, + signatures_.input_positions)); } if (!decode_prev_mask_ && signatures_.input_attn_mask.has_value()) { - LITERT_ASSIGN_OR_RETURN( - decode_prev_mask_, - compiled_model_->CreateInputBuffer(kDecodeSignatureRunner, - *signatures_.input_attn_mask)); + LITERT_ASSIGN_OR_RETURN(decode_prev_mask_, + CreateInputBuffer(kDecodeSignatureRunner, + *signatures_.input_attn_mask)); } // Set, then reset the input handling to get the underlying model ready, but // not to bind the input tensors. 
@@ -1547,10 +1728,10 @@ LlmLiteRtCompiledModelExecutorStatic::Create( compiled_model = std::make_unique(std::move(compiled_model_tmp)); } - absl::flat_hash_map decode_input_buffers; - absl::flat_hash_map decode_output_buffers; - absl::flat_hash_map input_kv_cache_buffers; - absl::flat_hash_map output_kv_cache_buffers; + absl::flat_hash_map decode_input_buffers; + absl::flat_hash_map decode_output_buffers; + absl::flat_hash_map input_kv_cache_buffers; + absl::flat_hash_map output_kv_cache_buffers; bool clear_kv_cache_before_prefill = !executor_settings.GetAdvancedSettings() || @@ -1653,17 +1834,17 @@ LlmLiteRtCompiledModelExecutorStatic::Create( << "Output logits must be (batch, seq, vocab)"; int batch_size = output_logits_buffer_tensor_type.Layout().Dimensions()[0]; - std::optional> + std::optional> decode_input_kv_cache_buffers; - std::optional> + std::optional> decode_output_kv_cache_buffers; if (batch_size > 1) { ABSL_LOG(INFO) << "Decode batch size is larger than 1. Allocate decode " << "only KV cache buffers."; decode_input_kv_cache_buffers = - absl::flat_hash_map(); + absl::flat_hash_map(); decode_output_kv_cache_buffers = - absl::flat_hash_map(); + absl::flat_hash_map(); for (auto input_name : decode_signature.InputNames()) { if (absl::StartsWith(input_name, kv_cache_k_root_name) || absl::StartsWith(input_name, kv_cache_v_root_name)) { @@ -1712,9 +1893,24 @@ LlmLiteRtCompiledModelExecutorStatic::Create( } } - return absl::WrapUnique(new LlmLiteRtCompiledModelExecutorStatic( - std::move(executor_settings), lrt_env, litert_model, - std::move(compiled_model), std::move(decode_input_buffers), + Expected is_fully_accelerated = compiled_model->IsFullyAccelerated(); + ASSIGN_OR_RETURN(auto cached_metadata, CacheTensorMetadata(*litert_model)); + + if (is_fully_accelerated.HasValue() && *is_fully_accelerated) { + // Dynamically advise kernel to recycle physical RAM pages instead of + // destructively releasing the underlying memory map, maximizing system + // 
resource returns while enforcing safe, uninterrupted TFLite execution + // contexts globally. + auto buf_status = + resources.GetTFLiteModelBuffer(ModelType::kTfLitePrefillDecode); + if (buf_status.ok()) { + MadviseMemoryBuffer(*buf_status); + } + } + + auto executor = absl::WrapUnique(new LlmLiteRtCompiledModelExecutorStatic( + std::move(executor_settings), lrt_env, std::move(compiled_model), + std::move(cached_metadata), std::move(decode_input_buffers), std::move(decode_output_buffers), std::move(input_kv_cache_buffers), std::move(output_kv_cache_buffers), std::move(decode_input_kv_cache_buffers), @@ -1722,6 +1918,8 @@ LlmLiteRtCompiledModelExecutorStatic::Create( signatures, batch_size, std::move(cache_path), std::move(embedding_lookup), std::move(per_layer_embedding_lookup), use_fp16_precision, activation_data_type, std::move(mtp_drafter))); + + return executor; } /* ===========================================================================*/ @@ -1785,22 +1983,20 @@ absl::Status LlmLiteRtCompiledModelExecutorDynamic::PrefillInternal( !executor_settings_.GetAdvancedSettings() || executor_settings_.GetAdvancedSettings()->clear_kv_cache_before_prefill; for (const auto& k_cache_input_name : key_cache_input_names_) { - RETURN_IF_ERROR(ResolveDynamicShape(model_, *compiled_model_, "prefill", - k_cache_input_name, prefill_length)); - LITERT_ASSIGN_OR_RETURN( - auto input_buffer, - compiled_model_->CreateInputBuffer("prefill", k_cache_input_name)); + RETURN_IF_ERROR( + ResolveDynamicShape("prefill", k_cache_input_name, prefill_length)); + LITERT_ASSIGN_OR_RETURN(auto input_buffer, + CreateInputBuffer("prefill", k_cache_input_name)); if (clear_kv_cache_before_prefill) { LITERT_RETURN_IF_ERROR(input_buffer.Clear()); } kv_cache_buffers_1_[k_cache_input_name] = std::move(input_buffer); } for (const auto& v_cache_input_name : value_cache_input_names_) { - RETURN_IF_ERROR(ResolveDynamicShape(model_, *compiled_model_, "prefill", - v_cache_input_name, prefill_length)); - 
LITERT_ASSIGN_OR_RETURN( - auto input_buffer, - compiled_model_->CreateInputBuffer("prefill", v_cache_input_name)); + RETURN_IF_ERROR( + ResolveDynamicShape("prefill", v_cache_input_name, prefill_length)); + LITERT_ASSIGN_OR_RETURN(auto input_buffer, + CreateInputBuffer("prefill", v_cache_input_name)); if (clear_kv_cache_before_prefill) { LITERT_RETURN_IF_ERROR(input_buffer.Clear()); } @@ -1822,18 +2018,16 @@ absl::Status LlmLiteRtCompiledModelExecutorDynamic::PrefillInternal( int new_kv_seq_len = kv_length + prefill_length; int entries_to_add = new_kv_seq_len - kv_length; for (const auto& k_cache_input_name : key_cache_input_names_) { - RETURN_IF_ERROR(ResolveDynamicShape(model_, *compiled_model_, "prefill", - k_cache_input_name, - new_kv_seq_len)); + RETURN_IF_ERROR( + ResolveDynamicShape("prefill", k_cache_input_name, new_kv_seq_len)); ASSIGN_OR_RETURN(kv_cache_buffers_1_[k_cache_input_name], ResizeKVCacheTensorBuffer( env_, kv_cache_buffers_1_[k_cache_input_name], key_dynamic_dim_index_, entries_to_add)); } for (const auto& v_cache_input_name : value_cache_input_names_) { - RETURN_IF_ERROR(ResolveDynamicShape(model_, *compiled_model_, "prefill", - v_cache_input_name, - new_kv_seq_len)); + RETURN_IF_ERROR( + ResolveDynamicShape("prefill", v_cache_input_name, new_kv_seq_len)); ASSIGN_OR_RETURN(kv_cache_buffers_1_[v_cache_input_name], ResizeKVCacheTensorBuffer( env_, kv_cache_buffers_1_[v_cache_input_name], @@ -1843,7 +2037,7 @@ absl::Status LlmLiteRtCompiledModelExecutorDynamic::PrefillInternal( } } - absl::flat_hash_map prefill_input_buffers; + absl::flat_hash_map prefill_input_buffers; RETURN_IF_ERROR(CreatePrefillInputBuffers("prefill", prefill_length, kv_length, prefill_input_buffers)); @@ -1873,16 +2067,16 @@ absl::Status LlmLiteRtCompiledModelExecutorDynamic::DecodeInternal( int entries_to_add = kv_increament_size_; int new_kv_len = current_kv_len + entries_to_add; for (const auto& k_cache_input_name : key_cache_input_names_) { - 
RETURN_IF_ERROR(ResolveDynamicShape(model_, *compiled_model_, "decode", - k_cache_input_name, new_kv_len)); + RETURN_IF_ERROR( + ResolveDynamicShape("decode", k_cache_input_name, new_kv_len)); ASSIGN_OR_RETURN(kv_cache_buffers_1_[k_cache_input_name], ResizeKVCacheTensorBuffer( env_, kv_cache_buffers_1_[k_cache_input_name], key_dynamic_dim_index_, entries_to_add)); } for (const auto& v_cache_input_name : value_cache_input_names_) { - RETURN_IF_ERROR(ResolveDynamicShape(model_, *compiled_model_, "decode", - v_cache_input_name, new_kv_len)); + RETURN_IF_ERROR( + ResolveDynamicShape("decode", v_cache_input_name, new_kv_len)); ASSIGN_OR_RETURN(kv_cache_buffers_1_[v_cache_input_name], ResizeKVCacheTensorBuffer( env_, kv_cache_buffers_1_[v_cache_input_name], @@ -1891,13 +2085,11 @@ absl::Status LlmLiteRtCompiledModelExecutorDynamic::DecodeInternal( current_kv_len = new_kv_len; } - RETURN_IF_ERROR(ResolveDynamicShape(model_, *compiled_model_, "decode", - signatures_.input_attn_mask.value(), - current_kv_len)); + RETURN_IF_ERROR(ResolveDynamicShape( + "decode", signatures_.input_attn_mask.value(), current_kv_len)); LITERT_ASSIGN_OR_RETURN( decode_input_buffers_[signatures_.input_attn_mask.value()], - compiled_model_->CreateInputBuffer("decode", - signatures_.input_attn_mask.value())); + CreateInputBuffer("decode", signatures_.input_attn_mask.value())); return LlmLiteRtCompiledModelExecutorBase::DecodeInternal(token, output_logits); @@ -1967,8 +2159,8 @@ LlmLiteRtCompiledModelExecutorDynamic::Create( std::make_unique(std::move(compiled_model_tmp)); } - absl::flat_hash_map decode_input_buffers; - absl::flat_hash_map decode_output_buffers; + absl::flat_hash_map decode_input_buffers; + absl::flat_hash_map decode_output_buffers; LITERT_ASSIGN_OR_RETURN(auto decode_signature, litert_model->FindSignature(kDecodeSignatureRunner)); @@ -2038,15 +2230,34 @@ LlmLiteRtCompiledModelExecutorDynamic::Create( std::unique_ptr per_layer_embedding_lookup; 
RETURN_IF_ERROR(InitializeEmbeddingLookups( lrt_env, resources, embedding_lookup, per_layer_embedding_lookup)); - return absl::WrapUnique(new LlmLiteRtCompiledModelExecutorDynamic( - std::move(executor_settings), lrt_env, litert_model, - std::move(compiled_model), std::move(decode_input_buffers), + + Expected is_fully_accelerated = compiled_model->IsFullyAccelerated(); + ASSIGN_OR_RETURN(auto cached_metadata, CacheTensorMetadata(*litert_model)); + + if (is_fully_accelerated.HasValue() && *is_fully_accelerated) { + // Dynamically advise kernel to recycle physical RAM pages instead of + // destructively releasing the underlying memory map, maximizing system + // resource returns while enforcing safe, uninterrupted TFLite execution + // contexts globally. + auto buf_status = + resources.GetTFLiteModelBuffer(ModelType::kTfLitePrefillDecode); + if (buf_status.ok()) { + MadviseMemoryBuffer(*buf_status); + } + } + + auto executor = absl::WrapUnique(new LlmLiteRtCompiledModelExecutorDynamic( + std::move(executor_settings), lrt_env, std::move(compiled_model), + std::move(cached_metadata), std::move(decode_input_buffers), std::move(decode_output_buffers), prefill_chunk_size, k_dynamic_dim, v_dynamic_dim, kv_increament_size, std::move(key_cache_input_names), std::move(value_cache_input_names), signatures, batch_size, std::move(weight_cache_path), std::move(embedding_lookup), - std::move(per_layer_embedding_lookup), /*use_fp16_precision=*/false, + std::move(per_layer_embedding_lookup), + /*use_fp16_precision=*/false, /*logits_data_type=*/LogitsDataType::FLOAT32)); + + return executor; } } // namespace litert::lm diff --git a/runtime/executor/llm_litert_compiled_model_executor.h b/runtime/executor/llm_litert_compiled_model_executor.h index 0c94ee889..cb999b36e 100644 --- a/runtime/executor/llm_litert_compiled_model_executor.h +++ b/runtime/executor/llm_litert_compiled_model_executor.h @@ -23,7 +23,6 @@ #include #include -#include "absl/base/nullability.h" // from 
@com_google_absl #include "absl/container/flat_hash_map.h" // from @com_google_absl #include "absl/status/status.h" // from @com_google_absl #include "absl/status/statusor.h" // from @com_google_absl @@ -56,6 +55,25 @@ class LlmLiteRtCompiledModelExecutorBase : public LlmExecutor { public: using LlmExecutor::Prefill; + struct TensorMetadata { + size_t signature_index; + size_t input_index; + litert::ElementType element_type; + std::vector dimensions; + }; + + struct CachedMetadata { + std::string prefill_signature_key; + absl::flat_hash_map signature_key_to_idx; + absl::flat_hash_map> + input_names_by_sig_idx; + absl::flat_hash_map> + output_names_by_sig_idx; + absl::flat_hash_map input_tensor_metadata; + }; + + static absl::StatusOr CacheTensorMetadata(const Model& model); + // Input APIs: // Basic API to trigger the "prefill" or "prefix" process. // Input is token ids with shape `[batch, sequence_length]` @@ -131,18 +149,15 @@ class LlmLiteRtCompiledModelExecutorBase : public LlmExecutor { protected: LlmLiteRtCompiledModelExecutorBase( LlmExecutorSettings executor_settings, Environment& env, - const Model* absl_nonnull model, std::unique_ptr compiled_model, - absl::flat_hash_map decode_input_buffers, - absl::flat_hash_map - decode_output_buffers, - absl::flat_hash_map - input_kv_cache_buffers, - absl::flat_hash_map - output_kv_cache_buffers, - std::optional> + CachedMetadata cached_metadata, + absl::flat_hash_map decode_input_buffers, + absl::flat_hash_map decode_output_buffers, + absl::flat_hash_map input_kv_cache_buffers, + absl::flat_hash_map output_kv_cache_buffers, + std::optional> decode_input_kv_cache_buffers, - std::optional> + std::optional> decode_output_kv_cache_buffers, ModelSignatures signatures, int output_batch_size, std::string weight_cache_path, @@ -152,8 +167,16 @@ class LlmLiteRtCompiledModelExecutorBase : public LlmExecutor { std::unique_ptr mtp_drafter) : executor_settings_(std::move(executor_settings)), env_(env), - model_(*model), 
compiled_model_(std::move(compiled_model)), + prefill_signature_key_( + std::move(cached_metadata.prefill_signature_key)), + signature_key_to_idx_(std::move(cached_metadata.signature_key_to_idx)), + input_names_by_sig_idx_( + std::move(cached_metadata.input_names_by_sig_idx)), + output_names_by_sig_idx_( + std::move(cached_metadata.output_names_by_sig_idx)), + input_tensor_metadata_( + std::move(cached_metadata.input_tensor_metadata)), decode_input_buffers_(std::move(decode_input_buffers)), decode_output_buffers_(std::move(decode_output_buffers)), kv_cache_buffers_1_(std::move(input_kv_cache_buffers)), @@ -170,7 +193,7 @@ class LlmLiteRtCompiledModelExecutorBase : public LlmExecutor { logits_data_type_(logits_data_type), mtp_drafter_(std::move(mtp_drafter)) { auto processed_context = std::make_unique( - std::nullopt, absl::flat_hash_map(), + std::nullopt, absl::flat_hash_map(), ProcessedTokens()); auto runtime_config = std::make_unique(); runtime_config->output_heads = output_batch_size; @@ -208,7 +231,7 @@ class LlmLiteRtCompiledModelExecutorBase : public LlmExecutor { // with a certain length synchronously or asynchronously. absl::Status PrefillInternal( absl::string_view prefill_signature, - absl::flat_hash_map& + absl::flat_hash_map& prefill_input_buffers, absl::Span ids, bool async); @@ -216,7 +239,7 @@ class LlmLiteRtCompiledModelExecutorBase : public LlmExecutor { // and run prefill signature. absl::Status BindTensorsAndRunPrefill( absl::string_view prefill_signature, - absl::flat_hash_map& + absl::flat_hash_map& prefill_input_buffers, bool async); @@ -239,8 +262,7 @@ class LlmLiteRtCompiledModelExecutorBase : public LlmExecutor { absl::Status CreatePrefillInputBuffers( absl::string_view prefill_signature, int sequence_length, int context_length, - absl::flat_hash_map& - prefill_input_buffers); + absl::flat_hash_map& prefill_input_buffers); // Fills the input buffer from the unprocessed token. 
absl::Status FillInputBufferWithToken( @@ -270,22 +292,37 @@ class LlmLiteRtCompiledModelExecutorBase : public LlmExecutor { LlmExecutorSettings executor_settings_; Environment& env_; - const Model& model_; std::unique_ptr compiled_model_; - absl::flat_hash_map decode_input_buffers_; - absl::flat_hash_map decode_output_buffers_; + std::string prefill_signature_key_; + size_t decode_signature_idx_ = 0; + + absl::flat_hash_map signature_key_to_idx_; + absl::flat_hash_map> input_names_by_sig_idx_; + absl::flat_hash_map> + output_names_by_sig_idx_; + + absl::flat_hash_map input_tensor_metadata_; + absl::Status ResolveDynamicShape(absl::string_view signature, + absl::string_view tensor_name, + int new_value); + absl::StatusOr HasDynamicDim(absl::string_view signature, + absl::string_view tensor_name); + absl::StatusOr CreateInputBuffer( + absl::string_view signature, absl::string_view tensor_name) const; + + absl::flat_hash_map decode_input_buffers_; + absl::flat_hash_map decode_output_buffers_; // KV cache double buffers because some GPU backends can't allocate one buffer // for both read and write at the same time. - absl::flat_hash_map kv_cache_buffers_1_; - absl::flat_hash_map kv_cache_buffers_2_; - absl::flat_hash_map* input_kv_cache_buffers_; - absl::flat_hash_map* - output_kv_cache_buffers_; + absl::flat_hash_map kv_cache_buffers_1_; + absl::flat_hash_map kv_cache_buffers_2_; + absl::flat_hash_map* input_kv_cache_buffers_; + absl::flat_hash_map* output_kv_cache_buffers_; // KV cache (double) buffers used during decode when output_batch_size_ > 1. - std::optional> + std::optional> decode_kv_cache_buffers_1_; - std::optional> + std::optional> decode_kv_cache_buffers_2_; // The signatures of the model. 
@@ -349,18 +386,15 @@ class LlmLiteRtCompiledModelExecutorStatic private: LlmLiteRtCompiledModelExecutorStatic( LlmExecutorSettings executor_settings, Environment& env, - const Model* absl_nonnull model, std::unique_ptr compiled_model, - absl::flat_hash_map decode_input_buffers, - absl::flat_hash_map - decode_output_buffers, - absl::flat_hash_map - input_kv_cache_buffers, - absl::flat_hash_map - output_kv_cache_buffers, - std::optional> + CachedMetadata cached_metadata, + absl::flat_hash_map decode_input_buffers, + absl::flat_hash_map decode_output_buffers, + absl::flat_hash_map input_kv_cache_buffers, + absl::flat_hash_map output_kv_cache_buffers, + std::optional> decode_input_kv_cache_buffers, - std::optional> + std::optional> decode_output_kv_cache_buffers, SortedPrefillSignatureMap prefill_signature_map, ModelSignatures signatures, int output_batch_size, @@ -372,9 +406,9 @@ class LlmLiteRtCompiledModelExecutorStatic LogitsDataType logits_data_type = LogitsDataType::FLOAT32, std::unique_ptr mtp_drafter = nullptr) : LlmLiteRtCompiledModelExecutorBase( - std::move(executor_settings), env, model, std::move(compiled_model), - std::move(decode_input_buffers), std::move(decode_output_buffers), - std::move(input_kv_cache_buffers), + std::move(executor_settings), env, std::move(compiled_model), + std::move(cached_metadata), std::move(decode_input_buffers), + std::move(decode_output_buffers), std::move(input_kv_cache_buffers), std::move(output_kv_cache_buffers), std::move(decode_input_kv_cache_buffers), std::move(decode_output_kv_cache_buffers), signatures, @@ -388,7 +422,7 @@ class LlmLiteRtCompiledModelExecutorStatic // to refer to them by just their unique name. 
absl::flat_hash_map< std::string /*prefill_signature_name*/, - absl::flat_hash_map> + absl::flat_hash_map> prefill_input_buffers_; std::optional do_prefill_sync_; }; @@ -411,11 +445,10 @@ class LlmLiteRtCompiledModelExecutorDynamic private: LlmLiteRtCompiledModelExecutorDynamic( LlmExecutorSettings executor_settings, Environment& env, - const Model* absl_nonnull model, std::unique_ptr compiled_model, - absl::flat_hash_map decode_input_buffers, - absl::flat_hash_map - decode_output_buffers, + CachedMetadata cached_metadata, + absl::flat_hash_map decode_input_buffers, + absl::flat_hash_map decode_output_buffers, int prefill_chunk_size, int key_dynamic_dim_index, int value_dynamic_dim_index, int kv_increament_size, std::vector key_cache_input_names, @@ -429,8 +462,9 @@ class LlmLiteRtCompiledModelExecutorDynamic LogitsDataType logits_data_type = LogitsDataType::FLOAT32, std::unique_ptr mtp_drafter = nullptr) : LlmLiteRtCompiledModelExecutorBase( - std::move(executor_settings), env, model, std::move(compiled_model), - std::move(decode_input_buffers), std::move(decode_output_buffers), + std::move(executor_settings), env, std::move(compiled_model), + std::move(cached_metadata), std::move(decode_input_buffers), + std::move(decode_output_buffers), /*input_kv_cache_buffers=*/{}, /*output_kv_cache_buffers=*/{}, /*decode_input_kv_cache_buffers=*/std::nullopt, diff --git a/runtime/executor/llm_litert_compiled_model_executor_test.cc b/runtime/executor/llm_litert_compiled_model_executor_test.cc index 69635091a..acb7f281e 100644 --- a/runtime/executor/llm_litert_compiled_model_executor_test.cc +++ b/runtime/executor/llm_litert_compiled_model_executor_test.cc @@ -462,6 +462,10 @@ class TfLiteModelResources : public ModelResources { return absl::UnimplementedError("Unsupported model type"); } + absl::Status ReleaseTFLiteModel(ModelType model_type) override { + return absl::UnimplementedError("ReleaseTFLiteModel not implemented"); + } + absl::StatusOr GetTFLiteModelBuffer( ModelType 
model_type) override { return absl::UnimplementedError("GetTFLiteModelBuffer not implemented."); diff --git a/runtime/executor/llm_litert_mtp_drafter.cc b/runtime/executor/llm_litert_mtp_drafter.cc index d5ba22973..3b8e0ffe4 100644 --- a/runtime/executor/llm_litert_mtp_drafter.cc +++ b/runtime/executor/llm_litert_mtp_drafter.cc @@ -180,10 +180,8 @@ LlmLiteRtMtpDrafter::Create(Environment& env, ModelResources& resources, auto compiled_model, CompiledModel::Create(env, model->Get(), compilation_options)); - absl::flat_hash_map - mtp_drafter_input_buffers; - absl::flat_hash_map - mtp_drafter_output_buffers; + absl::flat_hash_map mtp_drafter_input_buffers; + absl::flat_hash_map mtp_drafter_output_buffers; std::vector kv_cache_input_names; LITERT_ASSIGN_OR_RETURN(SimpleSignature drafter_signature, compiled_model.GetSignature(/*signature_index=*/0)); @@ -210,8 +208,8 @@ LlmLiteRtMtpDrafter::Create(Environment& env, ModelResources& resources, LITERT_ASSIGN_OR_RETURN(SimpleSignature verify_signature, base_model.FindSignature(kVerifySignatureRunner)); - absl::flat_hash_map verifier_input_buffers; - absl::flat_hash_map verifier_output_buffers; + absl::flat_hash_map verifier_input_buffers; + absl::flat_hash_map verifier_output_buffers; int num_draft_steps; { for (absl::string_view input_name : verify_signature.InputNames()) { @@ -277,8 +275,8 @@ LlmLiteRtMtpDrafter::Create(Environment& env, ModelResources& resources, } absl::Status LlmLiteRtMtpDrafter::PrepareDrafterInputBuffers( - int position, absl::flat_hash_map& - output_kv_cache_buffers) { + int position, + absl::flat_hash_map& output_kv_cache_buffers) { for (const auto& kv_cache_input_name : kv_cache_input_names_) { LITERT_ASSIGN_OR_RETURN( auto kv_cache_buffer_dup, @@ -337,9 +335,20 @@ absl::StatusOr> LlmLiteRtMtpDrafter::RunDraftingLoop( } bool async = true; + absl::flat_hash_map draft_input_buffers; + for (const auto& [name, buffer] : active_drafter_input_buffers_) { + LITERT_ASSIGN_OR_RETURN(auto buffer_dup, 
buffer.Duplicate()); + draft_input_buffers[name] = std::move(buffer_dup); + } + absl::flat_hash_map draft_output_buffers; + for (const auto& [name, buffer] : active_drafter_output_buffers_) { + LITERT_ASSIGN_OR_RETURN(auto buffer_dup, buffer.Duplicate()); + draft_output_buffers[name] = std::move(buffer_dup); + } + LITERT_RETURN_IF_ERROR(mtp_drafter_model_.RunAsync( - drafter_signature_.Key(), active_drafter_input_buffers_, - active_drafter_output_buffers_, async)); + drafter_signature_.Key(), draft_input_buffers, draft_output_buffers, + async)); RETURN_IF_ERROR(drafter_sampler_->SampleToIdAndScoreBuffer( active_drafter_output_buffers_["logits"], drafter_id_tensor_, @@ -357,8 +366,7 @@ absl::StatusOr> LlmLiteRtMtpDrafter::RunDraftingLoop( absl::Status LlmLiteRtMtpDrafter::PrepareVerifierInputBuffers( int position, int token_id, const std::vector& drafted_tokens, - absl::flat_hash_map& - input_kv_cache_buffers) { + absl::flat_hash_map& input_kv_cache_buffers) { { LITERT_ASSIGN_OR_RETURN( auto verifier_input_pos_lock_and_addr, @@ -404,8 +412,7 @@ absl::Status LlmLiteRtMtpDrafter::PrepareVerifierInputBuffers( } absl::Status LlmLiteRtMtpDrafter::PrepareVerifierOutputBuffers( - absl::flat_hash_map& - output_kv_cache_buffers) { + absl::flat_hash_map& output_kv_cache_buffers) { for (const auto& [output_name, output_buffer] : output_kv_cache_buffers) { LITERT_ASSIGN_OR_RETURN(auto output_buffer_dup, output_buffer.Duplicate()); active_verifier_output_buffers_[output_name] = std::move(output_buffer_dup); @@ -418,9 +425,20 @@ absl::Status LlmLiteRtMtpDrafter::PrepareVerifierOutputBuffers( absl::StatusOr> LlmLiteRtMtpDrafter::RunVerification() { bool async = true; - LITERT_RETURN_IF_ERROR(base_model_.RunAsync( - verify_signature_.Key(), active_verifier_input_buffers_, - active_verifier_output_buffers_, async)); + absl::flat_hash_map verify_input_buffers; + for (const auto& [name, buffer] : active_verifier_input_buffers_) { + LITERT_ASSIGN_OR_RETURN(auto buffer_dup, 
buffer.Duplicate()); + verify_input_buffers[name] = std::move(buffer_dup); + } + absl::flat_hash_map verify_output_buffers; + for (const auto& [name, buffer] : active_verifier_output_buffers_) { + LITERT_ASSIGN_OR_RETURN(auto buffer_dup, buffer.Duplicate()); + verify_output_buffers[name] = std::move(buffer_dup); + } + + LITERT_RETURN_IF_ERROR(base_model_.RunAsync(verify_signature_.Key(), + verify_input_buffers, + verify_output_buffers, async)); RETURN_IF_ERROR(verifier_sampler_->SampleToIdAndScoreBuffer( active_verifier_output_buffers_.at("logits"), verifier_id_tensor_, @@ -434,10 +452,8 @@ absl::StatusOr> LlmLiteRtMtpDrafter::RunVerification() { absl::StatusOr>> LlmLiteRtMtpDrafter::Draft( int position, int token_id, std::optional activations, - absl::flat_hash_map& - input_kv_cache_buffers, - absl::flat_hash_map& - output_kv_cache_buffers) { + absl::flat_hash_map& input_kv_cache_buffers, + absl::flat_hash_map& output_kv_cache_buffers) { RETURN_IF_ERROR( PrepareDrafterInputBuffers(position - 1, output_kv_cache_buffers)); diff --git a/runtime/executor/llm_litert_mtp_drafter.h b/runtime/executor/llm_litert_mtp_drafter.h index 166d84a80..2428f35fb 100644 --- a/runtime/executor/llm_litert_mtp_drafter.h +++ b/runtime/executor/llm_litert_mtp_drafter.h @@ -68,30 +68,23 @@ class LlmLiteRtMtpDrafter { // [batch_size, num_tokens]. 
absl::StatusOr>> Draft( int position, int token_id, std::optional activations, - absl::flat_hash_map& - input_kv_cache_buffers, - absl::flat_hash_map& - output_kv_cache_buffers); + absl::flat_hash_map& input_kv_cache_buffers, + absl::flat_hash_map& output_kv_cache_buffers); private: - LlmLiteRtMtpDrafter(CompiledModel mtp_drafter_model, - SimpleSignature drafter_signature, - CompiledModel& base_model, - SimpleSignature verify_signature, - EmbeddingLookupManager& embedding_manager, - EmbeddingLookupManager& ple_manager, - std::unique_ptr drafter_sampler, - std::unique_ptr verifier_sampler, - std::vector kv_cache_input_names, - absl::flat_hash_map - drafter_input_buffers, - absl::flat_hash_map - drafter_output_buffers, - absl::flat_hash_map - verifier_input_buffers, - absl::flat_hash_map - verifier_output_buffers, - int num_draft_steps) + LlmLiteRtMtpDrafter( + CompiledModel mtp_drafter_model, SimpleSignature drafter_signature, + CompiledModel& base_model, SimpleSignature verify_signature, + EmbeddingLookupManager& embedding_manager, + EmbeddingLookupManager& ple_manager, + std::unique_ptr drafter_sampler, + std::unique_ptr verifier_sampler, + std::vector kv_cache_input_names, + absl::flat_hash_map drafter_input_buffers, + absl::flat_hash_map drafter_output_buffers, + absl::flat_hash_map verifier_input_buffers, + absl::flat_hash_map verifier_output_buffers, + int num_draft_steps) : mtp_drafter_model_(std::move(mtp_drafter_model)), drafter_signature_(std::move(drafter_signature)), base_model_(base_model), @@ -125,8 +118,8 @@ class LlmLiteRtMtpDrafter { } absl::Status PrepareDrafterInputBuffers( - int position, absl::flat_hash_map& - output_kv_cache_buffers); + int position, + absl::flat_hash_map& output_kv_cache_buffers); absl::Status PrepareDrafterOutputBuffers(); @@ -135,12 +128,10 @@ class LlmLiteRtMtpDrafter { absl::Status PrepareVerifierInputBuffers( int position, int token_id, const std::vector& drafted_tokens, - absl::flat_hash_map& - input_kv_cache_buffers); + 
absl::flat_hash_map& input_kv_cache_buffers); absl::Status PrepareVerifierOutputBuffers( - absl::flat_hash_map& - output_kv_cache_buffers); + absl::flat_hash_map& output_kv_cache_buffers); absl::StatusOr> RunVerification(); @@ -168,29 +159,26 @@ class LlmLiteRtMtpDrafter { // - input_position [batch, sequence_length] // - mask [batch, 1, sequence_length = 1, context] // - activations [batch, sequence_length = 1, hidden_size * 2] - absl::flat_hash_map drafter_input_buffers_; + absl::flat_hash_map drafter_input_buffers_; // - logits [batch, sequence_length, vocab_size] // - projected_logits [batch, sequence_length, hidden_size] - absl::flat_hash_map drafter_output_buffers_; + absl::flat_hash_map drafter_output_buffers_; // Verifier owned buffers. // - input_position [batch, draft_steps + 1] // - mask [batch, 1, draft_steps + 1, context] // - embeddings [batch, draft_steps + 1, hidden_size] // - per_layer_embeddings [batch, draft_steps + 1, ...] - absl::flat_hash_map verifier_input_buffers_; + absl::flat_hash_map verifier_input_buffers_; // - logits [batch, draft_steps + 1, vocab_size] // - activations [batch, draft_steps + 1, hidden_size] - absl::flat_hash_map verifier_output_buffers_; + absl::flat_hash_map verifier_output_buffers_; // Cached maps for Run to avoid map creation overhead. - absl::flat_hash_map - active_drafter_input_buffers_; - absl::flat_hash_map - active_drafter_output_buffers_; - absl::flat_hash_map - active_verifier_input_buffers_; - absl::flat_hash_map + absl::flat_hash_map active_drafter_input_buffers_; + absl::flat_hash_map active_drafter_output_buffers_; + absl::flat_hash_map active_verifier_input_buffers_; + absl::flat_hash_map active_verifier_output_buffers_; // Pre-allocated temporary tensors for sampling. 
diff --git a/runtime/executor/llm_litert_npu_compiled_model_executor.cc b/runtime/executor/llm_litert_npu_compiled_model_executor.cc index 26b1ab66f..8bc5bbfab 100644 --- a/runtime/executor/llm_litert_npu_compiled_model_executor.cc +++ b/runtime/executor/llm_litert_npu_compiled_model_executor.cc @@ -830,8 +830,7 @@ LlmLiteRtNpuCompiledModelExecutor::CreateRopeContextWithBufferSharing( } absl::Status LlmLiteRtNpuCompiledModelExecutor::AllocateTransformerBuffers( - litert::Environment& env, const litert::Model* transformer_model, - CompiledModel& llm_compiled_model, + litert::Environment& env, CompiledModel& llm_compiled_model, absl::flat_hash_map& gemma_prefill_input_buffers, absl::flat_hash_map& @@ -846,10 +845,15 @@ absl::Status LlmLiteRtNpuCompiledModelExecutor::AllocateTransformerBuffers( decode_output_kv_cache_slice_buffers, absl::flat_hash_map& verify_output_kv_cache_slice_buffers) { - auto prefill_signature = transformer_model->FindSignature(kPrefillSignature); + LITERT_ASSIGN_OR_RETURN( + auto prefill_input_names, + llm_compiled_model.GetSignatureInputNames(kPrefillSignature)); + LITERT_ASSIGN_OR_RETURN( + auto prefill_output_names, + llm_compiled_model.GetSignatureOutputNames(kPrefillSignature)); // Create input buffers for prefill signature. - for (auto input_name : prefill_signature->InputNames()) { + for (auto input_name : prefill_input_names) { if (absl::StartsWith(input_name, kv_cache_k_root_name) || absl::StartsWith(input_name, kv_cache_v_root_name)) { LITERT_ASSIGN_OR_RETURN( @@ -865,8 +869,13 @@ absl::Status LlmLiteRtNpuCompiledModelExecutor::AllocateTransformerBuffers( } // Create input buffers for decode signature. Skip kv cache input buffers as // they are already created in the prefill signature. 
- auto decode_signature = transformer_model->FindSignature(kDecodeSignature); - for (auto input_name : decode_signature->InputNames()) { + LITERT_ASSIGN_OR_RETURN( + auto decode_input_names, + llm_compiled_model.GetSignatureInputNames(kDecodeSignature)); + LITERT_ASSIGN_OR_RETURN( + auto decode_output_names, + llm_compiled_model.GetSignatureOutputNames(kDecodeSignature)); + for (auto input_name : decode_input_names) { if (absl::StartsWith(input_name, kv_cache_k_root_name) || absl::StartsWith(input_name, kv_cache_v_root_name)) { // Create the input kv cache buffer for the decode signature if it is not @@ -886,7 +895,7 @@ absl::Status LlmLiteRtNpuCompiledModelExecutor::AllocateTransformerBuffers( } // Create output buffers for prefill signature. - for (auto output_name : prefill_signature->OutputNames()) { + for (auto output_name : prefill_output_names) { if (absl::StartsWith(output_name, kv_cache_slice_k_root_name) || absl::StartsWith(output_name, kv_cache_slice_v_root_name)) { LITERT_ASSIGN_OR_RETURN( @@ -896,7 +905,7 @@ absl::Status LlmLiteRtNpuCompiledModelExecutor::AllocateTransformerBuffers( } } // Create output buffers for decode signature. - for (auto output_name : decode_signature->OutputNames()) { + for (auto output_name : decode_output_names) { if (absl::StartsWith(output_name, kv_cache_slice_k_root_name) || absl::StartsWith(output_name, kv_cache_slice_v_root_name)) { LITERT_ASSIGN_OR_RETURN( @@ -906,16 +915,21 @@ absl::Status LlmLiteRtNpuCompiledModelExecutor::AllocateTransformerBuffers( } // Create input/output buffers for verify signature if it exists. - auto verify_signature = - transformer_model->FindSignature(LlmSignatures::kVerifyLlm); - if (verify_signature) { - for (auto input_name : verify_signature->InputNames()) { + if (auto verify_input_names_res = llm_compiled_model.GetSignatureInputNames( + LlmSignatures::kVerifyLlm)) { + ABSL_LOG(INFO) << "Verify signature found. 
Inputs:"; + for (auto input_name : *verify_input_names_res) { + ABSL_LOG(INFO) << " - " << input_name; LITERT_ASSIGN_OR_RETURN(gemma_verify_input_buffers[input_name], llm_compiled_model.CreateInputBuffer( LlmSignatures::kVerifyLlm, input_name)); gemma_verify_input_buffers[input_name].Clear(); } - for (auto output_name : verify_signature->OutputNames()) { + + LITERT_ASSIGN_OR_RETURN( + auto verify_output_names, + llm_compiled_model.GetSignatureOutputNames(LlmSignatures::kVerifyLlm)); + for (auto output_name : verify_output_names) { if (absl::StartsWith(output_name, kv_cache_slice_k_root_name) || absl::StartsWith(output_name, kv_cache_slice_v_root_name)) { LITERT_ASSIGN_OR_RETURN( @@ -2742,8 +2756,14 @@ LlmLiteRtNpuCompiledModelExecutor::CreateForModelHasPerLayerEmbedding( absl::flat_hash_map verify_output_kv_cache_slice_buffers; + if (auto is_fully_accelerated = llm_compiled_model.IsFullyAccelerated(); + is_fully_accelerated.HasValue() && *is_fully_accelerated) { + RETURN_IF_ERROR( + resources.ReleaseTFLiteModel(ModelType::kTfLitePrefillDecode)); + } + RETURN_IF_ERROR(AllocateTransformerBuffers( - env, transformer_model, llm_compiled_model, gemma_prefill_input_buffers, + env, llm_compiled_model, gemma_prefill_input_buffers, gemma_decode_input_buffers, gemma_verify_input_buffers, input_kv_cache_buffers, prefill_output_kv_cache_slice_buffers, decode_output_kv_cache_slice_buffers, @@ -2772,6 +2792,12 @@ LlmLiteRtNpuCompiledModelExecutor::CreateForModelHasPerLayerEmbedding( verify_output_kv_cache_slice_buffers, gemma_prefill_input_buffers, gemma_decode_input_buffers, gemma_verify_input_buffers)); + if (auto is_fully_accelerated = llm_compiled_model.IsFullyAccelerated(); + is_fully_accelerated.HasValue() && *is_fully_accelerated) { + RETURN_IF_ERROR( + resources.ReleaseTFLiteModel(ModelType::kTfLitePrefillDecode)); + } + LITERT_ASSIGN_OR_RETURN(auto npu_auxiliary_lrt_model, resources.GetTFLiteModel(ModelType::kTfLiteAux)); @@ -2780,6 +2806,11 @@ 
LlmLiteRtNpuCompiledModelExecutor::CreateForModelHasPerLayerEmbedding( CreateNpuAuxiliaryContext(env, *npu_auxiliary_lrt_model, executor_settings)); + if (auto is_fully_accelerated = llm_compiled_model.IsFullyAccelerated(); + is_fully_accelerated.HasValue() && *is_fully_accelerated) { + RETURN_IF_ERROR(resources.ReleaseTFLiteModel(ModelType::kTfLiteAux)); + } + LITERT_ASSIGN_OR_RETURN( auto mask_context, CreateMaskContextWithBufferSharing( @@ -2855,10 +2886,15 @@ LlmLiteRtNpuCompiledModelExecutor::CreateForModelHasPerLayerEmbedding( add_multi_modal_end_model(ModelType::kTfLiteEndOfVision, litert::lm::ExecutorVisionData::kEndToken); + absl::flat_hash_map raw_end_of_multi_modal_models; + for (const auto& [token, model] : end_of_multi_modal_embedding_models) { + raw_end_of_multi_modal_models[token] = model; + } + LITERT_ASSIGN_OR_RETURN( std::unique_ptr embedding_lookup_manager, EmbeddingLookupManager::Create(env, embedder_lrt_model, - end_of_multi_modal_embedding_models, true, + raw_end_of_multi_modal_models, true, "decode_embedder")); bool use_hw_ple_for_npu = false; @@ -2867,6 +2903,10 @@ LlmLiteRtNpuCompiledModelExecutor::CreateForModelHasPerLayerEmbedding( use_hw_ple_for_npu = npu_config_status->use_hw_ple_for_npu; } + if (auto is_fully_accelerated = llm_compiled_model.IsFullyAccelerated(); + is_fully_accelerated.HasValue() && *is_fully_accelerated) { + RETURN_IF_ERROR(resources.ReleaseTFLiteModel(ModelType::kTfLiteEmbedder)); + } std::optional embedder_per_layer_context = std::nullopt; @@ -2965,6 +3005,14 @@ LlmLiteRtNpuCompiledModelExecutor::CreateForModelHasPerLayerEmbedding( RETURN_IF_ERROR(WarmupDrafterInference(drafter_context.value(), drafter_aux_context.value())); + + if (auto is_fully_accelerated = llm_compiled_model.IsFullyAccelerated(); + is_fully_accelerated.HasValue() && *is_fully_accelerated) { + RETURN_IF_ERROR( + resources.ReleaseTFLiteModel(ModelType::kTfLiteMtpDrafter)); + + 
RETURN_IF_ERROR(resources.ReleaseTFLiteModel(ModelType::kTfLiteMtpAux)); + } } } @@ -3013,8 +3061,14 @@ LlmLiteRtNpuCompiledModelExecutor::CreateForModelWithoutPerLayerEmbedding( absl::flat_hash_map verify_output_kv_cache_slice_buffers; + if (auto is_fully_accelerated = llm_compiled_model.IsFullyAccelerated(); + is_fully_accelerated.HasValue() && *is_fully_accelerated) { + RETURN_IF_ERROR( + resources.ReleaseTFLiteModel(ModelType::kTfLitePrefillDecode)); + } + RETURN_IF_ERROR(AllocateTransformerBuffers( - env, transformer_model, llm_compiled_model, gemma_prefill_input_buffers, + env, llm_compiled_model, gemma_prefill_input_buffers, gemma_decode_input_buffers, gemma_verify_input_buffers, input_kv_cache_buffers, prefill_output_kv_cache_slice_buffers, decode_output_kv_cache_slice_buffers, @@ -3063,6 +3117,12 @@ LlmLiteRtNpuCompiledModelExecutor::CreateForModelWithoutPerLayerEmbedding( llm_inference_context.decode_input_buffers[cache_v17] = std::move(buffer_v); } + if (auto is_fully_accelerated = llm_compiled_model.IsFullyAccelerated(); + is_fully_accelerated.HasValue() && *is_fully_accelerated) { + RETURN_IF_ERROR( + resources.ReleaseTFLiteModel(ModelType::kTfLitePrefillDecode)); + } + LITERT_ASSIGN_OR_RETURN(auto npu_auxiliary_lrt_model, resources.GetTFLiteModel(ModelType::kTfLiteAux)); @@ -3071,6 +3131,11 @@ LlmLiteRtNpuCompiledModelExecutor::CreateForModelWithoutPerLayerEmbedding( CreateNpuAuxiliaryContext(env, *npu_auxiliary_lrt_model, executor_settings)); + if (auto is_fully_accelerated = llm_compiled_model.IsFullyAccelerated(); + is_fully_accelerated.HasValue() && *is_fully_accelerated) { + RETURN_IF_ERROR(resources.ReleaseTFLiteModel(ModelType::kTfLiteAux)); + } + LITERT_ASSIGN_OR_RETURN( auto mask_context, CreateMaskContextWithBufferSharing( @@ -3154,6 +3219,17 @@ LlmLiteRtNpuCompiledModelExecutor::CreateForModelWithoutPerLayerEmbedding( EmbeddingLookupManager::Create(env, embedder_lrt_model, end_of_multi_modal_embedding_models, true, "decode_embedder")); + + 
if (auto is_fully_accelerated = llm_compiled_model.IsFullyAccelerated(); + is_fully_accelerated.HasValue() && *is_fully_accelerated) { + RETURN_IF_ERROR(resources.ReleaseTFLiteModel(ModelType::kTfLiteEmbedder)); + + RETURN_IF_ERROR( + resources.ReleaseTFLiteModel(ModelType::kTfLiteEndOfAudio)); + RETURN_IF_ERROR( + resources.ReleaseTFLiteModel(ModelType::kTfLiteEndOfVision)); + end_of_multi_modal_embedding_models.clear(); + } } SpeculativeDecodingType speculative_decoding_type = diff --git a/runtime/executor/llm_litert_npu_compiled_model_executor.h b/runtime/executor/llm_litert_npu_compiled_model_executor.h index 7c58ceed3..2c1c2923e 100644 --- a/runtime/executor/llm_litert_npu_compiled_model_executor.h +++ b/runtime/executor/llm_litert_npu_compiled_model_executor.h @@ -560,8 +560,7 @@ class LlmLiteRtNpuCompiledModelExecutor : public LlmExecutor { drafter_aux_output_buffers); static absl::Status AllocateTransformerBuffers( - litert::Environment& env, const litert::Model* transformer_model, - CompiledModel& llm_compiled_model, + litert::Environment& env, CompiledModel& llm_compiled_model, absl::flat_hash_map& gemma_prefill_input_buffers, absl::flat_hash_map& diff --git a/runtime/executor/llm_processed_context.h b/runtime/executor/llm_processed_context.h index f2cb79484..4e6c0869d 100644 --- a/runtime/executor/llm_processed_context.h +++ b/runtime/executor/llm_processed_context.h @@ -17,10 +17,10 @@ #include #include +#include #include #include "absl/container/flat_hash_map.h" // from @com_google_absl -#include "absl/strings/string_view.h" // from @com_google_absl #include "litert/cc/litert_tensor_buffer.h" // from @litert #include "runtime/executor/llm_executor_io_types.h" #include "runtime/executor/llm_executor_processed_tokens.h" @@ -34,8 +34,7 @@ class LlmProcessedContext : public ProcessedContext { public: explicit LlmProcessedContext( std::optional lora_id, - absl::flat_hash_map - kv_cache_buffers, + absl::flat_hash_map kv_cache_buffers, 
::litert::lm::ProcessedTokens processed_tokens = {}) : lora_id_(lora_id), processed_tokens_(std::move(processed_tokens)), @@ -47,16 +46,14 @@ class LlmProcessedContext : public ProcessedContext { } ProcessedTokens& processed_tokens() override { return processed_tokens_; } - absl::flat_hash_map& - kv_cache_buffers() { + absl::flat_hash_map& kv_cache_buffers() { return kv_cache_buffers_; } private: std::optional lora_id_; ProcessedTokens processed_tokens_; - absl::flat_hash_map - kv_cache_buffers_; + absl::flat_hash_map kv_cache_buffers_; }; } // namespace litert::lm diff --git a/runtime/executor/magic_number_configs_helper_test.cc b/runtime/executor/magic_number_configs_helper_test.cc index f66679e06..def0845ad 100644 --- a/runtime/executor/magic_number_configs_helper_test.cc +++ b/runtime/executor/magic_number_configs_helper_test.cc @@ -24,6 +24,7 @@ #include #include +#include "absl/status/status.h" // from @com_google_absl #include "absl/status/statusor.h" // from @com_google_absl #include "absl/strings/string_view.h" // from @com_google_absl #include "litert/cc/litert_macros.h" // from @litert @@ -112,6 +113,8 @@ class ModelResourcesMock : public ModelResources { (), (override)); MOCK_METHOD((absl::StatusOr>), GetWeightsSectionOffset, (ModelType model_type), (override)); + MOCK_METHOD(absl::Status, ReleaseTFLiteModel, (ModelType model_type), + (override)); absl::StatusOr GetTFLiteModel( ModelType model_type) override { diff --git a/runtime/executor/vision_litert_compiled_model_executor.cc b/runtime/executor/vision_litert_compiled_model_executor.cc index e121ecf42..3cc3c7bcc 100644 --- a/runtime/executor/vision_litert_compiled_model_executor.cc +++ b/runtime/executor/vision_litert_compiled_model_executor.cc @@ -192,11 +192,12 @@ VisionLiteRtCompiledModelExecutor::VisionEncoder::Create( const VisionExecutorProperties& vision_executor_properties) { auto handler = std::unique_ptr(new VisionEncoder( env, model, vision_executor_settings, vision_executor_properties)); 
- RETURN_IF_ERROR(handler->Initialize()); + RETURN_IF_ERROR(handler->Initialize(*model)); return handler; } -absl::Status VisionLiteRtCompiledModelExecutor::VisionEncoder::Initialize() { +absl::Status VisionLiteRtCompiledModelExecutor::VisionEncoder::Initialize( + const Model& model) { // TODO(b/405424188): - Add support for NPU backends. LITERT_ASSIGN_OR_RETURN(auto options, Options::Create()); auto weight_cache_file = vision_executor_settings_.GetWeightCacheFile( @@ -269,8 +270,13 @@ absl::Status VisionLiteRtCompiledModelExecutor::VisionEncoder::Initialize() { } LITERT_ASSIGN_OR_RETURN(compiled_model_, - CompiledModel::Create(env_, model_.Get(), options)); + CompiledModel::Create(env_, model.Get(), options)); if (!vision_executor_properties_.patch_num_shrink_factor.has_value()) { + if (auto num_signatures = model.GetNumSignatures(); num_signatures != 1) { + return absl::InvalidArgumentError(absl::StrCat( + "The Vision Encoder model must have exactly one signature but got ", + num_signatures)); + } // Only create input buffer at initialization for non-VIT models. LITERT_ASSIGN_OR_RETURN(input_buffers_, compiled_model_.CreateInputBuffers(0)); @@ -288,11 +294,12 @@ VisionLiteRtCompiledModelExecutor::VisionAdapter::Create( const VisionExecutorProperties& vision_executor_properties) { auto handler = std::unique_ptr(new VisionAdapter( env, model, vision_executor_settings, vision_executor_properties)); - RETURN_IF_ERROR(handler->Initialize()); + RETURN_IF_ERROR(handler->Initialize(*model)); return handler; } -absl::Status VisionLiteRtCompiledModelExecutor::VisionAdapter::Initialize() { +absl::Status VisionLiteRtCompiledModelExecutor::VisionAdapter::Initialize( + const Model& model) { // TODO(b/405424188): - Add support for NPU backends. 
LITERT_ASSIGN_OR_RETURN(auto options, Options::Create()); auto weight_cache_file = vision_executor_settings_.GetWeightCacheFile( @@ -329,7 +336,7 @@ absl::Status VisionLiteRtCompiledModelExecutor::VisionAdapter::Initialize() { } LITERT_ASSIGN_OR_RETURN(compiled_model_, - CompiledModel::Create(env_, model_.Get(), options)); + CompiledModel::Create(env_, model.Get(), options)); // This check verifies if signature 0 of the adapter model contains any // inputs. This is used to infer whether input buffers should be created at // initialization time (for single-signature models that use signature 0 by @@ -337,7 +344,7 @@ absl::Status VisionLiteRtCompiledModelExecutor::VisionAdapter::Initialize() { // input buffers on-demand in `Encode` for a specific signature). This is a // more direct check than relying on `patch_num_shrink_factor` which was // previously used to detect multi-signature models. - auto signature_or = model_.GetSignature(0); + auto signature_or = model.GetSignature(0); if (signature_or.HasValue() && !signature_or->InputNames().empty()) { LITERT_ASSIGN_OR_RETURN(input_buffers_, compiled_model_.CreateInputBuffers(0)); @@ -373,16 +380,6 @@ litert::lm::VisionLiteRtCompiledModelExecutor::Create( auto vision_executor_properties, GetVisionExecutorPropertiesFromModelResources(*resources.get())); - ASSIGN_OR_RETURN( - auto vision_encoder, - VisionEncoder::Create(env, vision_encoder_model, vision_executor_settings, - vision_executor_properties)); - - ASSIGN_OR_RETURN( - auto vision_adapter, - VisionAdapter::Create(env, vision_adapter_model, vision_executor_settings, - vision_executor_properties)); - LITERT_ASSIGN_OR_RETURN(auto tensor_type, vision_encoder_model->GetInputTensorType(0, 0)); const auto& dimensions = tensor_type.Layout().Dimensions(); @@ -400,6 +397,16 @@ litert::lm::VisionLiteRtCompiledModelExecutor::Create( auto expected_input_dimension = std::vector(dimensions.begin(), dimensions.end()); + ASSIGN_OR_RETURN( + auto vision_encoder, + 
VisionEncoder::Create(env, vision_encoder_model, vision_executor_settings, + vision_executor_properties)); + + ASSIGN_OR_RETURN( + auto vision_adapter, + VisionAdapter::Create(env, vision_adapter_model, vision_executor_settings, + vision_executor_properties)); + return absl::WrapUnique(new VisionLiteRtCompiledModelExecutor( vision_executor_settings, env, std::move(resources), std::move(vision_encoder), std::move(vision_adapter), diff --git a/runtime/executor/vision_litert_compiled_model_executor.h b/runtime/executor/vision_litert_compiled_model_executor.h index 928e4fabd..5d40d8827 100644 --- a/runtime/executor/vision_litert_compiled_model_executor.h +++ b/runtime/executor/vision_litert_compiled_model_executor.h @@ -90,7 +90,7 @@ class VisionLiteRtCompiledModelExecutor : public VisionExecutor { // Initialize the VisionEncoder, which will create the input and output // buffers for the vision encoder model. - absl::Status Initialize(); + absl::Status Initialize(const Model& model); // Returns the CompiledModel for the vision encoder model. const CompiledModel& GetCompiledModel() const { return compiled_model_; } @@ -149,7 +149,6 @@ class VisionLiteRtCompiledModelExecutor : public VisionExecutor { // The vision executor properties. const VisionExecutorProperties& vision_executor_properties_; - // The vision encoder compiled model. CompiledModel compiled_model_; @@ -180,7 +179,7 @@ class VisionLiteRtCompiledModelExecutor : public VisionExecutor { const VisionExecutorProperties& vision_executor_properties); // Initialize the VisionAdapter. - absl::Status Initialize(); + absl::Status Initialize(const Model& model); // Returns the CompiledModel for the vision adapter model. const CompiledModel& GetCompiledModel() const { return compiled_model_; } @@ -226,7 +225,6 @@ class VisionLiteRtCompiledModelExecutor : public VisionExecutor { // The vision executor properties. const VisionExecutorProperties& vision_executor_properties_; - // The vision adapter compiled model. 
CompiledModel compiled_model_; diff --git a/runtime/util/litert_lm_loader.cc b/runtime/util/litert_lm_loader.cc index abed74c7e..f9a2f7d0e 100644 --- a/runtime/util/litert_lm_loader.cc +++ b/runtime/util/litert_lm_loader.cc @@ -14,6 +14,10 @@ #include "runtime/util/litert_lm_loader.h" +#if defined(__linux__) || defined(__ANDROID__) +#include +#endif + #include #include #include @@ -296,6 +300,18 @@ absl::StatusOr> LitertLmLoader::GetSectionLocation( return section_location_it->second; } +absl::Status LitertLmLoader::ReleaseSection(BufferKey buffer_key) { + absl::MutexLock lock(section_buffers_mutex_); + auto it = section_buffers_.find(buffer_key); + if (it != section_buffers_.end()) { + section_buffers_.erase(it); + } + + section_memory_mapped_files_.erase(buffer_key); + section_locations_.erase(buffer_key); + return absl::OkStatus(); +} + std::optional> LitertLmLoader::GetHuggingFaceTokenizer() { auto optional_section_buffer = diff --git a/runtime/util/litert_lm_loader.h b/runtime/util/litert_lm_loader.h index ad7beaa87..776141e63 100644 --- a/runtime/util/litert_lm_loader.h +++ b/runtime/util/litert_lm_loader.h @@ -190,6 +190,11 @@ class LitertLmLoader { absl::StatusOr> GetSectionLocation( BufferKey buffer_key) const; + // Releases the section buffer and the memory mapped file associated with the + // given buffer key. 
+ absl::Status ReleaseSection(BufferKey buffer_key) + ABSL_LOCKS_EXCLUDED(section_buffers_mutex_); + absl::StatusOr> GetScopedFile(); private: diff --git a/runtime/util/memory_mapped_file_posix.cc b/runtime/util/memory_mapped_file_posix.cc index 11202284d..4184d16e9 100644 --- a/runtime/util/memory_mapped_file_posix.cc +++ b/runtime/util/memory_mapped_file_posix.cc @@ -39,6 +39,7 @@ class MemoryMappedFilePosix : public MemoryMappedFile { : length_(length), data_(data) {} ~MemoryMappedFilePosix() override { if (data_) { + ABSL_LOG(INFO) << "munmap address " << data_ << " length " << length_; munmap(data_, length_); } } @@ -112,6 +113,7 @@ absl::StatusOr> MemoryMappedFile::Create( void* data = mmap(nullptr, length, PROT_READ | PROT_WRITE, MAP_PRIVATE, file, offset); + ABSL_LOG(INFO) << "mmap address " << data << " length " << length; RET_CHECK_NE(data, MAP_FAILED) << "Failed to map, error: " << strerror(errno); RET_CHECK_NE(data, nullptr) << "Failed to map."; #ifdef __APPLE__