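Reimplement FastVlmDataProcessor as a standalone data processor instead of a thin wrapper around Gemma3DataProcessor, and remove its start/end-of-image token configuration.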

ai-edge-bot · copybara-github · commit 1789e406b828 · 2026-05-03T15:33:36.000-07:00
LiteRT-LM-PiperOrigin-RevId: 909686676
diff --git a/runtime/conversation/model_data_processor/BUILD b/runtime/conversation/model_data_processor/BUILD
@@ -221,21 +221,24 @@ cc_library(
     srcs = ["fastvlm_data_processor.cc"],
     hdrs = ["fastvlm_data_processor.h"],
     deps = [
+        ":data_utils",
         ":fastvlm_data_processor_config",
-        ":gemma3_data_processor",
-        ":gemma3_data_processor_config",
         ":model_data_processor",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:string_view",
         "@nlohmann_json//:json",
+        "@litert//litert/cc:litert_layout",
         "//runtime/components:prompt_template",
-        "//runtime/components:tokenizer",
-        "//runtime/components/constrained_decoding:constraint",
+        "//runtime/components/preprocessor:image_preprocessor",
+        "//runtime/components/preprocessor:stb_image_preprocessor",
         "//runtime/conversation:io_types",
         "//runtime/engine:io_types",
         "//runtime/util:litert_status_util",
+        "//runtime/util:memory_mapped_file",
+        "@com_googlesource_code_re2//:re2",
     ],
 )
 
diff --git a/runtime/conversation/model_data_processor/fastvlm_data_processor.cc b/runtime/conversation/model_data_processor/fastvlm_data_processor.cc
@@ -14,66 +14,147 @@
 
 #include "runtime/conversation/model_data_processor/fastvlm_data_processor.h"
 
+#include <deque>
 #include <memory>
-#include <optional>
 #include <string>
 #include <utility>
+#include <variant>
 #include <vector>
 
 #include "absl/memory/memory.h"  // from @com_google_absl
 #include "absl/status/status.h"  // from @com_google_absl
 #include "absl/status/statusor.h"  // from @com_google_absl
+#include "absl/strings/string_view.h"  // from @com_google_absl
 #include "nlohmann/json.hpp"  // from @nlohmann_json
-#include "runtime/components/tokenizer.h"
+#include "litert/cc/litert_layout.h"  // from @litert
+#include "runtime/components/preprocessor/image_preprocessor.h"
+#include "runtime/components/preprocessor/stb_image_preprocessor.h"
+#include "runtime/components/prompt_template.h"
 #include "runtime/conversation/io_types.h"
+#include "runtime/conversation/model_data_processor/data_utils.h"
 #include "runtime/conversation/model_data_processor/fastvlm_data_processor_config.h"
-#include "runtime/conversation/model_data_processor/gemma3_data_processor.h"
-#include "runtime/conversation/model_data_processor/gemma3_data_processor_config.h"
 #include "runtime/conversation/model_data_processor/model_data_processor.h"
 #include "runtime/engine/io_types.h"
+#include "runtime/util/memory_mapped_file.h"
 #include "runtime/util/status_macros.h"
+#include "re2/re2.h"  // from @com_googlesource_code_re2
 
 namespace litert::lm {
 
+namespace {
+
+using ::nlohmann::ordered_json;
+
+bool IsImage(absl::string_view part) { return part == "<image_soft_token>"; }
+
+}  // namespace
+
 absl::StatusOr<std::unique_ptr<FastVlmDataProcessor>>
-FastVlmDataProcessor::Create(
-    FastVlmDataProcessorConfig config, std::optional<Preface> preface,
-    const Tokenizer* tokenizer,
-    const std::vector<std::vector<int>>& stop_token_ids,
-    bool enable_constrained_decoding) {
-  Gemma3DataProcessorConfig gemma3_config;
-  gemma3_config.boi_token = config.boi_token;
-  gemma3_config.eoi_token = config.eoi_token;
-  gemma3_config.image_tensor_height = config.image_tensor_height;
-  gemma3_config.image_tensor_width = config.image_tensor_width;
-
-  ASSIGN_OR_RETURN(auto impl, Gemma3DataProcessor::Create(
-                                  gemma3_config, preface, tokenizer,
-                                  stop_token_ids, enable_constrained_decoding));
-  return absl::WrapUnique(new FastVlmDataProcessor(config, std::move(impl)));
+FastVlmDataProcessor::Create(FastVlmDataProcessorConfig config,
+                             const PromptTemplateCapabilities& capabilities) {
+  return absl::WrapUnique(new FastVlmDataProcessor(
+      config, capabilities, std::make_unique<StbImagePreprocessor>()));
+}
+
+absl::StatusOr<ordered_json> FastVlmDataProcessor::MessageToTemplateInput(
+    const ordered_json& message) const {
+  if (message["content"].is_string() && capabilities_.requires_typed_content) {
+    return ordered_json::object(
+        {{"role", message["role"]},
+         {"content", ordered_json::array(
+                         {{{"type", "text"}, {"text", message["content"]}}})}});
+  } else if (message["content"].is_array() && message["content"].size() == 1 &&
+             message["content"][0]["type"] == "text" &&
+             !capabilities_.requires_typed_content) {
+    return ordered_json::object({{"role", message["role"]},
+                                 {"content", message["content"][0]["text"]}});
+  } else {
+    return message;
+  }
+}
+
+absl::StatusOr<ordered_json> FastVlmDataProcessor::FormatTools(
+    const ordered_json& tools) const {
+  return absl::UnimplementedError("FastVLM does not support tool calling.");
 }
 
 absl::StatusOr<std::vector<InputData>>
 FastVlmDataProcessor::ToInputDataVectorImpl(
-    const std::string& rendered_template_prompt,
-    const nlohmann::ordered_json& messages,
+    const std::string& rendered_template_prompt, const ordered_json& messages,
     const FastVlmDataProcessorArguments& args) const {
-  return impl_->ToInputDataVector(rendered_template_prompt, messages,
-                                  Gemma3DataProcessorArguments{});
+  std::vector<InputData> input_data;
+  std::deque<std::unique_ptr<MemoryMappedFile>> image_files;
+
+  for (const auto& message : messages) {
+    if (message.contains("content") && message["content"].is_array()) {
+      for (const auto& item : message["content"]) {
+        if (item.is_string()) {
+          continue;
+        }
+        ASSIGN_OR_RETURN(std::unique_ptr<MemoryMappedFile> mmap_file,
+                         LoadItemData(item));
+        if (item["type"] == "image") {
+          image_files.push_back(std::move(mmap_file));
+        }
+      }
+    }
+  }
+
+  RE2 re_delimiter("(<image_soft_token>)");
+  absl::string_view prompt_view(rendered_template_prompt);
+  const char* start = prompt_view.data();
+  std::string part;
+  ImagePreprocessParameter image_params;
+  image_params.SetTargetDimensions(Dimensions(
+      {1, config_.image_tensor_height, config_.image_tensor_width, 3}));
+
+  while (RE2::FindAndConsume(&prompt_view, re_delimiter, &part)) {
+    absl::string_view text_part(start, prompt_view.data() - part.size());
+    start = prompt_view.data();
+    if (IsImage(part)) {
+      input_data.emplace_back(InputText(std::string(text_part)));
+
+      if (image_files.empty()) {
+        return absl::InvalidArgumentError(
+            "Provided less images than expected in the prompt.");
+      }
+      auto image_file = std::move(image_files.front());
+      image_files.pop_front();
+      ASSIGN_OR_RETURN(auto preprocessed_image,
+                       image_preprocessor_->Preprocess(
+                           InputImage(std::string(
+                               static_cast<const char*>(image_file->data()),
+                               image_file->length())),
+                           image_params));
+      input_data.emplace_back(InputImage(std::move(preprocessed_image)));
+    }
+  }
+
+  if (!image_files.empty()) {
+    return absl::InvalidArgumentError(
+        "Provided more images than expected in the prompt.");
+  }
+
+  if (!prompt_view.empty()) {
+    input_data.push_back(InputText(std::string(prompt_view)));
+  }
+
+  return input_data;
 }
 
 absl::StatusOr<Message> FastVlmDataProcessor::ToMessageImpl(
     const Responses& responses,
     const FastVlmDataProcessorArguments& args) const {
-  return impl_->ToMessage(responses, Gemma3DataProcessorArguments{});
+  absl::string_view response_text = responses.GetTexts()[0];
+  ordered_json content = ordered_json::array(
+      {{{"type", "text"}, {"text", std::string(response_text)}}});
+  return ordered_json::object({{"role", "assistant"}, {"content", content}});
 }
 
 absl::Status FastVlmDataProcessor::CloneStateImpl(
     const TypeSafeModelDataProcessor<FastVlmDataProcessorConfig,
                                      FastVlmDataProcessorArguments>& other) {
-  const FastVlmDataProcessor& other_fastvlm =
-      static_cast<const FastVlmDataProcessor&>(other);
-  return impl_->CloneState(*other_fastvlm.impl_);
+  return absl::OkStatus();
 }
 
 }  // namespace litert::lm
diff --git a/runtime/conversation/model_data_processor/fastvlm_data_processor.h b/runtime/conversation/model_data_processor/fastvlm_data_processor.h
@@ -15,38 +15,24 @@
 #ifndef THIRD_PARTY_ODML_LITERT_LM_RUNTIME_CONVERSATION_MODEL_DATA_PROCESSOR_FASTVLM_DATA_PROCESSOR_H_
 #define THIRD_PARTY_ODML_LITERT_LM_RUNTIME_CONVERSATION_MODEL_DATA_PROCESSOR_FASTVLM_DATA_PROCESSOR_H_
 
-#include <memory>
-#include <optional>
-#include <string>
-#include <vector>
-
-#include "absl/status/status.h"  // from @com_google_absl
-#include "absl/status/statusor.h"  // from @com_google_absl
-#include "absl/strings/string_view.h"  // from @com_google_absl
-#include "nlohmann/json.hpp"  // from @nlohmann_json
-#include "runtime/components/constrained_decoding/constraint.h"
+#include "runtime/components/preprocessor/image_preprocessor.h"
 #include "runtime/components/prompt_template.h"
-#include "runtime/components/tokenizer.h"
 #include "runtime/conversation/io_types.h"
 #include "runtime/conversation/model_data_processor/fastvlm_data_processor_config.h"
-#include "runtime/conversation/model_data_processor/gemma3_data_processor.h"
 #include "runtime/conversation/model_data_processor/model_data_processor.h"
 #include "runtime/engine/io_types.h"
 
 namespace litert::lm {
 
-// FastVlmDataProcessor is a thin wrapper around Gemma3DataProcessor that
-// uses FastVlmDataProcessorConfig.
+// FastVlmDataProcessor is a model data processor for FastVLM models.
 class FastVlmDataProcessor
     : public TypeSafeModelDataProcessor<FastVlmDataProcessorConfig,
                                         FastVlmDataProcessorArguments> {
  public:
   // Creates a FastVlmDataProcessor instance.
   static absl::StatusOr<std::unique_ptr<FastVlmDataProcessor>> Create(
-      FastVlmDataProcessorConfig config, std::optional<Preface> preface,
-      const Tokenizer* tokenizer,
-      const std::vector<std::vector<int>>& stop_token_ids,
-      bool enable_constrained_decoding);
+      FastVlmDataProcessorConfig config,
+      const PromptTemplateCapabilities& capabilities);
 
   // Returns the config of the FastVlmDataProcessor.
   const FastVlmDataProcessorConfig& GetConfig() const override {
@@ -55,46 +41,26 @@ class FastVlmDataProcessor
 
   // Converts a message into the template input for that message.
   absl::StatusOr<nlohmann::ordered_json> MessageToTemplateInput(
-      const nlohmann::ordered_json& message) const override {
-    return impl_->MessageToTemplateInput(message);
-  }
+      const nlohmann::ordered_json& message) const override;
 
   // Formats tool declarations.
   absl::StatusOr<nlohmann::ordered_json> FormatTools(
-      const nlohmann::ordered_json& tools) const override {
-    return impl_->FormatTools(tools);
-  }
-
-  // Creates a constraint from the given tools.
-  absl::StatusOr<std::unique_ptr<Constraint>> CreateConstraint(
-      const nlohmann::ordered_json& tools) const override {
-    return impl_->CreateConstraint(tools);
-  }
+      const nlohmann::ordered_json& tools) const override;
 
   // Returns the start of tool call blocks.
-  absl::string_view CodeFenceStart() const override {
-    return impl_->CodeFenceStart();
-  }
+  absl::string_view CodeFenceStart() const override { return ""; }
 
   // Returns the end of tool call blocks.
-  absl::string_view CodeFenceEnd() const override {
-    return impl_->CodeFenceEnd();
-  }
-
-  absl::StatusOr<SingleTurnTemplateRenderResult> RenderSingleTurnTemplate(
-      std::vector<Message>& history, const Preface& preface,
-      const Message& message, const PromptTemplate& prompt_template,
-      bool current_is_appending_message, bool append_message,
-      std::optional<nlohmann::ordered_json> extra_context) const override {
-    return impl_->RenderSingleTurnTemplate(
-        history, preface, message, prompt_template,
-        current_is_appending_message, append_message, extra_context);
-  }
+  absl::string_view CodeFenceEnd() const override { return ""; }
 
  private:
-  explicit FastVlmDataProcessor(FastVlmDataProcessorConfig config,
-                                std::unique_ptr<Gemma3DataProcessor> impl)
-      : config_(config), impl_(std::move(impl)) {}
+  explicit FastVlmDataProcessor(
+      FastVlmDataProcessorConfig config,
+      const PromptTemplateCapabilities& capabilities,
+      std::unique_ptr<ImagePreprocessor> image_preprocessor)
+      : config_(config),
+        capabilities_(capabilities),
+        image_preprocessor_(std::move(image_preprocessor)) {}
 
   absl::StatusOr<std::vector<InputData>> ToInputDataVectorImpl(
       const std::string& rendered_template_prompt,
@@ -111,7 +77,8 @@ class FastVlmDataProcessor
       override;
 
   FastVlmDataProcessorConfig config_;
-  std::unique_ptr<Gemma3DataProcessor> impl_;
+  PromptTemplateCapabilities capabilities_;
+  std::unique_ptr<ImagePreprocessor> image_preprocessor_;
 };
 
 }  // namespace litert::lm
diff --git a/runtime/conversation/model_data_processor/fastvlm_data_processor_config.h b/runtime/conversation/model_data_processor/fastvlm_data_processor_config.h
@@ -21,11 +21,6 @@ namespace litert::lm {
 
 // Config for FastVlmDataProcessor.
 struct FastVlmDataProcessorConfig {
-  // The string for beginning of image token.
-  std::string boi_token = " <start_of_image>";
-  // The string for end of image token.
-  std::string eoi_token = "<end_of_image>";
-
   int image_tensor_height = 1024;
   int image_tensor_width = 1024;
 };
diff --git a/runtime/conversation/model_data_processor/model_data_processor_factory.cc b/runtime/conversation/model_data_processor/model_data_processor_factory.cc
@@ -271,14 +271,6 @@ absl::StatusOr<DataProcessorConfig> CreateFastVlmDataProcessorConfig(
   }
   FastVlmDataProcessorConfig config;
   proto::FastVlm fast_vlm = model_type.fast_vlm();
-  if (fast_vlm.has_start_of_image_token()) {
-    ASSIGN_OR_RETURN(config.boi_token,
-                     GetTokenString(fast_vlm.start_of_image_token()));
-  }
-  if (fast_vlm.has_end_of_image_token()) {
-    ASSIGN_OR_RETURN(config.eoi_token,
-                     GetTokenString(fast_vlm.end_of_image_token()));
-  }
   const auto& default_fast_vlm = proto::FastVlm::default_instance();
   if (fast_vlm.image_tensor_height() !=
       default_fast_vlm.image_tensor_height()) {
@@ -400,8 +392,7 @@ absl::StatusOr<std::unique_ptr<ModelDataProcessor>> CreateModelDataProcessor(
   } else if (std::holds_alternative<FastVlmDataProcessorConfig>(config)) {
     ABSL_LOG(INFO) << "Creating FastVlmDataProcessor";
     return FastVlmDataProcessor::Create(
-        std::get<FastVlmDataProcessorConfig>(config), preface, tokenizer,
-        stop_token_ids, enable_constrained_decoding);
+        std::get<FastVlmDataProcessorConfig>(config), capabilities);
   } else {
     return absl::InvalidArgumentError("Unsupported data processor config type");
   }
diff --git a/runtime/proto/llm_model_type.proto b/runtime/proto/llm_model_type.proto
@@ -193,13 +193,11 @@ message Gemma4 {
 // FastVLM model.
 message FastVlm {
   // Vision modality.
-  // The start of image token indicates the start of image in the prompt.
-  TokenUnion start_of_image_token = 1;
-  // The end of image token indicates the end of image in the prompt.
-  TokenUnion end_of_image_token = 2;
   // The height of the image tensor that image preprocessor should resize to.
   int32 image_tensor_height = 3;
   // The width of the image tensor that image preprocessor should resize to.
   int32 image_tensor_width = 4;
+
+  reserved 1, 2;
 }
 
diff --git a/runtime/util/model_type_utils.cc b/runtime/util/model_type_utils.cc