Skip to content

Commit 1789e40

Browse files
ai-edge-botcopybara-github
authored andcommitted
Internal changes to FastVLM dataProcessor.
LiteRT-LM-PiperOrigin-RevId: 909686676
1 parent ee1cb0b commit 1789e40

7 files changed

Lines changed: 145 additions & 120 deletions

File tree

runtime/conversation/model_data_processor/BUILD

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -221,21 +221,24 @@ cc_library(
221221
srcs = ["fastvlm_data_processor.cc"],
222222
hdrs = ["fastvlm_data_processor.h"],
223223
deps = [
224+
":data_utils",
224225
":fastvlm_data_processor_config",
225-
":gemma3_data_processor",
226-
":gemma3_data_processor_config",
227226
":model_data_processor",
228227
"@com_google_absl//absl/memory",
229228
"@com_google_absl//absl/status",
230229
"@com_google_absl//absl/status:statusor",
230+
"@com_google_absl//absl/strings",
231231
"@com_google_absl//absl/strings:string_view",
232232
"@nlohmann_json//:json",
233+
"@litert//litert/cc:litert_layout",
233234
"//runtime/components:prompt_template",
234-
"//runtime/components:tokenizer",
235-
"//runtime/components/constrained_decoding:constraint",
235+
"//runtime/components/preprocessor:image_preprocessor",
236+
"//runtime/components/preprocessor:stb_image_preprocessor",
236237
"//runtime/conversation:io_types",
237238
"//runtime/engine:io_types",
238239
"//runtime/util:litert_status_util",
240+
"//runtime/util:memory_mapped_file",
241+
"@com_googlesource_code_re2//:re2",
239242
],
240243
)
241244

runtime/conversation/model_data_processor/fastvlm_data_processor.cc

Lines changed: 108 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -14,66 +14,147 @@
1414

1515
#include "runtime/conversation/model_data_processor/fastvlm_data_processor.h"
1616

17+
#include <deque>
1718
#include <memory>
18-
#include <optional>
1919
#include <string>
2020
#include <utility>
21+
#include <variant>
2122
#include <vector>
2223

2324
#include "absl/memory/memory.h" // from @com_google_absl
2425
#include "absl/status/status.h" // from @com_google_absl
2526
#include "absl/status/statusor.h" // from @com_google_absl
27+
#include "absl/strings/string_view.h" // from @com_google_absl
2628
#include "nlohmann/json.hpp" // from @nlohmann_json
27-
#include "runtime/components/tokenizer.h"
29+
#include "litert/cc/litert_layout.h" // from @litert
30+
#include "runtime/components/preprocessor/image_preprocessor.h"
31+
#include "runtime/components/preprocessor/stb_image_preprocessor.h"
32+
#include "runtime/components/prompt_template.h"
2833
#include "runtime/conversation/io_types.h"
34+
#include "runtime/conversation/model_data_processor/data_utils.h"
2935
#include "runtime/conversation/model_data_processor/fastvlm_data_processor_config.h"
30-
#include "runtime/conversation/model_data_processor/gemma3_data_processor.h"
31-
#include "runtime/conversation/model_data_processor/gemma3_data_processor_config.h"
3236
#include "runtime/conversation/model_data_processor/model_data_processor.h"
3337
#include "runtime/engine/io_types.h"
38+
#include "runtime/util/memory_mapped_file.h"
3439
#include "runtime/util/status_macros.h"
40+
#include "re2/re2.h" // from @com_googlesource_code_re2
3541

3642
namespace litert::lm {
3743

44+
namespace {
45+
46+
using ::nlohmann::ordered_json;
47+
48+
bool IsImage(absl::string_view part) { return part == "<image_soft_token>"; }
49+
50+
} // namespace
51+
3852
absl::StatusOr<std::unique_ptr<FastVlmDataProcessor>>
39-
FastVlmDataProcessor::Create(
40-
FastVlmDataProcessorConfig config, std::optional<Preface> preface,
41-
const Tokenizer* tokenizer,
42-
const std::vector<std::vector<int>>& stop_token_ids,
43-
bool enable_constrained_decoding) {
44-
Gemma3DataProcessorConfig gemma3_config;
45-
gemma3_config.boi_token = config.boi_token;
46-
gemma3_config.eoi_token = config.eoi_token;
47-
gemma3_config.image_tensor_height = config.image_tensor_height;
48-
gemma3_config.image_tensor_width = config.image_tensor_width;
49-
50-
ASSIGN_OR_RETURN(auto impl, Gemma3DataProcessor::Create(
51-
gemma3_config, preface, tokenizer,
52-
stop_token_ids, enable_constrained_decoding));
53-
return absl::WrapUnique(new FastVlmDataProcessor(config, std::move(impl)));
53+
FastVlmDataProcessor::Create(FastVlmDataProcessorConfig config,
54+
const PromptTemplateCapabilities& capabilities) {
55+
return absl::WrapUnique(new FastVlmDataProcessor(
56+
config, capabilities, std::make_unique<StbImagePreprocessor>()));
57+
}
58+
59+
absl::StatusOr<ordered_json> FastVlmDataProcessor::MessageToTemplateInput(
60+
const ordered_json& message) const {
61+
if (message["content"].is_string() && capabilities_.requires_typed_content) {
62+
return ordered_json::object(
63+
{{"role", message["role"]},
64+
{"content", ordered_json::array(
65+
{{{"type", "text"}, {"text", message["content"]}}})}});
66+
} else if (message["content"].is_array() && message["content"].size() == 1 &&
67+
message["content"][0]["type"] == "text" &&
68+
!capabilities_.requires_typed_content) {
69+
return ordered_json::object({{"role", message["role"]},
70+
{"content", message["content"][0]["text"]}});
71+
} else {
72+
return message;
73+
}
74+
}
75+
76+
absl::StatusOr<ordered_json> FastVlmDataProcessor::FormatTools(
77+
const ordered_json& tools) const {
78+
return absl::UnimplementedError("FastVLM does not support tool calling.");
5479
}
5580

5681
absl::StatusOr<std::vector<InputData>>
5782
FastVlmDataProcessor::ToInputDataVectorImpl(
58-
const std::string& rendered_template_prompt,
59-
const nlohmann::ordered_json& messages,
83+
const std::string& rendered_template_prompt, const ordered_json& messages,
6084
const FastVlmDataProcessorArguments& args) const {
61-
return impl_->ToInputDataVector(rendered_template_prompt, messages,
62-
Gemma3DataProcessorArguments{});
85+
std::vector<InputData> input_data;
86+
std::deque<std::unique_ptr<MemoryMappedFile>> image_files;
87+
88+
for (const auto& message : messages) {
89+
if (message.contains("content") && message["content"].is_array()) {
90+
for (const auto& item : message["content"]) {
91+
if (item.is_string()) {
92+
continue;
93+
}
94+
ASSIGN_OR_RETURN(std::unique_ptr<MemoryMappedFile> mmap_file,
95+
LoadItemData(item));
96+
if (item["type"] == "image") {
97+
image_files.push_back(std::move(mmap_file));
98+
}
99+
}
100+
}
101+
}
102+
103+
RE2 re_delimiter("(<image_soft_token>)");
104+
absl::string_view prompt_view(rendered_template_prompt);
105+
const char* start = prompt_view.data();
106+
std::string part;
107+
ImagePreprocessParameter image_params;
108+
image_params.SetTargetDimensions(Dimensions(
109+
{1, config_.image_tensor_height, config_.image_tensor_width, 3}));
110+
111+
while (RE2::FindAndConsume(&prompt_view, re_delimiter, &part)) {
112+
absl::string_view text_part(start, prompt_view.data() - part.size());
113+
start = prompt_view.data();
114+
if (IsImage(part)) {
115+
input_data.emplace_back(InputText(std::string(text_part)));
116+
117+
if (image_files.empty()) {
118+
return absl::InvalidArgumentError(
119+
"Provided less images than expected in the prompt.");
120+
}
121+
auto image_file = std::move(image_files.front());
122+
image_files.pop_front();
123+
ASSIGN_OR_RETURN(auto preprocessed_image,
124+
image_preprocessor_->Preprocess(
125+
InputImage(std::string(
126+
static_cast<const char*>(image_file->data()),
127+
image_file->length())),
128+
image_params));
129+
input_data.emplace_back(InputImage(std::move(preprocessed_image)));
130+
}
131+
}
132+
133+
if (!image_files.empty()) {
134+
return absl::InvalidArgumentError(
135+
"Provided more images than expected in the prompt.");
136+
}
137+
138+
if (!prompt_view.empty()) {
139+
input_data.push_back(InputText(std::string(prompt_view)));
140+
}
141+
142+
return input_data;
63143
}
64144

65145
absl::StatusOr<Message> FastVlmDataProcessor::ToMessageImpl(
66146
const Responses& responses,
67147
const FastVlmDataProcessorArguments& args) const {
68-
return impl_->ToMessage(responses, Gemma3DataProcessorArguments{});
148+
absl::string_view response_text = responses.GetTexts()[0];
149+
ordered_json content = ordered_json::array(
150+
{{{"type", "text"}, {"text", std::string(response_text)}}});
151+
return ordered_json::object({{"role", "assistant"}, {"content", content}});
69152
}
70153

71154
absl::Status FastVlmDataProcessor::CloneStateImpl(
72155
const TypeSafeModelDataProcessor<FastVlmDataProcessorConfig,
73156
FastVlmDataProcessorArguments>& other) {
74-
const FastVlmDataProcessor& other_fastvlm =
75-
static_cast<const FastVlmDataProcessor&>(other);
76-
return impl_->CloneState(*other_fastvlm.impl_);
157+
return absl::OkStatus();
77158
}
78159

79160
} // namespace litert::lm

runtime/conversation/model_data_processor/fastvlm_data_processor.h

Lines changed: 17 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -15,38 +15,24 @@
1515
#ifndef THIRD_PARTY_ODML_LITERT_LM_RUNTIME_CONVERSATION_MODEL_DATA_PROCESSOR_FASTVLM_DATA_PROCESSOR_H_
1616
#define THIRD_PARTY_ODML_LITERT_LM_RUNTIME_CONVERSATION_MODEL_DATA_PROCESSOR_FASTVLM_DATA_PROCESSOR_H_
1717

18-
#include <memory>
19-
#include <optional>
20-
#include <string>
21-
#include <vector>
22-
23-
#include "absl/status/status.h" // from @com_google_absl
24-
#include "absl/status/statusor.h" // from @com_google_absl
25-
#include "absl/strings/string_view.h" // from @com_google_absl
26-
#include "nlohmann/json.hpp" // from @nlohmann_json
27-
#include "runtime/components/constrained_decoding/constraint.h"
18+
#include "runtime/components/preprocessor/image_preprocessor.h"
2819
#include "runtime/components/prompt_template.h"
29-
#include "runtime/components/tokenizer.h"
3020
#include "runtime/conversation/io_types.h"
3121
#include "runtime/conversation/model_data_processor/fastvlm_data_processor_config.h"
32-
#include "runtime/conversation/model_data_processor/gemma3_data_processor.h"
3322
#include "runtime/conversation/model_data_processor/model_data_processor.h"
3423
#include "runtime/engine/io_types.h"
3524

3625
namespace litert::lm {
3726

38-
// FastVlmDataProcessor is a thin wrapper around Gemma3DataProcessor that
39-
// uses FastVlmDataProcessorConfig.
27+
// FastVlmDataProcessor is a model data processor for FastVLM models.
4028
class FastVlmDataProcessor
4129
: public TypeSafeModelDataProcessor<FastVlmDataProcessorConfig,
4230
FastVlmDataProcessorArguments> {
4331
public:
4432
// Creates a FastVlmDataProcessor instance.
4533
static absl::StatusOr<std::unique_ptr<FastVlmDataProcessor>> Create(
46-
FastVlmDataProcessorConfig config, std::optional<Preface> preface,
47-
const Tokenizer* tokenizer,
48-
const std::vector<std::vector<int>>& stop_token_ids,
49-
bool enable_constrained_decoding);
34+
FastVlmDataProcessorConfig config,
35+
const PromptTemplateCapabilities& capabilities);
5036

5137
// Returns the config of the FastVlmDataProcessor.
5238
const FastVlmDataProcessorConfig& GetConfig() const override {
@@ -55,46 +41,26 @@ class FastVlmDataProcessor
5541

5642
// Converts a message into the template input for that message.
5743
absl::StatusOr<nlohmann::ordered_json> MessageToTemplateInput(
58-
const nlohmann::ordered_json& message) const override {
59-
return impl_->MessageToTemplateInput(message);
60-
}
44+
const nlohmann::ordered_json& message) const override;
6145

6246
// Formats tool declarations.
6347
absl::StatusOr<nlohmann::ordered_json> FormatTools(
64-
const nlohmann::ordered_json& tools) const override {
65-
return impl_->FormatTools(tools);
66-
}
67-
68-
// Creates a constraint from the given tools.
69-
absl::StatusOr<std::unique_ptr<Constraint>> CreateConstraint(
70-
const nlohmann::ordered_json& tools) const override {
71-
return impl_->CreateConstraint(tools);
72-
}
48+
const nlohmann::ordered_json& tools) const override;
7349

7450
// Returns the start of tool call blocks.
75-
absl::string_view CodeFenceStart() const override {
76-
return impl_->CodeFenceStart();
77-
}
51+
absl::string_view CodeFenceStart() const override { return ""; }
7852

7953
// Returns the end of tool call blocks.
80-
absl::string_view CodeFenceEnd() const override {
81-
return impl_->CodeFenceEnd();
82-
}
83-
84-
absl::StatusOr<SingleTurnTemplateRenderResult> RenderSingleTurnTemplate(
85-
std::vector<Message>& history, const Preface& preface,
86-
const Message& message, const PromptTemplate& prompt_template,
87-
bool current_is_appending_message, bool append_message,
88-
std::optional<nlohmann::ordered_json> extra_context) const override {
89-
return impl_->RenderSingleTurnTemplate(
90-
history, preface, message, prompt_template,
91-
current_is_appending_message, append_message, extra_context);
92-
}
54+
absl::string_view CodeFenceEnd() const override { return ""; }
9355

9456
private:
95-
explicit FastVlmDataProcessor(FastVlmDataProcessorConfig config,
96-
std::unique_ptr<Gemma3DataProcessor> impl)
97-
: config_(config), impl_(std::move(impl)) {}
57+
explicit FastVlmDataProcessor(
58+
FastVlmDataProcessorConfig config,
59+
const PromptTemplateCapabilities& capabilities,
60+
std::unique_ptr<ImagePreprocessor> image_preprocessor)
61+
: config_(config),
62+
capabilities_(capabilities),
63+
image_preprocessor_(std::move(image_preprocessor)) {}
9864

9965
absl::StatusOr<std::vector<InputData>> ToInputDataVectorImpl(
10066
const std::string& rendered_template_prompt,
@@ -111,7 +77,8 @@ class FastVlmDataProcessor
11177
override;
11278

11379
FastVlmDataProcessorConfig config_;
114-
std::unique_ptr<Gemma3DataProcessor> impl_;
80+
PromptTemplateCapabilities capabilities_;
81+
std::unique_ptr<ImagePreprocessor> image_preprocessor_;
11582
};
11683

11784
} // namespace litert::lm

runtime/conversation/model_data_processor/fastvlm_data_processor_config.h

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,6 @@ namespace litert::lm {
2121

2222
// Config for FastVlmDataProcessor.
2323
struct FastVlmDataProcessorConfig {
24-
// The string for beginning of image token.
25-
std::string boi_token = " <start_of_image>";
26-
// The string for end of image token.
27-
std::string eoi_token = "<end_of_image>";
28-
2924
int image_tensor_height = 1024;
3025
int image_tensor_width = 1024;
3126
};

runtime/conversation/model_data_processor/model_data_processor_factory.cc

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -271,14 +271,6 @@ absl::StatusOr<DataProcessorConfig> CreateFastVlmDataProcessorConfig(
271271
}
272272
FastVlmDataProcessorConfig config;
273273
proto::FastVlm fast_vlm = model_type.fast_vlm();
274-
if (fast_vlm.has_start_of_image_token()) {
275-
ASSIGN_OR_RETURN(config.boi_token,
276-
GetTokenString(fast_vlm.start_of_image_token()));
277-
}
278-
if (fast_vlm.has_end_of_image_token()) {
279-
ASSIGN_OR_RETURN(config.eoi_token,
280-
GetTokenString(fast_vlm.end_of_image_token()));
281-
}
282274
const auto& default_fast_vlm = proto::FastVlm::default_instance();
283275
if (fast_vlm.image_tensor_height() !=
284276
default_fast_vlm.image_tensor_height()) {
@@ -400,8 +392,7 @@ absl::StatusOr<std::unique_ptr<ModelDataProcessor>> CreateModelDataProcessor(
400392
} else if (std::holds_alternative<FastVlmDataProcessorConfig>(config)) {
401393
ABSL_LOG(INFO) << "Creating FastVlmDataProcessor";
402394
return FastVlmDataProcessor::Create(
403-
std::get<FastVlmDataProcessorConfig>(config), preface, tokenizer,
404-
stop_token_ids, enable_constrained_decoding);
395+
std::get<FastVlmDataProcessorConfig>(config), capabilities);
405396
} else {
406397
return absl::InvalidArgumentError("Unsupported data processor config type");
407398
}

runtime/proto/llm_model_type.proto

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -193,13 +193,11 @@ message Gemma4 {
193193
// FastVLM model.
194194
message FastVlm {
195195
// Vision modality.
196-
// The start of image token indicates the start of image in the prompt.
197-
TokenUnion start_of_image_token = 1;
198-
// The end of image token indicates the end of image in the prompt.
199-
TokenUnion end_of_image_token = 2;
200196
// The height of the image tensor that image preprocessor should resize to.
201197
int32 image_tensor_height = 3;
202198
// The width of the image tensor that image preprocessor should resize to.
203199
int32 image_tensor_width = 4;
200+
201+
reserved 1, 2;
204202
}
205203

0 commit comments

Comments
 (0)