From fd0675a4c92fe243905178811b4c5643a157be2c Mon Sep 17 00:00:00 2001 From: Cairn-2001 Date: Sun, 12 Apr 2026 11:36:07 +0000 Subject: [PATCH] fix: prepend text message to content blocks in multimodal agent loop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a user sends a message with image attachments via the upload API, the agent loop receives both `user_message` (text) and `user_content_blocks` (images). Previously, when content blocks were present, only the blocks were pushed to the session — the text message was silently dropped. The LLM received the images but not the user's question or context. This fix prepends the text message as a ContentBlock::Text into the blocks vector before pushing to the session, so the LLM sees both the user's text AND any attached images in a single turn. Both the non-streaming and streaming agent loop paths are fixed. Before: User: "What color is this?" + [image of blue square] LLM receives: [image only, no text] Response: "I can't see the image directly" After: User: "What color is this?" + [image of blue square] LLM receives: [text: "What color is this?", image: blue square] Response: "Blue" Tested with Qwen 3.5 Plus and Gemini 2.5 Flash via OpenRouter. Images up to 1.3MB confirmed working through the full pipeline. Signed-off-by: Cairn-2001 --- crates/openfang-runtime/src/agent_loop.rs | 26 +++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/crates/openfang-runtime/src/agent_loop.rs b/crates/openfang-runtime/src/agent_loop.rs index 892afc6ab9..ef1296e128 100644 --- a/crates/openfang-runtime/src/agent_loop.rs +++ b/crates/openfang-runtime/src/agent_loop.rs @@ -279,7 +279,18 @@ pub async fn run_agent_loop( // Add the user message to session history. // When content blocks are provided (e.g. text + image from a channel), // use multimodal message format so the LLM receives the image for vision. - if let Some(blocks) = user_content_blocks { + // The text message is prepended to the blocks so the LLM sees both the + // user's question AND any attached images in a single turn. + if let Some(mut blocks) = user_content_blocks { + if !user_message.is_empty() { + blocks.insert( + 0, + ContentBlock::Text { + text: user_message.to_string(), + provider_metadata: None, + }, + ); + } session.messages.push(Message::user_with_blocks(blocks)); } else { session.messages.push(Message::user(user_message)); @@ -1448,7 +1459,18 @@ pub async fn run_agent_loop_streaming( // Add the user message to session history. // When content blocks are provided (e.g. text + image from a channel), // use multimodal message format so the LLM receives the image for vision. - if let Some(blocks) = user_content_blocks { + // The text message is prepended to the blocks so the LLM sees both the + // user's question AND any attached images in a single turn. + if let Some(mut blocks) = user_content_blocks { + if !user_message.is_empty() { + blocks.insert( + 0, + ContentBlock::Text { + text: user_message.to_string(), + provider_metadata: None, + }, + ); + } session.messages.push(Message::user_with_blocks(blocks)); } else { session.messages.push(Message::user(user_message));