From fd0675a4c92fe243905178811b4c5643a157be2c Mon Sep 17 00:00:00 2001
From: Cairn-2001 <Cairn-2001@smoothcurves.nexus>
Date: Sun, 12 Apr 2026 11:36:07 +0000
Subject: [PATCH] fix: prepend text message to content blocks in multimodal
 agent loop
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a user sends a message with image attachments via the upload API,
the agent loop receives both `user_message` (text) and
`user_content_blocks` (images). Previously, when content blocks were
present, only the blocks were pushed to the session — the text message
was silently dropped. The LLM received the images but not the user's
question or context.

This fix inserts the text message (when non-empty) as a
`ContentBlock::Text` at the front of the blocks vector before pushing
to the session, so the LLM sees both the user's text AND any attached
images in a single turn.

Both the non-streaming (`run_agent_loop`) and streaming
(`run_agent_loop_streaming`) agent loop paths are fixed.

Before:
  User: "What color is this?" + [image of blue square]
  LLM receives: [image only, no text]
  Response: "I can't see the image directly"

After:
  User: "What color is this?" + [image of blue square]
  LLM receives: [text: "What color is this?", image: blue square]
  Response: "Blue"

Tested with Qwen 3.5 Plus and Gemini 2.5 Flash via OpenRouter.
Images up to 1.3MB confirmed working through the full pipeline.

Signed-off-by: Cairn-2001 <Cairn-2001@smoothcurves.nexus>
---
 crates/openfang-runtime/src/agent_loop.rs | 26 +++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/crates/openfang-runtime/src/agent_loop.rs b/crates/openfang-runtime/src/agent_loop.rs
index 892afc6ab9..ef1296e128 100644
--- a/crates/openfang-runtime/src/agent_loop.rs
+++ b/crates/openfang-runtime/src/agent_loop.rs
@@ -279,7 +279,18 @@ pub async fn run_agent_loop(
     // Add the user message to session history.
     // When content blocks are provided (e.g. text + image from a channel),
     // use multimodal message format so the LLM receives the image for vision.
-    if let Some(blocks) = user_content_blocks {
+    // A non-empty text message is prepended to the blocks so the LLM sees
+    // both the user's question AND any attached images in a single turn.
+    if let Some(mut blocks) = user_content_blocks {
+        if !user_message.is_empty() {
+            blocks.insert(
+                0,
+                ContentBlock::Text {
+                    text: user_message.to_string(),
+                    provider_metadata: None,
+                },
+            );
+        }
         session.messages.push(Message::user_with_blocks(blocks));
     } else {
         session.messages.push(Message::user(user_message));
@@ -1448,7 +1459,18 @@ pub async fn run_agent_loop_streaming(
     // Add the user message to session history.
     // When content blocks are provided (e.g. text + image from a channel),
     // use multimodal message format so the LLM receives the image for vision.
-    if let Some(blocks) = user_content_blocks {
+    // A non-empty text message is prepended to the blocks so the LLM sees
+    // both the user's question AND any attached images in a single turn.
+    if let Some(mut blocks) = user_content_blocks {
+        if !user_message.is_empty() {
+            blocks.insert(
+                0,
+                ContentBlock::Text {
+                    text: user_message.to_string(),
+                    provider_metadata: None,
+                },
+            );
+        }
         session.messages.push(Message::user_with_blocks(blocks));
     } else {
         session.messages.push(Message::user(user_message));