1- from typing import List , Literal , Optional , Type , Union
1+ import json
2+ from typing import Dict , List , Literal , Optional , Type , Union
23
34from pydantic import ConfigDict , Field , model_validator
45
1718)
1819from inference .core .workflows .execution_engine .entities .types import (
1920 IMAGE_KIND ,
21+ LANGUAGE_MODEL_OUTPUT_KIND ,
2022 ROBOFLOW_MODEL_ID_KIND ,
2123 STRING_KIND ,
2224 ImageInputField ,
2931)
3032from inference_sdk import InferenceHTTPClient
3133
# Prompt template for the "structured-answering" task type. The model is
# instructed to reply with a JSON document wrapped in Markdown ```json fences
# whose keys match the user-supplied `output_structure` specification; the
# rendered `output_structure` dict is injected via str.format.
# NOTE(review): reconstructed from a mangled paste — whitespace inside the
# literal (around the \n escapes) assumed; confirm against upstream source.
STRUCTURED_ANSWERING_PROMPT_TEMPLATE = (
    "You are supposed to produce responses in JSON wrapped in Markdown markers: "
    "```json\nyour-response\n```. Below is a dictionary with keys and values. "
    "Each key must be present in your response. Values represent descriptions "
    "for JSON fields to be generated. Provide only JSON Markdown in response.\n\n"
    "Specification of requirements regarding output fields:\n{output_structure}"
)
41+
# Static prompt per task type. Entries mapped to None do not have a fixed
# prompt: "custom" uses the user-provided `prompt` field, and
# "structured-answering" builds its prompt dynamically from `output_structure`
# (see `_resolve_prompt`).
TASK_TYPE_TO_PROMPT = {
    "text-recognition": "Text Recognition:",
    "table-recognition": "Table Recognition:",
    "formula-recognition": "Formula Recognition:",
    "structured-answering": None,
    "custom": None,
}
3849
# Closed set of supported task types; must stay in sync with the keys of
# TASK_TYPE_TO_PROMPT and TASKS_METADATA defined in this module.
TaskType = Literal[
    "text-recognition",
    "table-recognition",
    "formula-recognition",
    "structured-answering",
    "custom",
]
4257
4358TASKS_METADATA = {
5368 "name" : "Formula Recognition" ,
5469 "description" : "Recognizes mathematical formulas and equations." ,
5570 },
71+ "structured-answering" : {
72+ "name" : "Structured Output" ,
73+ "description" : "Extract values into a JSON document with a user-defined schema." ,
74+ },
5675 "custom" : {
5776 "name" : "Custom Prompt" ,
5877 "description" : "Provide your own prompt for specialized recognition tasks." ,
5978 },
6079}
6180
# Task types for which the corresponding optional manifest field becomes
# mandatory (enforced by BlockManifest.validate_prompt).
TASKS_REQUIRING_PROMPT = {"custom"}
TASKS_REQUIRING_OUTPUT_STRUCTURE = {"structured-answering"}
6383
6484LONG_DESCRIPTION = """
6585Recognize text in images using GLM-OCR, a vision language model by Zhipu AI specialized
7494- **Table Recognition** — Recognizes table structures and content.
7595
7696You can also select **Custom Prompt** to provide your own prompt for specialized
77- recognition tasks.
97+ recognition tasks, or **Structured Output** to extract values from the image
98+ into a JSON document with a user-defined schema (pair with the JSON Parser
99+ block to materialize the keys as workflow outputs).
78100
79101This block pairs well with detection models and DynamicCropBlock to isolate regions of
80102interest before running OCR. For example, use an object detection model to find labels
@@ -92,6 +114,9 @@ class BlockManifest(WorkflowBlockManifest):
92114 description = "Recognition task to perform. Determines the prompt sent to GLM-OCR." ,
93115 json_schema_extra = {
94116 "values_metadata" : TASKS_METADATA ,
117+ "recommended_parsers" : {
118+ "structured-answering" : "roboflow_core/json_parser@v1" ,
119+ },
95120 "always_visible" : True ,
96121 },
97122 )
@@ -106,6 +131,20 @@ class BlockManifest(WorkflowBlockManifest):
106131 },
107132 },
108133 )
134+ output_structure : Optional [Dict [str , str ]] = Field (
135+ default = None ,
136+ description = "Dictionary describing the structure of the expected JSON response. "
137+ "Keys are the JSON field names; values describe what the model should put in each field." ,
138+ examples = [{"my_key" : "description" }, "$inputs.output_structure" ],
139+ json_schema_extra = {
140+ "relevant_for" : {
141+ "task_type" : {
142+ "values" : TASKS_REQUIRING_OUTPUT_STRUCTURE ,
143+ "required" : True ,
144+ },
145+ },
146+ },
147+ )
109148 max_new_tokens : Optional [int ] = Field (
110149 default = None ,
111150 description = "Maximum number of tokens to generate. If not set, the model default will be used." ,
@@ -151,15 +190,24 @@ class BlockManifest(WorkflowBlockManifest):
151190 def validate_prompt (self ) -> "BlockManifest" :
152191 if self .task_type == "custom" and not self .prompt :
153192 raise ValueError ("`prompt` is required when task_type is 'custom'." )
193+ if (
194+ self .task_type in TASKS_REQUIRING_OUTPUT_STRUCTURE
195+ and not self .output_structure
196+ ):
197+ raise ValueError (
198+ f"`output_structure` is required when task_type is '{ self .task_type } '."
199+ )
154200 return self
155201
156202 @classmethod
157203 def describe_outputs (cls ) -> List [OutputDefinition ]:
158204 return [
159205 OutputDefinition (
160206 name = "parsed_output" ,
161- kind = [STRING_KIND ],
162- description = "The recognized text from the image." ,
207+ kind = [STRING_KIND , LANGUAGE_MODEL_OUTPUT_KIND ],
208+ description = "The recognized text from the image. For "
209+ "`structured-answering` this is a JSON-in-Markdown document "
210+ "ready to be fed into the JSON Parser block." ,
163211 ),
164212 ]
165213
@@ -172,9 +220,17 @@ def get_execution_engine_compatibility(cls) -> Optional[str]:
172220 return ">=1.3.0,<2.0.0"
173221
174222
175- def _resolve_prompt (task_type : str , prompt : Optional [str ]) -> str :
223+ def _resolve_prompt (
224+ task_type : str ,
225+ prompt : Optional [str ],
226+ output_structure : Optional [Dict [str , str ]],
227+ ) -> str :
176228 if task_type == "custom" :
177229 return prompt
230+ if task_type == "structured-answering" :
231+ return STRUCTURED_ANSWERING_PROMPT_TEMPLATE .format (
232+ output_structure = json .dumps (output_structure , indent = 4 ),
233+ )
178234 return TASK_TYPE_TO_PROMPT [task_type ]
179235
180236
@@ -203,9 +259,10 @@ def run(
203259 model_version : str ,
204260 task_type : str ,
205261 prompt : Optional [str ],
262+ output_structure : Optional [Dict [str , str ]] = None ,
206263 max_new_tokens : Optional [int ] = None ,
207264 ) -> BlockResult :
208- resolved_prompt = _resolve_prompt (task_type , prompt )
265+ resolved_prompt = _resolve_prompt (task_type , prompt , output_structure )
209266 if self ._step_execution_mode == StepExecutionMode .LOCAL :
210267 return self .run_locally (
211268 images = images ,
0 commit comments