Skip to content

Commit cd268eb

Browse files
authored
Merge branch 'main' into gemma-4
2 parents 09a3cb1 + c2b3fc7 commit cd268eb

8 files changed

Lines changed: 94 additions & 12 deletions

File tree

.github/workflows/load_test_hosted_inference.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ jobs:
114114
if: ${{ github.event.inputs.environment == 'staging' && github.event.inputs.model_type == 'workflows' && github.event.inputs.test_target == 'aws-lambda-serving' }}
115115
run: |
116116
ROBOFLOW_API_KEY=${{ secrets.LOAD_TEST_STAGING_API_KEY }} python -m inference_cli.main benchmark api-speed -wid workflows-staging-test -wn paul-guerrie -d coco -rps 5 -br 500 -h https://lambda-classification.staging.roboflow.com --legacy-endpoints --yes --output_location test_results.json --max_error_rate 5.0
117-
- name: 🏋️‍♂️ Load test 😎 STAGING 😎 AWS LAMBDA | workflows 🔥🔥🔥🔥
117+
- name: 🏋️‍♂️ Load test 😎 STAGING 😎 SERVERLESS V2 | workflows 🔥🔥🔥🔥
118118
if: ${{ github.event.inputs.environment == 'staging' && github.event.inputs.model_type == 'workflows' && github.event.inputs.test_target == 'serverless-v2' }}
119119
run: |
120120
ROBOFLOW_API_KEY=${{ secrets.LOAD_TEST_STAGING_API_KEY }} python -m inference_cli.main benchmark api-speed -wid workflows-staging-test -wn paul-guerrie -d coco -rps 5 -br 500 -h https://serverless.roboflow.one --legacy-endpoints --yes --output_location test_results.json --max_error_rate 5.0

inference/core/workflows/core_steps/models/foundation/glm_ocr/v1.py

Lines changed: 64 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
from typing import List, Literal, Optional, Type, Union
1+
import json
2+
from typing import Dict, List, Literal, Optional, Type, Union
23

34
from pydantic import ConfigDict, Field, model_validator
45

@@ -17,6 +18,7 @@
1718
)
1819
from inference.core.workflows.execution_engine.entities.types import (
1920
IMAGE_KIND,
21+
LANGUAGE_MODEL_OUTPUT_KIND,
2022
ROBOFLOW_MODEL_ID_KIND,
2123
STRING_KIND,
2224
ImageInputField,
@@ -29,15 +31,28 @@
2931
)
3032
from inference_sdk import InferenceHTTPClient
3133

34+
STRUCTURED_ANSWERING_PROMPT_TEMPLATE = (
35+
"You are supposed to produce responses in JSON wrapped in Markdown markers: "
36+
"```json\nyour-response\n```. Below is a dictionary with keys and values. "
37+
"Each key must be present in your response. Values represent descriptions "
38+
"for JSON fields to be generated. Provide only JSON Markdown in response.\n\n"
39+
"Specification of requirements regarding output fields:\n{output_structure}"
40+
)
41+
3242
TASK_TYPE_TO_PROMPT = {
3343
"text-recognition": "Text Recognition:",
3444
"table-recognition": "Table Recognition:",
3545
"formula-recognition": "Formula Recognition:",
46+
"structured-answering": None,
3647
"custom": None,
3748
}
3849

3950
TaskType = Literal[
40-
"text-recognition", "table-recognition", "formula-recognition", "custom"
51+
"text-recognition",
52+
"table-recognition",
53+
"formula-recognition",
54+
"structured-answering",
55+
"custom",
4156
]
4257

4358
TASKS_METADATA = {
@@ -53,13 +68,18 @@
5368
"name": "Formula Recognition",
5469
"description": "Recognizes mathematical formulas and equations.",
5570
},
71+
"structured-answering": {
72+
"name": "Structured Output",
73+
"description": "Extract values into a JSON document with a user-defined schema.",
74+
},
5675
"custom": {
5776
"name": "Custom Prompt",
5877
"description": "Provide your own prompt for specialized recognition tasks.",
5978
},
6079
}
6180

6281
TASKS_REQUIRING_PROMPT = {"custom"}
82+
TASKS_REQUIRING_OUTPUT_STRUCTURE = {"structured-answering"}
6383

6484
LONG_DESCRIPTION = """
6585
Recognize text in images using GLM-OCR, a vision language model by Zhipu AI specialized
@@ -74,7 +94,9 @@
7494
- **Table Recognition** — Recognizes table structures and content.
7595
7696
You can also select **Custom Prompt** to provide your own prompt for specialized
77-
recognition tasks.
97+
recognition tasks, or **Structured Output** to extract values from the image
98+
into a JSON document with a user-defined schema (pair with the JSON Parser
99+
block to materialize the keys as workflow outputs).
78100
79101
This block pairs well with detection models and DynamicCropBlock to isolate regions of
80102
interest before running OCR. For example, use an object detection model to find labels
@@ -92,6 +114,9 @@ class BlockManifest(WorkflowBlockManifest):
92114
description="Recognition task to perform. Determines the prompt sent to GLM-OCR.",
93115
json_schema_extra={
94116
"values_metadata": TASKS_METADATA,
117+
"recommended_parsers": {
118+
"structured-answering": "roboflow_core/json_parser@v1",
119+
},
95120
"always_visible": True,
96121
},
97122
)
@@ -106,6 +131,20 @@ class BlockManifest(WorkflowBlockManifest):
106131
},
107132
},
108133
)
134+
output_structure: Optional[Dict[str, str]] = Field(
135+
default=None,
136+
description="Dictionary describing the structure of the expected JSON response. "
137+
"Keys are the JSON field names; values describe what the model should put in each field.",
138+
examples=[{"my_key": "description"}, "$inputs.output_structure"],
139+
json_schema_extra={
140+
"relevant_for": {
141+
"task_type": {
142+
"values": TASKS_REQUIRING_OUTPUT_STRUCTURE,
143+
"required": True,
144+
},
145+
},
146+
},
147+
)
109148
max_new_tokens: Optional[int] = Field(
110149
default=None,
111150
description="Maximum number of tokens to generate. If not set, the model default will be used.",
@@ -151,15 +190,24 @@ class BlockManifest(WorkflowBlockManifest):
151190
def validate_prompt(self) -> "BlockManifest":
152191
if self.task_type == "custom" and not self.prompt:
153192
raise ValueError("`prompt` is required when task_type is 'custom'.")
193+
if (
194+
self.task_type in TASKS_REQUIRING_OUTPUT_STRUCTURE
195+
and not self.output_structure
196+
):
197+
raise ValueError(
198+
f"`output_structure` is required when task_type is '{self.task_type}'."
199+
)
154200
return self
155201

156202
@classmethod
157203
def describe_outputs(cls) -> List[OutputDefinition]:
158204
return [
159205
OutputDefinition(
160206
name="parsed_output",
161-
kind=[STRING_KIND],
162-
description="The recognized text from the image.",
207+
kind=[STRING_KIND, LANGUAGE_MODEL_OUTPUT_KIND],
208+
description="The recognized text from the image. For "
209+
"`structured-answering` this is a JSON-in-Markdown document "
210+
"ready to be fed into the JSON Parser block.",
163211
),
164212
]
165213

@@ -172,9 +220,17 @@ def get_execution_engine_compatibility(cls) -> Optional[str]:
172220
return ">=1.3.0,<2.0.0"
173221

174222

175-
def _resolve_prompt(task_type: str, prompt: Optional[str]) -> str:
223+
def _resolve_prompt(
224+
task_type: str,
225+
prompt: Optional[str],
226+
output_structure: Optional[Dict[str, str]],
227+
) -> str:
176228
if task_type == "custom":
177229
return prompt
230+
if task_type == "structured-answering":
231+
return STRUCTURED_ANSWERING_PROMPT_TEMPLATE.format(
232+
output_structure=json.dumps(output_structure, indent=4),
233+
)
178234
return TASK_TYPE_TO_PROMPT[task_type]
179235

180236

@@ -203,9 +259,10 @@ def run(
203259
model_version: str,
204260
task_type: str,
205261
prompt: Optional[str],
262+
output_structure: Optional[Dict[str, str]] = None,
206263
max_new_tokens: Optional[int] = None,
207264
) -> BlockResult:
208-
resolved_prompt = _resolve_prompt(task_type, prompt)
265+
resolved_prompt = _resolve_prompt(task_type, prompt, output_structure)
209266
if self._step_execution_mode == StepExecutionMode.LOCAL:
210267
return self.run_locally(
211268
images=images,

inference/core/workflows/core_steps/models/foundation/openai/v1.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,14 @@ class BlockManifest(WorkflowBlockManifest):
9595
)
9696
openai_model: Union[
9797
Selector(kind=[STRING_KIND]),
98-
Literal["gpt-4o", "gpt-4o-mini", "gpt-5.4", "gpt-5.4-mini", "gpt-5.4-nano"],
98+
Literal[
99+
"gpt-4o",
100+
"gpt-4o-mini",
101+
"gpt-5.4",
102+
"gpt-5.4-mini",
103+
"gpt-5.4-nano",
104+
"gpt-5.5",
105+
],
99106
] = Field(
100107
default="gpt-4o",
101108
description="Model to be used",

inference/core/workflows/core_steps/models/foundation/openai/v2.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,7 @@ class BlockManifest(WorkflowBlockManifest):
173173
"gpt-5.4",
174174
"gpt-5.4-mini",
175175
"gpt-5.4-nano",
176+
"gpt-5.5",
176177
],
177178
] = Field(
178179
default="gpt-4o",

inference/core/workflows/core_steps/models/foundation/openai/v3.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,7 @@ class BlockManifest(WorkflowBlockManifest):
181181
"gpt-5.4",
182182
"gpt-5.4-mini",
183183
"gpt-5.4-nano",
184+
"gpt-5.5",
184185
"o3",
185186
"o4-mini",
186187
],

inference/core/workflows/core_steps/models/foundation/openai/v4.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,11 @@
3737
)
3838

3939
OPENAI_MODELS = [
40+
{
41+
"id": "gpt-5.5",
42+
"name": "GPT-5.5",
43+
"reasoning_effort_values": ["none", "low", "medium", "high", "xhigh"],
44+
},
4045
{
4146
"id": "gpt-5.4",
4247
"name": "GPT-5.4",

inference_models/inference_models/models/common/rle_utils.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,19 @@
88

99

1010
def torch_mask_to_coco_rle(mask: torch.Tensor) -> dict:
11-
np_mask = np.asfortranarray(mask.detach().cpu().numpy().astype(np.uint8))
12-
return mask_utils.encode(np_mask)
11+
# Convert to uncompressed run length encoding in GPU
12+
# coco tools expect fortran order (column-wise)
13+
mask_flat = mask.permute(1, 0).reshape(-1)
14+
values, lengths = torch.unique_consecutive(mask_flat, return_counts=True)
15+
counts = lengths.cpu().tolist()
16+
17+
if values[0] == 1:
18+
counts.insert(0, 0)
19+
20+
h, w = mask.shape
21+
# compress
22+
rle = mask_utils.frPyObjects({"counts": counts, "size": [h, w]}, h, w)
23+
return rle
1324

1425

1526
def coco_rle_masks_to_numpy_mask(instances_masks: InstancesRLEMasks) -> np.ndarray:

tests/inference/hosted_platform_tests/test_workflows.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ def test_get_versions_of_execution_engine(object_detection_service_url: str) ->
129129
# then
130130
response.raise_for_status()
131131
response_data = response.json()
132-
assert response_data["versions"] == ["1.8.0"]
132+
assert response_data["versions"] == ["1.9.0"]
133133

134134

135135
FUNCTION = """

0 commit comments

Comments (0)