1- from typing import List , Literal , Optional , Type , Union
1+ import json
2+ from typing import Dict , List , Literal , Optional , Type , Union
23
34from pydantic import ConfigDict , Field , model_validator
45
1718)
1819from inference .core .workflows .execution_engine .entities .types import (
1920 IMAGE_KIND ,
21+ LANGUAGE_MODEL_OUTPUT_KIND ,
2022 ROBOFLOW_MODEL_ID_KIND ,
2123 STRING_KIND ,
2224 ImageInputField ,
2931)
3032from inference_sdk import InferenceHTTPClient
3133
# Prompt template for the "structured-answering" task type. The model is
# instructed to reply with a JSON document wrapped in Markdown ```json fences
# whose keys match the user-supplied `output_structure` specification; the
# rendered `output_structure` dict is injected via str.format.
# NOTE(review): reconstructed from a mangled paste — whitespace inside the
# literal (around the \n escapes) assumed; confirm against upstream source.
STRUCTURED_ANSWERING_PROMPT_TEMPLATE = (
    "You are supposed to produce responses in JSON wrapped in Markdown markers: "
    "```json\nyour-response\n```. Below is a dictionary with keys and values. "
    "Each key must be present in your response. Values represent descriptions "
    "for JSON fields to be generated. Provide only JSON Markdown in response.\n\n"
    "Specification of requirements regarding output fields:\n{output_structure}"
)
41+
# Static prompt per task type. Entries mapped to None do not have a fixed
# prompt: "custom" uses the user-provided `prompt` field, and
# "structured-answering" builds its prompt dynamically from `output_structure`
# (see `_resolve_prompt`).
TASK_TYPE_TO_PROMPT = {
    "text-recognition": "Text Recognition:",
    "table-recognition": "Table Recognition:",
    "formula-recognition": "Formula Recognition:",
    "structured-answering": None,
    "custom": None,
}
3849
# Closed set of supported task types; must stay in sync with the keys of
# TASK_TYPE_TO_PROMPT and TASKS_METADATA defined in this module.
TaskType = Literal[
    "text-recognition",
    "table-recognition",
    "formula-recognition",
    "structured-answering",
    "custom",
]
4257
4358TASKS_METADATA = {
5368 "name" : "Formula Recognition" ,
5469 "description" : "Recognizes mathematical formulas and equations." ,
5570 },
71+ "structured-answering" : {
72+ "name" : "Structured Output" ,
73+ "description" : "Extract values into a JSON document with a user-defined schema." ,
74+ },
5675 "custom" : {
5776 "name" : "Custom Prompt" ,
5877 "description" : "Provide your own prompt for specialized recognition tasks." ,
5978 },
6079}
6180
# Task types for which the corresponding optional manifest field becomes
# mandatory (enforced by BlockManifest.validate_prompt).
TASKS_REQUIRING_PROMPT = {"custom"}
TASKS_REQUIRING_OUTPUT_STRUCTURE = {"structured-answering"}
6383
6484LONG_DESCRIPTION = """
6585Recognize text in images using GLM-OCR, a vision language model by Zhipu AI specialized
7494- **Table Recognition** — Recognizes table structures and content.
7595
7696You can also select **Custom Prompt** to provide your own prompt for specialized
77- recognition tasks.
97+ recognition tasks, or **Structured Output** to extract values from the image
98+ into a JSON document with a user-defined schema (pair with the JSON Parser
99+ block to materialize the keys as workflow outputs).
78100
79101This block pairs well with detection models and DynamicCropBlock to isolate regions of
80102interest before running OCR. For example, use an object detection model to find labels
@@ -92,6 +114,9 @@ class BlockManifest(WorkflowBlockManifest):
92114 description = "Recognition task to perform. Determines the prompt sent to GLM-OCR." ,
93115 json_schema_extra = {
94116 "values_metadata" : TASKS_METADATA ,
117+ "recommended_parsers" : {
118+ "structured-answering" : "roboflow_core/json_parser@v1" ,
119+ },
95120 "always_visible" : True ,
96121 },
97122 )
@@ -106,6 +131,20 @@ class BlockManifest(WorkflowBlockManifest):
106131 },
107132 },
108133 )
134+ output_structure : Optional [Dict [str , str ]] = Field (
135+ default = None ,
136+ description = "Dictionary describing the structure of the expected JSON response. "
137+ "Keys are the JSON field names; values describe what the model should put in each field." ,
138+ examples = [{"my_key" : "description" }, "$inputs.output_structure" ],
139+ json_schema_extra = {
140+ "relevant_for" : {
141+ "task_type" : {
142+ "values" : TASKS_REQUIRING_OUTPUT_STRUCTURE ,
143+ "required" : True ,
144+ },
145+ },
146+ },
147+ )
109148 max_new_tokens : Optional [int ] = Field (
110149 default = None ,
111150 description = "Maximum number of tokens to generate. If not set, the model default will be used." ,
@@ -151,15 +190,24 @@ class BlockManifest(WorkflowBlockManifest):
151190 def validate_prompt (self ) -> "BlockManifest" :
152191 if self .task_type == "custom" and not self .prompt :
153192 raise ValueError ("`prompt` is required when task_type is 'custom'." )
193+ if (
194+ self .task_type in TASKS_REQUIRING_OUTPUT_STRUCTURE
195+ and not self .output_structure
196+ ):
197+ raise ValueError (
198+ f"`output_structure` is required when task_type is '{ self .task_type } '."
199+ )
154200 return self
155201
156202 @classmethod
157203 def describe_outputs (cls ) -> List [OutputDefinition ]:
158204 return [
159205 OutputDefinition (
160206 name = "parsed_output" ,
161- kind = [STRING_KIND ],
162- description = "The recognized text from the image." ,
207+ kind = [STRING_KIND , LANGUAGE_MODEL_OUTPUT_KIND ],
208+ description = "The recognized text from the image. For "
209+ "`structured-answering` this is a JSON-in-Markdown document "
210+ "ready to be fed into the JSON Parser block." ,
163211 ),
164212 ]
165213
@@ -172,9 +220,17 @@ def get_execution_engine_compatibility(cls) -> Optional[str]:
172220 return ">=1.3.0,<2.0.0"
173221
174222
175- def _resolve_prompt (task_type : str , prompt : Optional [str ]) -> str :
223+ def _resolve_prompt (
224+ task_type : str ,
225+ prompt : Optional [str ],
226+ output_structure : Optional [Dict [str , str ]],
227+ ) -> str :
176228 if task_type == "custom" :
177229 return prompt
230+ if task_type == "structured-answering" :
231+ return STRUCTURED_ANSWERING_PROMPT_TEMPLATE .format (
232+ output_structure = json .dumps (output_structure , indent = 4 ),
233+ )
178234 return TASK_TYPE_TO_PROMPT [task_type ]
179235
180236
@@ -203,9 +259,10 @@ def run(
203259 model_version : str ,
204260 task_type : str ,
205261 prompt : Optional [str ],
262+ output_structure : Optional [Dict [str , str ]] = None ,
206263 max_new_tokens : Optional [int ] = None ,
207264 ) -> BlockResult :
208- resolved_prompt = _resolve_prompt (task_type , prompt )
265+ resolved_prompt = _resolve_prompt (task_type , prompt , output_structure )
209266 if self ._step_execution_mode == StepExecutionMode .LOCAL :
210267 return self .run_locally (
211268 images = images ,
0 commit comments