vlm-run · spillai · May 13, 2026 · May 13, 2026 · gemini-code-assist · May 13, 2026
diff --git a/tests/test_predictions.py b/tests/test_predictions.py
@@ -575,9 +575,7 @@ def mock_get_invoice(prediction_id):
     assert isinstance(response.response, dict)
 
 
-@pytest.mark.parametrize(
-    "service_tier", ["auto", "default", "standard", "flex", "priority"]
-)
+@pytest.mark.parametrize("service_tier", ["default", "flex", "priority"])
 def test_generation_config_service_tier(service_tier):
     """service_tier is accepted and round-trips through model_dump()."""
     config = GenerationConfig(service_tier=service_tier)

diff --git a/vlmrun/cli/_cli/chat.py b/vlmrun/cli/_cli/chat.py
@@ -147,6 +147,7 @@ def __exit__(self, exc_type, exc_val, exc_tb):
 
 # Available toolsets (must match AgentToolset literal values)
 AVAILABLE_TOOLSETS: List[str] = list(AgentToolset.__args__)
+AVAILABLE_SERVICE_TIERS = ["default", "flex", "priority"]
 
 DEFAULT_MODEL = "vlmrun-orion-1:auto"
 
@@ -583,6 +584,11 @@ def chat(
         "-s",
         help="Session UUID for persisting chat history (stateful conversations).",
     ),
+    service_tier: Optional[str] = typer.Option(
+        None,
+        "--service-tier",
+        help="Delivery tier: default, flex (50%% discount), or priority (1.8x premium).",
+    ),
     timeout: Optional[float] = typer.Option(
         None,
         "--timeout",
@@ -647,6 +653,11 @@ def chat(
             console.print(f"  - {m}{default_marker}")
         sys.exit(1)
 
+    if service_tier and service_tier not in AVAILABLE_SERVICE_TIERS:
+        console.print(f"[red]Error:[/] Invalid service tier '{service_tier}'")
+        console.print(f"\nAvailable tiers: {', '.join(AVAILABLE_SERVICE_TIERS)}")
+        sys.exit(1)
+
     # Validate input files if provided
     if input_files:
         for file_path in input_files:
@@ -730,6 +741,8 @@ def chat(
             extra_body["skills"] = agent_skills
         if toolsets:
             extra_body["toolsets"] = toolsets
+        if service_tier:
+            extra_body["service_tier"] = service_tier
         if not extra_body:
             extra_body = None
 

diff --git a/vlmrun/cli/_cli/execute.py b/vlmrun/cli/_cli/execute.py
@@ -3,7 +3,6 @@
 from __future__ import annotations
 
 import json
-import sys
 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from pathlib import Path
@@ -36,6 +35,8 @@
 
 AVAILABLE_TOOLSETS: List[str] = list(AgentToolset.__args__)
 
+AVAILABLE_SERVICE_TIERS = ["default", "flex", "priority"]
+
 DEFAULT_MODEL = "vlmrun-orion-1:auto"
 
 EXECUTE_HELP = """Execute an agent via /v1/agent/execute.
@@ -46,7 +47,7 @@
   vlmrun execute -p "Extract invoice fields" -i doc.pdf --schema schema.json
   vlmrun execute -n my-agent:v1 -i img.jpg --skill ./my-skill
   vlmrun execute -n my-agent:v1 -i img.jpg --skill-id my-skill:latest
-  vlmrun execute -p "Describe" -i photo.jpg --no-wait
+  vlmrun execute -p "Describe" -i photo.jpg --wait
   vlmrun execute -n my-agent:v1 -i a.jpg -i b.pdf -t image -t document
 
 \b
@@ -179,9 +180,7 @@ def _upload_files(
                         file_responses.append(future.result())
                         status.update(f"Uploading {file_path.name}...")
                     except Exception as e:
-                        console.print(
-                            f"[red]Error uploading {file_path.name}:[/] {e}"
-                        )
+                        console.print(f"[red]Error uploading {file_path.name}:[/] {e}")
                         raise typer.Exit(1) from e
 
     return file_responses
@@ -288,9 +287,9 @@ def execute(
         help="Model: vlmrun-orion-1[:lite|fast|auto|pro]",
     ),
     wait: bool = typer.Option(
-        True,
+        False,
         "--wait/--no-wait",
-        help="Wait for execution to complete (default: wait).",
+        help="Wait for execution to complete (default: no-wait).",
     ),
     timeout: int = typer.Option(
         300,
@@ -307,6 +306,11 @@ def execute(
         "--callback-url",
         help="URL to call when execution completes (webhook).",
     ),
+    service_tier: Optional[str] = typer.Option(
+        None,
+        "--service-tier",
+        help="Delivery tier: default, flex (50%% discount), or priority (1.8x premium).",
+    ),
     output_format: Optional[str] = typer.Option(
         None,
         "--format",
@@ -333,13 +337,16 @@ def execute(
         console.print(f"\nAvailable models: {', '.join(AVAILABLE_MODELS)}")
         raise typer.Exit(1)
 
+    if service_tier and service_tier not in AVAILABLE_SERVICE_TIERS:
+        console.print(f"[red]Error:[/] Invalid service tier '{service_tier}'")
+        console.print(f"\nAvailable tiers: {', '.join(AVAILABLE_SERVICE_TIERS)}")
+        raise typer.Exit(1)
+
     if toolsets:
         for ts in toolsets:
             if ts not in AVAILABLE_TOOLSETS:
                 console.print(f"[red]Error:[/] Invalid toolset '{ts}'")
-                console.print(
-                    f"\nAvailable toolsets: {', '.join(AVAILABLE_TOOLSETS)}"
-                )
+                console.print(f"\nAvailable toolsets: {', '.join(AVAILABLE_TOOLSETS)}")
                 raise typer.Exit(1)
 
     if input_files:
@@ -399,8 +406,7 @@ def execute(
         if file_responses:
             inputs = {
                 "files": [
-                    {"type": "input_file", "file_id": fr.id}
-                    for fr in file_responses
+                    {"type": "input_file", "file_id": fr.id} for fr in file_responses
                 ]
             }
 
@@ -409,11 +415,12 @@ def execute(
             prompt=final_prompt,
             json_schema=json_schema,
             skills=skills,
+            service_tier=service_tier,
         )
 
         if not output_json:
             console.print(
-                f"\n  [bold blue]Submitting execution[/bold blue]"
+                "\n  [bold blue]Submitting execution[/bold blue]"
                 + (f" [dim]({name})[/dim]" if name else "")
                 + f" [dim]model={model}[/dim]"
             )

diff --git a/vlmrun/cli/_cli/executions.py b/vlmrun/cli/_cli/executions.py
@@ -3,7 +3,7 @@
 from __future__ import annotations
 
 import json
-from typing import TYPE_CHECKING, List, Optional
+from typing import TYPE_CHECKING, Optional
 
 import typer
 from rich.console import Console, Group
@@ -13,7 +13,6 @@
 
 if TYPE_CHECKING:
     from vlmrun.client import VLMRun
-    from vlmrun.client.types import AgentExecutionResponse
 
 app = typer.Typer(
     help="List and retrieve agent execution results.",
@@ -227,11 +226,7 @@ def get(
             execution = client.executions.get(execution_id)
 
         if output_json:
-            print(
-                json.dumps(
-                    execution.model_dump(mode="json"), indent=2, default=str
-                )
-            )
+            print(json.dumps(execution.model_dump(mode="json"), indent=2, default=str))
             return
 
         console.print("\nExecution Details:\n", style="white")

diff --git a/vlmrun/cli/_cli/generate.py b/vlmrun/cli/_cli/generate.py
@@ -3,13 +3,11 @@
 from __future__ import annotations
 
 import json
-import sys
 import time
 from pathlib import Path
 from typing import Any, Dict, List, Optional
 
 import typer
-from rich import print as rprint
 from rich.console import Console
 from rich.panel import Panel
 from rich.status import Status
@@ -32,6 +30,7 @@
 console = Console()
 
 AVAILABLE_TOOLSETS: List[str] = list(AgentToolset.__args__)
+AVAILABLE_SERVICE_TIERS = ["default", "flex", "priority"]
 
 GENERATE_HELP = """Generate structured predictions for images, documents, videos, and audio.
 
@@ -76,9 +75,7 @@ def _resolve_skills(
 ) -> Optional[List[AgentSkill]]:
     """Build AgentSkill list from --skill dirs or --skill-id references."""
     if skill_dirs and skill_ids:
-        console.print(
-            "[red]Error:[/] --skill and --skill-id are mutually exclusive."
-        )
+        console.print("[red]Error:[/] --skill and --skill-id are mutually exclusive.")
         raise typer.Exit(1)
 
     if skill_dirs:
@@ -175,6 +172,11 @@ def generate(
         "--timeout",
         help="Timeout in seconds when waiting for prediction to complete.",
     ),
+    service_tier: Optional[str] = typer.Option(
+        None,
+        "--service-tier",
+        help="Delivery tier: default, flex (50%% discount), or priority (1.8x premium).",
+    ),
     output_format: Optional[str] = typer.Option(
         None,
         "--format",
@@ -196,6 +198,11 @@ def generate(
             console.print(f"[red]Error:[/] Unsupported output format '{output_format}'")
             raise typer.Exit(1)
 
+    if service_tier and service_tier not in AVAILABLE_SERVICE_TIERS:
+        console.print(f"[red]Error:[/] Invalid service tier '{service_tier}'")
+        console.print(f"\nAvailable tiers: {', '.join(AVAILABLE_SERVICE_TIERS)}")
+        raise typer.Exit(1)
+
     suffix = input_file.suffix.lower()
     if suffix not in SUPPORTED_INPUT_FILETYPES:
         console.print(f"[red]Error:[/] Unsupported file type: {suffix}")
@@ -223,11 +230,12 @@ def generate(
             raise typer.Exit(1) from e
 
     config: Optional[GenerationConfig] = None
-    if any([skills, json_schema, prompt]):
+    if any([skills, json_schema, prompt, service_tier]):
         config = GenerationConfig(
             skills=skills,
             json_schema=json_schema,
             prompt=prompt,
+            service_tier=service_tier,
         )
 
     try:
@@ -242,31 +250,45 @@ def generate(
         start_time = time.time()
 
         if media_type in ("image", "document"):
-            with Status("Processing...", console=console, spinner="dots") if not output_json else _noop_ctx():
+            with (
+                Status("Processing...", console=console, spinner="dots")
+                if not output_json
+                else _noop_ctx()
+            ):
                 response: PredictionResponse = client.document.generate(
                     file=input_file,
                     domain=domain,
                     batch=batch,
                     config=config,
                 )
         elif media_type == "video":
-            with Status("Processing...", console=console, spinner="dots") if not output_json else _noop_ctx():
+            with (
+                Status("Processing...", console=console, spinner="dots")
+                if not output_json
+                else _noop_ctx()
+            ):
                 response = client.video.generate(
                     file=input_file,
                     domain=domain,
                     batch=batch,
                     config=config,
                 )
         elif media_type == "audio":
-            with Status("Processing...", console=console, spinner="dots") if not output_json else _noop_ctx():
+            with (
+                Status("Processing...", console=console, spinner="dots")
+                if not output_json
+                else _noop_ctx()
+            ):
                 response = client.audio.generate(
                     file=input_file,
                     domain=domain,
                     batch=batch,
                     config=config,
                 )
         else:
-            console.print(f"[red]Error:[/] Could not determine media type for {input_file}")
+            console.print(
+                f"[red]Error:[/] Could not determine media type for {input_file}"
+            )
             raise typer.Exit(1)
 
         # If batch mode and wait requested, poll until complete

diff --git a/vlmrun/cli/_cli/predictions.py b/vlmrun/cli/_cli/predictions.py
@@ -132,9 +132,7 @@ def list(
     for prediction in predictions:
         usage = prediction.usage
         st = _status_style(prediction.status)
-        dur = _compute_duration(
-            prediction.created_at, prediction.completed_at, usage
-        )
+        dur = _compute_duration(prediction.created_at, prediction.completed_at, usage)
         rows.append(
             _format_row(
                 prediction.id,

diff --git a/vlmrun/client/types.py b/vlmrun/client/types.py
@@ -239,6 +239,13 @@ class AgentExecutionOrCreationConfig(BaseModel):
         default=None,
         description="List of agent skills to enable for this execution. Skills provide domain-specific expertise and capabilities.",
     )
+    service_tier: Literal["default", "flex", "priority"] | None = Field(
+        default=None,
+        description=(
+            "Delivery tier: 'default' (baseline), 'flex' (50% discount, higher latency), "
+            "or 'priority' (1.8x premium)."
+        ),
+    )
 
     @model_validator(mode="after")
     def validate_config(self):
@@ -597,15 +604,11 @@ class GenerationConfig(BaseModel):
         default=None,
         description="0-indexed page indices to process for document files. If None, all pages are processed.",
     )
-    service_tier: Literal["auto", "default", "standard", "flex", "priority"] | None = Field(
+    service_tier: Literal["default", "flex", "priority"] | None = Field(
         default=None,
         description=(
-            "Delivery tier mirroring OpenAI's service_tier and Vertex AI's "
-            "Gemini Flex/Priority offering. 'standard'/'default' uses baseline "
-            "rates, 'flex' applies a 50% discount with higher latency, "
-            "'priority' applies a 1.8x premium. When omitted (or 'auto'), the "
-            "server default applies (which itself defaults to 'standard'). The "
-            "chosen tier drives BOTH billing AND the actual request routing."
+            "Delivery tier: 'default' (baseline), 'flex' (50% discount, higher latency), "
+            "or 'priority' (1.8x premium)."
         ),
     )
     skills: Optional[List["AgentSkill"]] = Field(