From 4fb35281f62df4e56c365bf57750ad1745a859e5 Mon Sep 17 00:00:00 2001 From: Sudeep Pillai Date: Tue, 12 May 2026 18:22:15 -0700 Subject: [PATCH 1/2] feat(cli): add --service-tier flag to execute, generate, and chat commands Expose the service_tier option (standard/flex/priority) across all three CLI entry points so users can control delivery tier and pricing from the command line. Also adds service_tier to AgentExecutionOrCreationConfig so the execute path can carry the value through to the API. --- vlmrun/cli/_cli/chat.py | 13 +++++++++++ vlmrun/cli/_cli/execute.py | 33 +++++++++++++++----------- vlmrun/cli/_cli/executions.py | 9 ++------ vlmrun/cli/_cli/generate.py | 42 ++++++++++++++++++++++++++-------- vlmrun/cli/_cli/predictions.py | 4 +--- vlmrun/client/types.py | 31 +++++++++++++++++-------- 6 files changed, 89 insertions(+), 43 deletions(-) diff --git a/vlmrun/cli/_cli/chat.py b/vlmrun/cli/_cli/chat.py index 7534ce3..9225338 100644 --- a/vlmrun/cli/_cli/chat.py +++ b/vlmrun/cli/_cli/chat.py @@ -147,6 +147,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): # Available toolsets (must match AgentToolset literal values) AVAILABLE_TOOLSETS: List[str] = list(AgentToolset.__args__) +AVAILABLE_SERVICE_TIERS = ["auto", "default", "standard", "flex", "priority"] DEFAULT_MODEL = "vlmrun-orion-1:auto" @@ -583,6 +584,11 @@ def chat( "-s", help="Session UUID for persisting chat history (stateful conversations).", ), + service_tier: Optional[str] = typer.Option( + None, + "--service-tier", + help="Delivery tier: standard, flex (50%% discount), or priority (1.8x premium).", + ), timeout: Optional[float] = typer.Option( None, "--timeout", @@ -647,6 +653,11 @@ def chat( console.print(f" - {m}{default_marker}") sys.exit(1) + if service_tier and service_tier not in AVAILABLE_SERVICE_TIERS: + console.print(f"[red]Error:[/] Invalid service tier '{service_tier}'") + console.print(f"\nAvailable tiers: {', '.join(AVAILABLE_SERVICE_TIERS)}") + sys.exit(1) + # Validate input files if provided if input_files: for file_path in input_files: @@ -730,6 +741,8 @@ def chat( extra_body["skills"] = agent_skills if toolsets: extra_body["toolsets"] = toolsets + if service_tier: + extra_body["service_tier"] = service_tier if not extra_body: extra_body = None diff --git a/vlmrun/cli/_cli/execute.py b/vlmrun/cli/_cli/execute.py index a3a0b1f..1546919 100644 --- a/vlmrun/cli/_cli/execute.py +++ b/vlmrun/cli/_cli/execute.py @@ -3,7 +3,6 @@ from __future__ import annotations import json -import sys import time from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path @@ -36,6 +35,8 @@ AVAILABLE_TOOLSETS: List[str] = list(AgentToolset.__args__) +AVAILABLE_SERVICE_TIERS = ["auto", "default", "standard", "flex", "priority"] + DEFAULT_MODEL = "vlmrun-orion-1:auto" EXECUTE_HELP = """Execute an agent via /v1/agent/execute. @@ -46,7 +47,7 @@ vlmrun execute -p "Extract invoice fields" -i doc.pdf --schema schema.json vlmrun execute -n my-agent:v1 -i img.jpg --skill ./my-skill vlmrun execute -n my-agent:v1 -i img.jpg --skill-id my-skill:latest - vlmrun execute -p "Describe" -i photo.jpg --no-wait + vlmrun execute -p "Describe" -i photo.jpg --wait vlmrun execute -n my-agent:v1 -i a.jpg -i b.pdf -t image -t document \b @@ -179,9 +180,7 @@ def _upload_files( file_responses.append(future.result()) status.update(f"Uploading {file_path.name}...") except Exception as e: - console.print( - f"[red]Error uploading {file_path.name}:[/] {e}" - ) + console.print(f"[red]Error uploading {file_path.name}:[/] {e}") raise typer.Exit(1) from e return file_responses @@ -288,9 +287,9 @@ def execute( help="Model: vlmrun-orion-1[:lite|fast|auto|pro]", ), wait: bool = typer.Option( - True, + False, "--wait/--no-wait", - help="Wait for execution to complete (default: wait).", + help="Wait for execution to complete (default: no-wait).", ), timeout: int = typer.Option( 300, @@ -307,6 +306,11 @@ def execute( "--callback-url", help="URL to call when execution completes (webhook).", ), + service_tier: Optional[str] = typer.Option( + None, + "--service-tier", + help="Delivery tier: standard, flex (50%% discount), or priority (1.8x premium).", + ), output_format: Optional[str] = typer.Option( None, "--format", @@ -333,13 +337,16 @@ def execute( console.print(f"\nAvailable models: {', '.join(AVAILABLE_MODELS)}") raise typer.Exit(1) + if service_tier and service_tier not in AVAILABLE_SERVICE_TIERS: + console.print(f"[red]Error:[/] Invalid service tier '{service_tier}'") + console.print(f"\nAvailable tiers: {', '.join(AVAILABLE_SERVICE_TIERS)}") + raise typer.Exit(1) + if toolsets: for ts in toolsets: if ts not in AVAILABLE_TOOLSETS: console.print(f"[red]Error:[/] Invalid toolset '{ts}'") - console.print( - f"\nAvailable toolsets: {', '.join(AVAILABLE_TOOLSETS)}" - ) + console.print(f"\nAvailable toolsets: {', '.join(AVAILABLE_TOOLSETS)}") raise typer.Exit(1) if input_files: @@ -399,8 +406,7 @@ def execute( if file_responses: inputs = { "files": [ - {"type": "input_file", "file_id": fr.id} - for fr in file_responses + {"type": "input_file", "file_id": fr.id} for fr in file_responses ] } @@ -409,11 +415,12 @@ def execute( prompt=final_prompt, json_schema=json_schema, skills=skills, + service_tier=service_tier, ) if not output_json: console.print( - f"\n [bold blue]Submitting execution[/bold blue]" + "\n [bold blue]Submitting execution[/bold blue]" + (f" [dim]({name})[/dim]" if name else "") + f" [dim]model={model}[/dim]" ) diff --git a/vlmrun/cli/_cli/executions.py b/vlmrun/cli/_cli/executions.py index babab57..fadab87 100644 --- a/vlmrun/cli/_cli/executions.py +++ b/vlmrun/cli/_cli/executions.py @@ -3,7 +3,7 @@ from __future__ import annotations import json -from typing import TYPE_CHECKING, List, Optional +from typing import TYPE_CHECKING, Optional import typer from rich.console import Console, Group @@ -13,7 +13,6 @@ if TYPE_CHECKING: from vlmrun.client import VLMRun - from vlmrun.client.types import AgentExecutionResponse app = typer.Typer( help="List and retrieve agent execution results.", @@ -227,11 +226,7 @@ def get( execution = client.executions.get(execution_id) if output_json: - print( - json.dumps( - execution.model_dump(mode="json"), indent=2, default=str - ) - ) + print(json.dumps(execution.model_dump(mode="json"), indent=2, default=str)) return console.print("\nExecution Details:\n", style="white") diff --git a/vlmrun/cli/_cli/generate.py b/vlmrun/cli/_cli/generate.py index 3234a8f..f372dcb 100644 --- a/vlmrun/cli/_cli/generate.py +++ b/vlmrun/cli/_cli/generate.py @@ -3,13 +3,11 @@ from __future__ import annotations import json -import sys import time from pathlib import Path from typing import Any, Dict, List, Optional import typer -from rich import print as rprint from rich.console import Console from rich.panel import Panel from rich.status import Status @@ -32,6 +30,7 @@ console = Console() AVAILABLE_TOOLSETS: List[str] = list(AgentToolset.__args__) +AVAILABLE_SERVICE_TIERS = ["auto", "default", "standard", "flex", "priority"] GENERATE_HELP = """Generate structured predictions for images, documents, videos, and audio. @@ -76,9 +75,7 @@ def _resolve_skills( ) -> Optional[List[AgentSkill]]: """Build AgentSkill list from --skill dirs or --skill-id references.""" if skill_dirs and skill_ids: - console.print( - "[red]Error:[/] --skill and --skill-id are mutually exclusive." - ) + console.print("[red]Error:[/] --skill and --skill-id are mutually exclusive.") raise typer.Exit(1) if skill_dirs: @@ -175,6 +172,11 @@ def generate( "--timeout", help="Timeout in seconds when waiting for prediction to complete.", ), + service_tier: Optional[str] = typer.Option( + None, + "--service-tier", + help="Delivery tier: standard, flex (50%% discount), or priority (1.8x premium).", + ), output_format: Optional[str] = typer.Option( None, "--format", @@ -196,6 +198,11 @@ def generate( console.print(f"[red]Error:[/] Unsupported output format '{output_format}'") raise typer.Exit(1) + if service_tier and service_tier not in AVAILABLE_SERVICE_TIERS: + console.print(f"[red]Error:[/] Invalid service tier '{service_tier}'") + console.print(f"\nAvailable tiers: {', '.join(AVAILABLE_SERVICE_TIERS)}") + raise typer.Exit(1) + suffix = input_file.suffix.lower() if suffix not in SUPPORTED_INPUT_FILETYPES: console.print(f"[red]Error:[/] Unsupported file type: {suffix}") @@ -223,11 +230,12 @@ def generate( raise typer.Exit(1) from e config: Optional[GenerationConfig] = None - if any([skills, json_schema, prompt]): + if any([skills, json_schema, prompt, service_tier]): config = GenerationConfig( skills=skills, json_schema=json_schema, prompt=prompt, + service_tier=service_tier, ) try: @@ -242,7 +250,11 @@ def generate( start_time = time.time() if media_type in ("image", "document"): - with Status("Processing...", console=console, spinner="dots") if not output_json else _noop_ctx(): + with ( + Status("Processing...", console=console, spinner="dots") + if not output_json + else _noop_ctx() + ): response: PredictionResponse = client.document.generate( file=input_file, domain=domain, @@ -250,7 +262,11 @@ def generate( config=config, ) elif media_type == "video": - with Status("Processing...", console=console, spinner="dots") if not output_json else _noop_ctx(): + with ( + Status("Processing...", console=console, spinner="dots") + if not output_json + else _noop_ctx() + ): response = client.video.generate( file=input_file, domain=domain, @@ -258,7 +274,11 @@ def generate( config=config, ) elif media_type == "audio": - with Status("Processing...", console=console, spinner="dots") if not output_json else _noop_ctx(): + with ( + Status("Processing...", console=console, spinner="dots") + if not output_json + else _noop_ctx() + ): response = client.audio.generate( file=input_file, domain=domain, @@ -266,7 +286,9 @@ def generate( config=config, ) else: - console.print(f"[red]Error:[/] Could not determine media type for {input_file}") + console.print( + f"[red]Error:[/] Could not determine media type for {input_file}" + ) raise typer.Exit(1) # If batch mode and wait requested, poll until complete diff --git a/vlmrun/cli/_cli/predictions.py b/vlmrun/cli/_cli/predictions.py index 077c772..df9741f 100644 --- a/vlmrun/cli/_cli/predictions.py +++ b/vlmrun/cli/_cli/predictions.py @@ -132,9 +132,7 @@ def list( for prediction in predictions: usage = prediction.usage st = _status_style(prediction.status) - dur = _compute_duration( - prediction.created_at, prediction.completed_at, usage - ) + dur = _compute_duration(prediction.created_at, prediction.completed_at, usage) rows.append( _format_row( prediction.id, diff --git a/vlmrun/client/types.py b/vlmrun/client/types.py index 2ed02b3..b2e6d0b 100644 --- a/vlmrun/client/types.py +++ b/vlmrun/client/types.py @@ -239,6 +239,15 @@ class AgentExecutionOrCreationConfig(BaseModel): default=None, description="List of agent skills to enable for this execution. Skills provide domain-specific expertise and capabilities.", ) + service_tier: Literal["auto", "default", "standard", "flex", "priority"] | None = ( + Field( + default=None, + description=( + "Delivery tier: 'standard' (baseline), 'flex' (50%% discount, higher latency), " + "or 'priority' (1.8x premium). 'auto'/'default' use the server default (standard)." + ), + ) + ) @model_validator(mode="after") def validate_config(self): @@ -597,16 +606,18 @@ class GenerationConfig(BaseModel): default=None, description="0-indexed page indices to process for document files. If None, all pages are processed.", ) - service_tier: Literal["auto", "default", "standard", "flex", "priority"] | None = Field( - default=None, - description=( - "Delivery tier mirroring OpenAI's service_tier and Vertex AI's " - "Gemini Flex/Priority offering. 'standard'/'default' uses baseline " - "rates, 'flex' applies a 50% discount with higher latency, " - "'priority' applies a 1.8x premium. When omitted (or 'auto'), the " - "server default applies (which itself defaults to 'standard'). The " - "chosen tier drives BOTH billing AND the actual request routing." - ), + service_tier: Literal["auto", "default", "standard", "flex", "priority"] | None = ( + Field( + default=None, + description=( + "Delivery tier mirroring OpenAI's service_tier and Vertex AI's " + "Gemini Flex/Priority offering. 'standard'/'default' uses baseline " + "rates, 'flex' applies a 50% discount with higher latency, " + "'priority' applies a 1.8x premium. When omitted (or 'auto'), the " + "server default applies (which itself defaults to 'standard'). The " + "chosen tier drives BOTH billing AND the actual request routing." + ), + ) ) skills: Optional[List["AgentSkill"]] = Field( default=None, From deebb5bd31ba6f053993a2f65024a27cf30545e9 Mon Sep 17 00:00:00 2001 From: Sudeep Pillai Date: Tue, 12 May 2026 18:24:16 -0700 Subject: [PATCH 2/2] fix: narrow service_tier to default/flex/priority only Remove 'auto' and 'standard' from the accepted service_tier values across GenerationConfig, AgentExecutionOrCreationConfig, CLI flags, and tests to match the currently supported tiers. --- tests/test_predictions.py | 4 +--- vlmrun/cli/_cli/chat.py | 2 +- vlmrun/cli/_cli/execute.py | 2 +- vlmrun/cli/_cli/generate.py | 2 +- vlmrun/client/types.py | 32 ++++++++++++-------------------- 5 files changed, 16 insertions(+), 26 deletions(-) diff --git a/tests/test_predictions.py b/tests/test_predictions.py index 6e54a42..aa61832 100644 --- a/tests/test_predictions.py +++ b/tests/test_predictions.py @@ -575,9 +575,7 @@ def mock_get_invoice(prediction_id): assert isinstance(response.response, dict) -@pytest.mark.parametrize( - "service_tier", ["auto", "default", "standard", "flex", "priority"] -) +@pytest.mark.parametrize("service_tier", ["default", "flex", "priority"]) def test_generation_config_service_tier(service_tier): """service_tier is accepted and round-trips through model_dump().""" config = GenerationConfig(service_tier=service_tier) diff --git a/vlmrun/cli/_cli/chat.py b/vlmrun/cli/_cli/chat.py index 9225338..fee3e4c 100644 --- a/vlmrun/cli/_cli/chat.py +++ b/vlmrun/cli/_cli/chat.py @@ -147,7 +147,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): # Available toolsets (must match AgentToolset literal values) AVAILABLE_TOOLSETS: List[str] = list(AgentToolset.__args__) -AVAILABLE_SERVICE_TIERS = ["auto", "default", "standard", "flex", "priority"] +AVAILABLE_SERVICE_TIERS = ["default", "flex", "priority"] DEFAULT_MODEL = "vlmrun-orion-1:auto" diff --git a/vlmrun/cli/_cli/execute.py b/vlmrun/cli/_cli/execute.py index 1546919..cdb5688 100644 --- a/vlmrun/cli/_cli/execute.py +++ b/vlmrun/cli/_cli/execute.py @@ -35,7 +35,7 @@ AVAILABLE_TOOLSETS: List[str] = list(AgentToolset.__args__) -AVAILABLE_SERVICE_TIERS = ["auto", "default", "standard", "flex", "priority"] +AVAILABLE_SERVICE_TIERS = ["default", "flex", "priority"] DEFAULT_MODEL = "vlmrun-orion-1:auto" diff --git a/vlmrun/cli/_cli/generate.py b/vlmrun/cli/_cli/generate.py index f372dcb..85cf767 100644 --- a/vlmrun/cli/_cli/generate.py +++ b/vlmrun/cli/_cli/generate.py @@ -30,7 +30,7 @@ console = Console() AVAILABLE_TOOLSETS: List[str] = list(AgentToolset.__args__) -AVAILABLE_SERVICE_TIERS = ["auto", "default", "standard", "flex", "priority"] +AVAILABLE_SERVICE_TIERS = ["default", "flex", "priority"] GENERATE_HELP = """Generate structured predictions for images, documents, videos, and audio. diff --git a/vlmrun/client/types.py b/vlmrun/client/types.py index b2e6d0b..d7ee05b 100644 --- a/vlmrun/client/types.py +++ b/vlmrun/client/types.py @@ -239,14 +239,12 @@ class AgentExecutionOrCreationConfig(BaseModel): default=None, description="List of agent skills to enable for this execution. Skills provide domain-specific expertise and capabilities.", ) - service_tier: Literal["auto", "default", "standard", "flex", "priority"] | None = ( - Field( - default=None, - description=( - "Delivery tier: 'standard' (baseline), 'flex' (50%% discount, higher latency), " - "or 'priority' (1.8x premium). 'auto'/'default' use the server default (standard)." - ), - ) + service_tier: Literal["default", "flex", "priority"] | None = Field( + default=None, + description=( + "Delivery tier: 'default' (baseline), 'flex' (50%% discount, higher latency), " + "or 'priority' (1.8x premium)." + ), ) @model_validator(mode="after") @@ -606,18 +604,12 @@ class GenerationConfig(BaseModel): default=None, description="0-indexed page indices to process for document files. If None, all pages are processed.", ) - service_tier: Literal["auto", "default", "standard", "flex", "priority"] | None = ( - Field( - default=None, - description=( - "Delivery tier mirroring OpenAI's service_tier and Vertex AI's " - "Gemini Flex/Priority offering. 'standard'/'default' uses baseline " - "rates, 'flex' applies a 50% discount with higher latency, " - "'priority' applies a 1.8x premium. When omitted (or 'auto'), the " - "server default applies (which itself defaults to 'standard'). The " - "chosen tier drives BOTH billing AND the actual request routing." - ), - ) + service_tier: Literal["default", "flex", "priority"] | None = Field( + default=None, + description=( + "Delivery tier: 'default' (baseline), 'flex' (50%% discount, higher latency), " + "or 'priority' (1.8x premium)." + ), ) skills: Optional[List["AgentSkill"]] = Field( default=None,