Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
d3c5344
squash commit
leeclemnet Apr 9, 2026
216b373
Show confidence hint 0.4 to match default
leeclemnet Apr 13, 2026
b67759d
dev dockerfiles overlay inference_models source build
leeclemnet Apr 13, 2026
2a39f9e
confidence filter debug logging
leeclemnet Apr 14, 2026
a6d348f
confidence filter lazy imports not needed
leeclemnet Apr 14, 2026
85af4a1
drop unneeded comment
leeclemnet Apr 14, 2026
27133cb
kwargs.get(recommended_parameters) -> Optional[RecommendedParameters]…
leeclemnet Apr 14, 2026
c669544
per-model default confidence, move to post_processing, inline per-ima…
leeclemnet Apr 14, 2026
fdc88e0
concrete class post_process confidence optional default None
leeclemnet Apr 14, 2026
0319e63
cleanup
leeclemnet Apr 14, 2026
6090871
simplify ConfidenceFilter and avoid double filtering
leeclemnet Apr 15, 2026
655fd9b
fix OOB bugs in yolov5/7 and rfdetr
leeclemnet Apr 15, 2026
b30215b
confidencefilter readability
leeclemnet Apr 15, 2026
f289b53
undo no-op diffs
leeclemnet Apr 15, 2026
8564ea6
revert workflow UI change for now
leeclemnet Apr 15, 2026
2f0b722
deeplabv3plus: drop double construction of SemanticSegmentationResult
leeclemnet Apr 15, 2026
7435547
Explicit 'best', 'default' or float confidence - easy opt-out
leeclemnet Apr 16, 2026
e72bb14
legacy inference ignore string valued confidence
leeclemnet Apr 16, 2026
27ca8ff
move Confidence to entities, validation throws ModelInputError, hint …
leeclemnet Apr 17, 2026
9abf606
use pydantic native ge/le validation instead of annotated_types
leeclemnet Apr 17, 2026
4b818da
keep 0.5 request default, default to 'default' in SDK instead of 'best'
leeclemnet Apr 17, 2026
5f8a36c
scalar threshold fast path
leeclemnet Apr 17, 2026
576ff5b
fix yolov10
leeclemnet Apr 17, 2026
cb5351b
update inference_sdk
leeclemnet Apr 17, 2026
8f5a853
drop pydantic validation on confidence
leeclemnet Apr 17, 2026
396952d
bump version to 0.25.0, update changelog, revert Dockerfile and CI ch…
leeclemnet Apr 17, 2026
0e87734
bump inference-models requirements to 0.25.0
leeclemnet Apr 17, 2026
87b17c1
fix roboflow-instant-hf scalar confidence path
leeclemnet Apr 17, 2026
0174623
bump inference-models version 0.25.1
leeclemnet Apr 17, 2026
00782d0
requirements back to 0.24.4
leeclemnet Apr 17, 2026
08fd135
tmp: force inference_models source build for dev and integration tests
leeclemnet Apr 17, 2026
9ecf1b3
add integration tests for per-class pathways
leeclemnet Apr 17, 2026
645ac89
0.25.1rc1
leeclemnet Apr 17, 2026
c15bf41
bump inference-models~=0.25.1rc1
leeclemnet Apr 18, 2026
e3b5be7
revert yolov7 IS to keep existing NMS behaviour including objectness col
leeclemnet Apr 20, 2026
efbcd0a
[tmp] install inference_models from git branch
leeclemnet Apr 20, 2026
e86cc33
0.25.1rc2
leeclemnet Apr 20, 2026
d4a51e6
requirements 0.25.1rc2
leeclemnet Apr 20, 2026
f9f14ef
try moving yolov8 test to avoid disturbing cache determinism tests
leeclemnet Apr 20, 2026
2a99a78
try removing added yolov8 trt test case
leeclemnet Apr 20, 2026
edfa5a3
0.25.1 for pypi release
leeclemnet Apr 20, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 13 additions & 6 deletions inference/core/entities/requests/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from pydantic import BaseModel, ConfigDict, Field, validator

from inference.core.entities.common import ApiKey, ModelID, ModelType
from inference_sdk.http.entities import Confidence


class BaseRequest(BaseModel):
Expand Down Expand Up @@ -145,10 +146,13 @@ class ObjectDetectionInferenceRequest(CVInferenceRequest):
examples=[["class-1", "class-2", "class-n"]],
description="If provided, only predictions for the listed classes will be returned",
)
confidence: Optional[float] = Field(
confidence: Confidence = Field(
default=0.4,
examples=[0.5],
description="The confidence threshold used to filter out predictions",
examples=[0.5, "best", "default"],
description=(
'Confidence threshold. "best" uses model-eval thresholds, '
'"default" uses the model built-in, or pass a float.'
),
)
fix_batch_size: Optional[bool] = Field(
default=False,
Expand Down Expand Up @@ -245,10 +249,13 @@ def __init__(self, **kwargs):
kwargs["model_type"] = "classification"
super().__init__(**kwargs)

confidence: Optional[float] = Field(
confidence: Confidence = Field(
default=0.4,
examples=[0.5],
description="The confidence threshold used to filter out predictions",
examples=[0.5, "best", "default"],
description=(
'Confidence threshold. "best" uses model-eval thresholds, '
'"default" uses the model built-in, or pass a float.'
),
)
visualization_stroke_width: Optional[int] = Field(
default=1,
Expand Down
11 changes: 10 additions & 1 deletion inference/core/models/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,16 @@ def infer_from_request(
is also included in the response.
"""
t1 = perf_counter()
responses = self.infer(**request.dict(), return_image_dims=False)
kwargs = request.dict()
confidence = kwargs.get("confidence")
if isinstance(confidence, str):
logger.warning(
"Legacy inference does not support confidence=%r, "
"using model default",
confidence,
)
kwargs.pop("confidence")
responses = self.infer(**kwargs, return_image_dims=False)
for response in responses:
response.time = perf_counter() - t1
logger.debug(f"model infer time: {response.time * 1000.0} ms")
Expand Down
55 changes: 33 additions & 22 deletions inference/core/models/inference_models_adapters.py
Original file line number Diff line number Diff line change
Expand Up @@ -677,25 +677,29 @@ def postprocess(
List[ClassificationInferenceResponse],
]:
mapped_kwargs = self.map_inference_kwargs(kwargs)
post_processed_predictions = self._model.post_process(
predictions, **mapped_kwargs
)
if isinstance(post_processed_predictions, list):
# multi-label classification
return prepare_multi_label_classification_response(
post_processed_predictions,
image_sizes=returned_metadata,
class_names=self.class_names,
confidence_threshold=kwargs.get("confidence", 0.5),
if isinstance(self._model, MultiLabelClassificationModel):
post_processed_predictions = self._model.post_process(
predictions, **mapped_kwargs
)
else:
# single-label classification
return prepare_classification_response(
return prepare_multi_label_classification_response(
post_processed_predictions,
image_sizes=returned_metadata,
class_names=self.class_names,
confidence_threshold=kwargs.get("confidence", 0.5),
)
# Single-label classification: top-1 always wins regardless of
# confidence, so per-class refinement isn't meaningful here. The base
# class deliberately opts out of recommendedParameters entirely. The
# response builder still uses kwargs.get("confidence", 0.5) for the
# cutoff that decides which alternative classes show up.
post_processed_predictions = self._model.post_process(
predictions, **mapped_kwargs
)
return prepare_classification_response(
post_processed_predictions,
image_sizes=returned_metadata,
class_names=self.class_names,
confidence_threshold=kwargs.get("confidence") or 0.5,
)

def clear_cache(self, delete_from_disk: bool = True) -> None:
"""Clears any cache if necessary. TODO: Implement this to delete the cache from the experimental model.
Expand Down Expand Up @@ -747,20 +751,27 @@ def prepare_multi_label_classification_response(
post_processed_predictions: List[MultiLabelClassificationPrediction],
image_sizes: List[Tuple[int, int]],
class_names: List[str],
confidence_threshold: float,
) -> List[MultiLabelClassificationInferenceResponse]:
"""Build the API response from a model's post-processed predictions.

`prediction.class_ids` is the authoritative list of "passed" classes —
the model's `post_process` already applied the
full priority chain (user → per-class → global → default), so the
response builder doesn't re-threshold here. The full per-class score
vector is still emitted in `image_predictions_dict` for UI display.
"""
results = []
for prediction, image_size in zip(post_processed_predictions, image_sizes):
image_predictions_dict = dict()
predicted_classes = []
for class_id, confidence in enumerate(prediction.confidence.cpu().tolist()):
cls_name = class_names[class_id]
image_predictions_dict[cls_name] = {
image_predictions_dict = {
class_names[class_id]: {
"confidence": confidence,
"class_id": class_id,
}
if confidence > confidence_threshold:
predicted_classes.append(cls_name)
for class_id, confidence in enumerate(prediction.confidence.cpu().tolist())
}
predicted_classes = [
class_names[class_id] for class_id in prediction.class_ids.tolist()
]
results.append(
MultiLabelClassificationInferenceResponse(
predictions=image_predictions_dict,
Expand Down
19 changes: 19 additions & 0 deletions inference_models/docs/changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,24 @@
# Changelog

## `0.25.1`

### Fixed

- Fix bug in roboflow_instant_hf confidence filter.

---

## `0.25.0`

### Added

- `post_process(...)` on object detection, instance segmentation, keypoint detection, classification, and semantic
segmentation models now accepts `confidence` as `"best"` (use per-class or global thresholds from
`RecommendedParameters` when available), `"default"` (model's built-in default), or a float override. Shared NMS
helpers accept a per-class `torch.Tensor` for single-pass per-class filtering.

---

## `0.24.4`

### Changed
Expand Down
2 changes: 1 addition & 1 deletion inference_models/inference_models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
if os.environ.get("TOKENIZERS_PARALLELISM") is None:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from inference_models.entities import ColorFormat
from inference_models.entities import ColorFormat, Confidence
from inference_models.model_pipelines.auto_loaders.core import AutoModelPipeline
from inference_models.models.auto_loaders.core import AutoModel
from inference_models.models.auto_loaders.entities import (
Expand Down
3 changes: 2 additions & 1 deletion inference_models/inference_models/entities.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from collections import namedtuple
from typing import Literal
from typing import Literal, Union

ImageDimensions = namedtuple("ImageDimensions", ["height", "width"])
ColorFormat = Literal["rgb", "bgr"]
Confidence = Union[float, Literal["best", "default"]]
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,10 @@
TaskType,
)
from inference_models.utils.file_system import dump_json, read_json
from inference_models.weights_providers.entities import ModelDependency
from inference_models.weights_providers.entities import (
ModelDependency,
RecommendedParameters,
)


class AutoResolutionCacheEntry(BaseModel):
Expand All @@ -30,6 +33,7 @@ class AutoResolutionCacheEntry(BaseModel):
model_dependencies: Optional[List[ModelDependency]] = Field(default=None)
created_at: datetime
model_features: Optional[dict] = Field(default=None)
recommended_parameters: Optional[RecommendedParameters] = Field(default=None)


class AutoResolutionCache(ABC):
Expand Down
26 changes: 26 additions & 0 deletions inference_models/inference_models/models/auto_loaders/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@
ModelDependency,
ModelPackageMetadata,
Quantization,
RecommendedParameters,
)

MODEL_TYPES_TO_LOAD_FROM_CHECKPOINT = {
Expand Down Expand Up @@ -926,6 +927,7 @@ def model_directory_pointer(model_dir: str) -> None:
model_dependencies=model_metadata.model_dependencies,
model_dependencies_instances=model_dependencies_instances,
model_dependencies_directories=model_dependencies_directories,
recommended_parameters=model_metadata.recommended_parameters,
max_package_loading_attempts=max_package_loading_attempts,
model_download_file_lock_acquire_timeout=model_download_file_lock_acquire_timeout,
verify_hash_while_download=verify_hash_while_download,
Expand Down Expand Up @@ -1078,6 +1080,10 @@ def attempt_loading_model_with_auto_load_cache(
package_id=cache_entry.model_package_id,
)
model_init_kwargs[MODEL_DEPENDENCIES_KEY] = model_dependencies_instances
# Cache stores the already-resolved (package-vs-model) value written
# in initialize_model — no need to re-run resolve_recommended_parameters.
if cache_entry.recommended_parameters is not None:
model_init_kwargs["recommended_parameters"] = cache_entry.recommended_parameters
model = model_class.from_pretrained(
model_package_cache_dir, **model_init_kwargs
)
Expand Down Expand Up @@ -1113,6 +1119,7 @@ def attempt_loading_matching_model_packages(
model_dependencies: Optional[List[ModelDependency]],
model_dependencies_instances: Dict[str, AnyModel],
model_dependencies_directories: Dict[str, str],
recommended_parameters: Optional[RecommendedParameters] = None,
max_package_loading_attempts: Optional[int] = None,
model_download_file_lock_acquire_timeout: int = FILE_LOCK_ACQUIRE_TIMEOUT,
verbose: bool = True,
Expand Down Expand Up @@ -1153,6 +1160,7 @@ def attempt_loading_matching_model_packages(
model_dependencies=model_dependencies,
model_dependencies_instances=model_dependencies_instances,
model_dependencies_directories=model_dependencies_directories,
recommended_parameters=recommended_parameters,
verify_hash_while_download=verify_hash_while_download,
download_files_without_hash=download_files_without_hash,
on_file_created=partial(
Expand Down Expand Up @@ -1218,6 +1226,7 @@ def initialize_model(
model_dependencies: Optional[List[ModelDependency]],
model_dependencies_instances: Dict[str, AnyModel],
model_dependencies_directories: Dict[str, str],
recommended_parameters: Optional[RecommendedParameters] = None,
model_download_file_lock_acquire_timeout: int = FILE_LOCK_ACQUIRE_TIMEOUT,
verify_hash_while_download: bool = True,
download_files_without_hash: bool = False,
Expand Down Expand Up @@ -1307,6 +1316,12 @@ def initialize_model(
)
resolved_files.update(dependencies_resolved_files)
model_init_kwargs[MODEL_DEPENDENCIES_KEY] = model_dependencies_instances
resolved_recommended_parameters = resolve_recommended_parameters(
package_level=model_package.recommended_parameters,
model_level=recommended_parameters,
)
if resolved_recommended_parameters is not None:
model_init_kwargs["recommended_parameters"] = resolved_recommended_parameters
model = model_class.from_pretrained(model_package_cache_dir, **model_init_kwargs)
dump_auto_resolution_cache(
use_auto_resolution_cache=use_auto_resolution_cache,
Expand All @@ -1320,6 +1335,7 @@ def initialize_model(
resolved_files=resolved_files,
model_dependencies=model_dependencies,
model_features=model_package.model_features,
recommended_parameters=resolved_recommended_parameters,
)
return model, model_package_cache_dir

Expand Down Expand Up @@ -1484,6 +1500,7 @@ def dump_auto_resolution_cache(
resolved_files: Set[str],
model_dependencies: Optional[List[ModelDependency]],
model_features: Optional[dict],
recommended_parameters: Optional[RecommendedParameters] = None,
) -> None:
if not use_auto_resolution_cache:
return None
Expand All @@ -1497,6 +1514,7 @@ def dump_auto_resolution_cache(
created_at=datetime.now(),
model_dependencies=model_dependencies,
model_features=model_features,
recommended_parameters=recommended_parameters,
)
auto_resolution_cache.register(
auto_negotiation_hash=auto_negotiation_hash, cache_entry=cache_content
Expand Down Expand Up @@ -1812,3 +1830,11 @@ def load_class_from_path(module_path: str, class_name: str) -> AnyModel:
help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror",
)
return getattr(module, class_name)


def resolve_recommended_parameters(
    package_level: Optional[RecommendedParameters],
    model_level: Optional[RecommendedParameters],
) -> Optional[RecommendedParameters]:
    """Pick the effective recommended parameters for a model.

    The package-level value wins whenever it is present; otherwise fall back
    to the model-level value (which may itself be ``None``).
    """
    if package_level is not None:
        return package_level
    return model_level
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ class ClassificationPrediction:

class ClassificationModel(ABC, Generic[PreprocessedInputs, RawPrediction]):

# Single-label classification deliberately opts out of recommendedParameters.
# Top-1 always wins regardless of confidence, so per-class refinement isn't
# a meaningful semantic for this task type. (Multi-label classification opts
# in below — that's where per-class thresholds actually filter the result.)

@classmethod
@abstractmethod
def from_pretrained(
Expand Down
Loading
Loading