11import pandas as pd
22import numpy as np
3- import re
43from sklearn .model_selection import train_test_split
54from sklearn .preprocessing import LabelEncoder , StandardScaler
65from typing import Tuple , List
76
87# Feature-engine for robust feature selection
98from feature_engine .selection import DropConstantFeatures , DropDuplicateFeatures
109
10+ # Import shared utilities
11+ from .utils import (
12+ detect_problem_type ,
13+ is_id_column ,
14+ is_high_cardinality_categorical ,
15+ )
16+
1117
1218class AutoPreprocessor :
1319 """Automatic data preprocessing for AutoML"""
1420
# Name patterns that flag a column as an identifier. Each entry is a raw
# regex string applied (via re.search) to the lowercased, stripped column
# name, so matching is effectively case-insensitive.
ID_PATTERNS = [
    r'^id$',             # exact "id"
    r'_id$',             # suffix: "..._id"
    r'^id_',             # prefix: "id_..."
    r'_id_',             # infix: "..._id_..."
    r'^uuid$',
    r'^guid$',
    r'order.*id',        # domain-specific "<entity>...id" names
    r'customer.*id',
    r'user.*id',
    r'transaction.*id',
    r'product.*id',
    r'session.*id',
    r'^index$',          # row-index style columns
    r'^row.*num',
    r'^serial',
    r'^record.*id',
]
34-
3521 def __init__ (self , target_column : str ):
3622 self .target_column = target_column
3723 self .label_encoders = {}
@@ -41,65 +27,6 @@ def __init__(self, target_column: str):
4127 self .categorical_columns = []
4228 self .dropped_columns = [] # Track dropped columns for reporting
4329
def detect_id_column(self, col_name: str, series: pd.Series) -> bool:
    """
    Detect if a column is likely an ID/identifier column.

    Uses both name patterns (``self.ID_PATTERNS``, matched against the
    lowercased column name) and data characteristics: all-unique strictly
    sequential integers, or near-unique strings with a consistent
    alphanumeric/dash/underscore format.

    Args:
        col_name: Name of the column being inspected.
        series: The column's values.

    Returns:
        True if the column looks like an identifier, False otherwise.
    """
    col_lower = col_name.lower().strip()

    # Name-based detection: any configured pattern match wins.
    for pattern in self.ID_PATTERNS:
        if re.search(pattern, col_lower):
            return True

    n_total = len(series)
    # Guard: an empty column cannot be classified by its data, and would
    # otherwise cause a division by zero in the ratio check below.
    if n_total == 0:
        return False

    # Numeric columns: all-unique, strictly sequential integers are almost
    # certainly a synthetic row ID. is_integer_dtype covers every integer
    # width (int8..int64, unsigned, nullable), unlike the previous
    # hard-coded ['int64', 'int32', 'int'] dtype-name check.
    if pd.api.types.is_numeric_dtype(series):
        if series.nunique() == n_total and pd.api.types.is_integer_dtype(series):
            sorted_vals = series.sort_values()
            is_sequential = (sorted_vals.diff().dropna() == 1).all()
            if is_sequential:
                return True

    # String columns: near-unique values (>95%) with a consistent
    # code-like format (alphanumeric, '-' or '_') look like IDs.
    if series.dtype == 'object':
        if series.nunique() / n_total > 0.95:
            # IDs usually share one format; sample to keep this cheap.
            sample = series.dropna().head(100)
            if sample.apply(
                lambda x: bool(re.match(r'^[A-Za-z0-9\-_]+$', str(x)))
            ).mean() > 0.9:
                return True

    return False
84-
def detect_constant_column(self, series: pd.Series) -> bool:
    """Return True when the column holds at most one distinct value.

    Note: nunique() ignores NaN, so an all-NaN (or empty) column also
    counts as constant.
    """
    distinct = series.nunique()
    return distinct < 2
88-
def detect_high_cardinality_categorical(self, series: pd.Series, threshold: float = 0.5) -> bool:
    """
    Detect categorical columns with too many unique values.
    These often don't generalize well and can cause overfitting.

    Args:
        series: The column's values. Only plain object (string) dtype
            is considered categorical here.
        threshold: Fraction of unique values above which the column is
            flagged as too high cardinality (default 0.5).

    Returns:
        True if the column is an object column whose unique-value ratio
        exceeds ``threshold``; False otherwise (including empty columns).
    """
    if series.dtype != 'object':
        return False

    n_total = len(series)
    # Guard: an empty column has nothing to measure, and would otherwise
    # raise ZeroDivisionError in the ratio below.
    if n_total == 0:
        return False

    # More than `threshold` fraction unique => too high cardinality.
    return series.nunique() / n_total > threshold
102-
10330 def detect_useless_columns_with_feature_engine (self , df : pd .DataFrame ) -> Tuple [List [str ], dict ]:
10431 """
10532 Use feature-engine to detect constant and duplicate columns.
@@ -160,14 +87,14 @@ def detect_useless_columns(self, df: pd.DataFrame) -> List[str]:
16087 series = df [col ]
16188
16289 # Check for ID columns (name patterns + data characteristics)
163- if self . detect_id_column (col , series ):
90+ if is_id_column (col , series ):
16491 useless_cols .append (col )
16592 reasons [col ] = "identifier/ID column"
16693 continue
16794
16895 # Check for high cardinality categorical
16996 if series .dtype == 'object' :
170- if self . detect_high_cardinality_categorical (series , threshold = 0.5 ):
97+ if is_high_cardinality_categorical (series , threshold = 0.5 ):
17198 useless_cols .append (col )
17299 reasons [col ] = f"high cardinality categorical ({ series .nunique ()} unique values)"
173100 continue
@@ -182,25 +109,9 @@ def detect_useless_columns(self, df: pd.DataFrame) -> List[str]:
182109 self .dropped_columns = useless_cols
183110 return useless_cols
184111
def detect_problem_type(self, y: pd.Series) -> str:
    """Classify the prediction task for target ``y``.

    Returns 'regression' only for a numeric target with a unique-value
    ratio of at least 5% AND 20 or more distinct values; everything
    else — including an empty target — is 'classification'.
    """
    # Empty target: default to classification.
    if len(y) == 0:
        return 'classification'

    # Non-numeric targets are always class labels.
    if not pd.api.types.is_numeric_dtype(y):
        return 'classification'

    # Numeric target: relatively or absolutely few distinct values
    # indicate discrete class labels rather than a continuous quantity.
    n_distinct = y.nunique()
    looks_discrete = (n_distinct / len(y) < 0.05) or (n_distinct < 20)
    return 'classification' if looks_discrete else 'regression'
def _detect_problem_type(self, y: pd.Series) -> str:
    """Detect if problem is classification or regression using shared utility.

    Thin delegation to the ``detect_problem_type`` helper imported from
    ``.utils`` so the heuristic lives in one shared place; returns that
    helper's string verdict unchanged.
    """
    return detect_problem_type(y)
204115
205116 def handle_missing_values (self , df : pd .DataFrame ) -> pd .DataFrame :
206117 """Handle missing values in the dataset"""
@@ -265,7 +176,7 @@ def preprocess(
265176 print (f"✂️ Removed { len (cols_to_drop )} column(s): { cols_to_drop } " )
266177
267178 # Detect problem type
268- problem_type = self .detect_problem_type (y )
179+ problem_type = self ._detect_problem_type (y )
269180 print (f"Detected problem type: { problem_type } " )
270181
271182 # Handle missing values
0 commit comments