Skip to content
34 changes: 31 additions & 3 deletions .github/SETUP_CICD.md
Original file line number Diff line number Diff line change
Expand Up @@ -127,13 +127,41 @@ cat > github-actions-permissions.json <<EOF
{
"Sid": "APIGatewayManagement",
"Effect": "Allow",
"Action": "apigateway:*",
"Resource": "*"
"Action": [
"apigateway:GET",
"apigateway:POST",
"apigateway:PUT",
"apigateway:PATCH",
"apigateway:DELETE",
"apigateway:UpdateRestApiPolicy"
],
"Resource": [
"arn:aws:apigateway:*::/restapis",
"arn:aws:apigateway:*::/restapis/*"
]
},
{
"Sid": "BatchManagement",
"Effect": "Allow",
"Action": "batch:*",
"Action": [
"batch:CreateComputeEnvironment",
"batch:UpdateComputeEnvironment",
"batch:DeleteComputeEnvironment",
"batch:DescribeComputeEnvironments",
"batch:CreateJobQueue",
"batch:UpdateJobQueue",
"batch:DeleteJobQueue",
"batch:DescribeJobQueues",
"batch:RegisterJobDefinition",
"batch:DeregisterJobDefinition",
"batch:DescribeJobDefinitions",
"batch:SubmitJob",
"batch:DescribeJobs",
"batch:ListJobs",
"batch:TerminateJob",
"batch:TagResource",
"batch:UntagResource"
],
"Resource": "*"
},
{
Expand Down
13 changes: 9 additions & 4 deletions backend/training/eda.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pandas as pd
import numpy as np
from typing import Dict, List, Tuple, Any
from typing import List, Tuple
import re


Expand Down Expand Up @@ -54,6 +54,10 @@ def __init__(self, df: pd.DataFrame, target_column: str):

def _detect_problem_type(self) -> str:
"""Detect if classification or regression"""
# Guard against empty target
if len(self.target) == 0:
return 'classification' # Default fallback

if pd.api.types.is_numeric_dtype(self.target):
unique_ratio = self.target.nunique() / len(self.target)
if unique_ratio < 0.05 or self.target.nunique() < 20:
Expand Down Expand Up @@ -108,9 +112,10 @@ def _analyze_columns(self):

if self.problem_type == 'classification':
class_counts = self.target.value_counts()
imbalance_ratio = class_counts.max() / class_counts.min()
if imbalance_ratio > 3:
self.warnings.append(f"Class imbalance detected (ratio: {imbalance_ratio:.1f}:1)")
if len(class_counts) > 0 and class_counts.min() > 0:
imbalance_ratio = class_counts.max() / class_counts.min()
if imbalance_ratio > 3:
self.warnings.append(f"Class imbalance detected (ratio: {imbalance_ratio:.1f}:1)")

def _get_css(self) -> str:
"""Return CSS styles"""
Expand Down
4 changes: 2 additions & 2 deletions backend/training/model_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,8 +170,8 @@ def get_feature_importance(model: AutoML, feature_names) -> Dict[str, float]:
print(f" {i+1}. {feature}: {importance:.4f}")

return feature_importance
except Exception:
pass
except (AttributeError, TypeError) as e:
print(f"Could not extract feature importances from model: {e}")

# Fallback: Create equal importance for all features
print("\nCould not extract feature importances, using equal weights")
Expand Down
4 changes: 4 additions & 0 deletions backend/training/preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,10 @@ def detect_useless_columns(self, df: pd.DataFrame) -> List[str]:

def detect_problem_type(self, y: pd.Series) -> str:
"""Detect if problem is classification or regression"""
# Guard against empty target
if len(y) == 0:
return 'classification' # Default fallback

# Check if target is numeric
if pd.api.types.is_numeric_dtype(y):
# If numeric, check unique values ratio
Expand Down
4 changes: 2 additions & 2 deletions backend/training/training_report.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from typing import Dict, Any
from datetime import datetime
from datetime import datetime, timezone


def generate_training_report(
Expand Down Expand Up @@ -382,7 +382,7 @@ def _generate_config_info(self) -> str:

def generate(self) -> str:
"""Generate complete HTML report"""
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S UTC")
timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")

html = f"""
<!DOCTYPE html>
Expand Down
52 changes: 21 additions & 31 deletions frontend/app/results/[jobId]/page.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,25 @@ export default function ResultsPage() {
setTimeout(() => setCopiedPython(false), 2000);
};

// Generate Docker commands for model prediction (extracted to avoid duplication)
// Build the multi-line shell snippet (build image, inspect model, generate
// sample input, single + batch predict) shown in the results page and copied
// to the clipboard. Kept as one template literal so the rendered <pre> block
// and the copied text are guaranteed identical.
// NOTE(review): the model filename uses only the first 8 chars of the job id —
// assumes that prefix matches the artifact naming on the backend; confirm.
const getDockerCommands = (jobId: string) => {
const modelFile = `model_${jobId.slice(0, 8)}.pkl`;
// `\${PWD}` is escaped so it reaches the user's shell literally.
return `# Build prediction container (one time)
docker build -f scripts/Dockerfile.predict -t automl-predict .

# Show model info and required features
docker run --rm -v \${PWD}:/data automl-predict /data/${modelFile} --info

# Generate sample input JSON (auto-detects features)
docker run --rm -v \${PWD}:/data automl-predict /data/${modelFile} -g /data/sample_input.json

# Edit sample_input.json with your values, then predict
docker run --rm -v \${PWD}:/data automl-predict /data/${modelFile} --json /data/sample_input.json

# Batch predictions from CSV
docker run --rm -v \${PWD}:/data automl-predict /data/${modelFile} -i /data/test.csv -o /data/predictions.csv`;
};

useEffect(() => {
const fetchResults = async () => {
try {
Expand Down Expand Up @@ -276,39 +295,10 @@ export default function ResultsPage() {
</div>
<div className="relative">
<pre className="bg-gray-900 text-gray-100 rounded-lg p-4 overflow-x-auto text-sm font-mono">
<code>{`# Build prediction container (one time)
docker build -f scripts/Dockerfile.predict -t automl-predict .

# Show model info and required features
docker run --rm -v \${PWD}:/data automl-predict /data/model_${job.job_id.slice(0, 8)}.pkl --info

# Generate sample input JSON (auto-detects features)
docker run --rm -v \${PWD}:/data automl-predict /data/model_${job.job_id.slice(0, 8)}.pkl -g /data/sample_input.json

# Edit sample_input.json with your values, then predict
docker run --rm -v \${PWD}:/data automl-predict /data/model_${job.job_id.slice(0, 8)}.pkl --json /data/sample_input.json

# Batch predictions from CSV
docker run --rm -v \${PWD}:/data automl-predict /data/model_${job.job_id.slice(0, 8)}.pkl -i /data/test.csv -o /data/predictions.csv`}</code>
<code>{getDockerCommands(job.job_id)}</code>
</pre>
<button
onClick={() => {
const code = `# Build prediction container (one time)
docker build -f scripts/Dockerfile.predict -t automl-predict .

# Show model info and required features
docker run --rm -v \${PWD}:/data automl-predict /data/model_${job.job_id.slice(0, 8)}.pkl --info

# Generate sample input JSON (auto-detects features)
docker run --rm -v \${PWD}:/data automl-predict /data/model_${job.job_id.slice(0, 8)}.pkl -g /data/sample_input.json

# Edit sample_input.json with your values, then predict
docker run --rm -v \${PWD}:/data automl-predict /data/model_${job.job_id.slice(0, 8)}.pkl --json /data/sample_input.json

# Batch predictions from CSV
docker run --rm -v \${PWD}:/data automl-predict /data/model_${job.job_id.slice(0, 8)}.pkl -i /data/test.csv -o /data/predictions.csv`;
handleCopyDocker(code);
}}
onClick={() => handleCopyDocker(getDockerCommands(job.job_id))}
className={`absolute top-2 right-2 px-3 py-1 text-xs rounded transition-all cursor-pointer ${
copiedDocker
? 'bg-green-600 text-white'
Expand Down
4 changes: 3 additions & 1 deletion frontend/next.config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@ const nextConfig: NextConfig = {
unoptimized: true,
},

// Trailing slashes for better compatibility
// Trailing slashes ensure consistent URL handling across:
// - AWS Amplify SSR deployments (prevents 404 on refresh)
// - Static file serving and client-side navigation
trailingSlash: true,
};

Expand Down
19 changes: 16 additions & 3 deletions infrastructure/terraform/s3.tf
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,12 @@ resource "aws_s3_bucket_cors_configuration" "datasets" {
cors_rule {
allowed_headers = ["*"]
allowed_methods = ["PUT", "GET"]
allowed_origins = ["*"]
# Security: Use specific origins instead of wildcard
# Defaults to Amplify domain + localhost for development
allowed_origins = length(var.cors_allowed_origins) > 0 ? var.cors_allowed_origins : concat(
local.amplify_enabled ? ["https://${aws_amplify_app.frontend[0].default_domain}"] : [],
var.environment == "dev" ? ["http://localhost:3000"] : []
Copy link

Copilot AI Dec 9, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The CORS configuration may result in an empty allowed_origins list in production when Amplify is disabled. When var.cors_allowed_origins is empty, local.amplify_enabled is false (no GitHub integration), and var.environment != "dev", the concat() will produce an empty list [], which is invalid for CORS configuration.

Recommendation: Add a fallback to prevent empty origins list:

allowed_origins = length(var.cors_allowed_origins) > 0 ? var.cors_allowed_origins : (
  length(concat(
    local.amplify_enabled ? ["https://${aws_amplify_app.frontend[0].default_domain}"] : [],
    var.environment == "dev" ? ["http://localhost:3000"] : []
  )) > 0 ? concat(
    local.amplify_enabled ? ["https://${aws_amplify_app.frontend[0].default_domain}"] : [],
    var.environment == "dev" ? ["http://localhost:3000"] : []
  ) : ["*"]  # Fallback to wildcard if no specific origins configured
)

Or require cors_allowed_origins to be set for production deployments without Amplify.

Suggested change
allowed_origins = length(var.cors_allowed_origins) > 0 ? var.cors_allowed_origins : concat(
local.amplify_enabled ? ["https://${aws_amplify_app.frontend[0].default_domain}"] : [],
var.environment == "dev" ? ["http://localhost:3000"] : []
allowed_origins = length(var.cors_allowed_origins) > 0 ? var.cors_allowed_origins : (
length(concat(
local.amplify_enabled ? ["https://${aws_amplify_app.frontend[0].default_domain}"] : [],
var.environment == "dev" ? ["http://localhost:3000"] : []
)) > 0 ? concat(
local.amplify_enabled ? ["https://${aws_amplify_app.frontend[0].default_domain}"] : [],
var.environment == "dev" ? ["http://localhost:3000"] : []
) : ["*"]

Copilot uses AI. Check for mistakes.
)
max_age_seconds = 3600
}
}
Expand Down Expand Up @@ -77,7 +82,11 @@ resource "aws_s3_bucket_cors_configuration" "models" {
cors_rule {
allowed_headers = ["*"]
allowed_methods = ["GET"]
allowed_origins = ["*"]
# Security: Use specific origins instead of wildcard
allowed_origins = length(var.cors_allowed_origins) > 0 ? var.cors_allowed_origins : concat(
local.amplify_enabled ? ["https://${aws_amplify_app.frontend[0].default_domain}"] : [],
var.environment == "dev" ? ["http://localhost:3000"] : []
)
Copy link

Copilot AI Dec 9, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The CORS configuration may result in an empty allowed_origins list in production when Amplify is disabled. When var.cors_allowed_origins is empty, local.amplify_enabled is false (no GitHub integration), and var.environment != "dev", the concat() will produce an empty list [], which is invalid for CORS configuration.

Recommendation: Add a fallback to prevent empty origins list or require cors_allowed_origins to be set for production deployments without Amplify.

Copilot uses AI. Check for mistakes.
expose_headers = ["Content-Disposition"]
max_age_seconds = 3600
}
Expand Down Expand Up @@ -120,7 +129,11 @@ resource "aws_s3_bucket_cors_configuration" "reports" {
cors_rule {
allowed_headers = ["*"]
allowed_methods = ["GET"]
allowed_origins = ["*"]
# Security: Use specific origins instead of wildcard
allowed_origins = length(var.cors_allowed_origins) > 0 ? var.cors_allowed_origins : concat(
local.amplify_enabled ? ["https://${aws_amplify_app.frontend[0].default_domain}"] : [],
var.environment == "dev" ? ["http://localhost:3000"] : []
)
Copy link

Copilot AI Dec 9, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The CORS configuration may result in an empty allowed_origins list in production when Amplify is disabled. When var.cors_allowed_origins is empty, local.amplify_enabled is false (no GitHub integration), and var.environment != "dev", the concat() will produce an empty list [], which is invalid for CORS configuration.

Recommendation: Add a fallback to prevent empty origins list or require cors_allowed_origins to be set for production deployments without Amplify.

Copilot uses AI. Check for mistakes.
expose_headers = ["Content-Disposition"]
max_age_seconds = 3600
}
Expand Down
8 changes: 8 additions & 0 deletions infrastructure/terraform/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -124,3 +124,11 @@ variable "github_token" {
sensitive = true
default = ""
}

# Explicit override for the S3 CORS origin lists consumed in s3.tf
# (datasets / models / reports bucket cors_rule blocks).
variable "cors_allowed_origins" {
description = "List of allowed origins for S3 CORS configuration. Use specific domains for security."
type = list(string)
default = []
# When empty, s3.tf falls back to the Amplify default domain (when Amplify
# is enabled) plus http://localhost:3000 in the dev environment.
# NOTE(review): with Amplify disabled and environment != "dev", that
# fallback evaluates to an empty origins list, which is invalid for a CORS
# rule — set this variable explicitly for such production deployments.
# For production, specify exact frontend URLs.
}
16 changes: 11 additions & 5 deletions scripts/predict.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,11 @@ def prepare_input(data: pd.DataFrame, preprocessor) -> pd.DataFrame:
numeric_cols = df.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
if df[col].isnull().any():
df[col].fillna(df[col].median(), inplace=True)
median_val = df[col].median()
# Fallback to 0 if median is NaN (empty column or all NaN)
if pd.isna(median_val):
median_val = 0
df[col].fillna(median_val, inplace=True)

categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
Expand Down Expand Up @@ -243,8 +247,9 @@ def predict_single(model_package: dict, input_data: dict) -> dict:

result['probabilities'] = {str(label): float(p) for label, p in zip(class_labels, probas)}
result['confidence'] = float(max(probas))
except Exception:
pass
except (AttributeError, ValueError, IndexError) as e:
# Log warning but continue - probabilities are optional
print(f"⚠️ Could not compute class probabilities: {e}")

return result

Expand Down Expand Up @@ -283,8 +288,9 @@ def predict_batch(model_package: dict, input_path: str, output_path: str) -> Non
try:
probas = model.predict_proba(X)
df['confidence'] = probas.max(axis=1)
except Exception:
pass
except (AttributeError, ValueError) as e:
# Log warning but continue - confidence scores are optional
print(f"⚠️ Could not compute confidence scores: {e}")

# Save results
df.to_csv(output_path, index=False)
Expand Down