improve features

This commit is contained in:
Urtzi Alfaro
2025-11-14 07:23:56 +01:00
parent 9bc048d360
commit a8d8828935
32 changed files with 5436 additions and 271 deletions

View File

@@ -11,7 +11,7 @@ from sqlalchemy import text
from app.core.database import get_db
from app.schemas.training import TrainedModelResponse, ModelMetricsResponse
from app.services.training_service import EnhancedTrainingService
-from datetime import datetime
+from datetime import datetime, timezone
from sqlalchemy import select, delete, func
import uuid
import shutil
@@ -79,13 +79,13 @@ async def get_active_model(
# ✅ FIX: Wrap update query with text() too
update_query = text("""
-UPDATE trained_models
-SET last_used_at = :now
+UPDATE trained_models
+SET last_used_at = :now
WHERE id = :model_id
""")
await db.execute(update_query, {
"now": datetime.utcnow(),
"now": datetime.now(timezone.utc),
"model_id": model_record.id
})
await db.commit()
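
The pattern applied throughout this commit replaces the naive datetime.utcnow(), which is deprecated since Python 3.12, with the timezone-aware datetime.now(timezone.utc). A minimal sketch of the difference, not part of the diff:

    from datetime import datetime, timezone

    # Naive: no tzinfo attached; deprecated since Python 3.12.
    naive = datetime.utcnow()
    assert naive.tzinfo is None

    # Aware: pinned to UTC, serializes with an explicit offset.
    aware = datetime.now(timezone.utc)
    assert aware.tzinfo is not None
    print(aware.isoformat())  # e.g. '2025-11-14T06:23:56.123456+00:00'

Ordering comparisons between naive and aware datetimes raise TypeError, so the substitution has to land everywhere at once, which is why the same one-line change recurs across the files below.
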
@@ -300,7 +300,7 @@ async def delete_tenant_models_complete(
deletion_stats = {
"tenant_id": tenant_id,
"deleted_at": datetime.utcnow().isoformat(),
"deleted_at": datetime.now(timezone.utc).isoformat(),
"jobs_cancelled": 0,
"models_deleted": 0,
"artifacts_deleted": 0,
@@ -322,7 +322,7 @@ async def delete_tenant_models_complete(
for job in active_jobs:
job.status = "cancelled"
-job.updated_at = datetime.utcnow()
+job.updated_at = datetime.now(timezone.utc)
deletion_stats["jobs_cancelled"] += 1
if active_jobs:

View File

@@ -17,7 +17,7 @@ from shared.database.base import create_database_manager
from shared.database.transactions import transactional
from shared.database.exceptions import DatabaseError
from app.core.config import settings
-from app.ml.enhanced_features import AdvancedFeatureEngineer
+from shared.ml.enhanced_features import AdvancedFeatureEngineer
import holidays
logger = structlog.get_logger()

View File

@@ -7,6 +7,7 @@ import pandas as pd
import numpy as np
from typing import Dict, List, Optional
import structlog
+from shared.ml.feature_calculator import HistoricalFeatureCalculator
logger = structlog.get_logger()
@@ -19,10 +20,12 @@ class AdvancedFeatureEngineer:
def __init__(self):
self.feature_columns = []
+self.feature_calculator = HistoricalFeatureCalculator()
def add_lagged_features(self, df: pd.DataFrame, lag_days: List[int] = None) -> pd.DataFrame:
"""
Add lagged demand features for capturing recent trends.
+Uses shared feature calculator for consistency with prediction service.
Args:
df: DataFrame with 'quantity' column
@@ -34,14 +37,20 @@ class AdvancedFeatureEngineer:
if lag_days is None:
lag_days = [1, 7, 14]
df = df.copy()
+# Use shared calculator for consistent lag calculation
+df = self.feature_calculator.calculate_lag_features(
+df,
+lag_days=lag_days,
+mode='training'
+)
+# Update feature columns list
for lag in lag_days:
col_name = f'lag_{lag}_day'
-df[col_name] = df['quantity'].shift(lag)
-self.feature_columns.append(col_name)
+if col_name not in self.feature_columns:
+self.feature_columns.append(col_name)
-logger.info(f"Added {len(lag_days)} lagged features", lags=lag_days)
+logger.info(f"Added {len(lag_days)} lagged features (using shared calculator)", lags=lag_days)
return df
def add_rolling_features(
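
The calculate_lag_features method of the shared HistoricalFeatureCalculator is only visible at its call site in this diff. A plausible sketch of what it computes; the signature and column naming come from the diff, the body is an assumption:

    import pandas as pd

    def calculate_lag_features(df: pd.DataFrame, lag_days: list, mode: str = 'training') -> pd.DataFrame:
        """Hypothetical sketch; the shared implementation is not part of this diff."""
        df = df.copy()
        for lag in lag_days:
            # One shifted copy of demand per lag: lag_1_day, lag_7_day, lag_14_day, ...
            df[f'lag_{lag}_day'] = df['quantity'].shift(lag)
        # 'mode' presumably distinguishes training (NaNs tolerated in the
        # earliest rows) from prediction (history must already be complete).
        return df
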
@@ -52,6 +61,7 @@ class AdvancedFeatureEngineer:
) -> pd.DataFrame:
"""
Add rolling statistics (mean, std, max, min).
+Uses shared feature calculator for consistency with prediction service.
Args:
df: DataFrame with 'quantity' column
@@ -67,24 +77,22 @@ class AdvancedFeatureEngineer:
if features is None:
features = ['mean', 'std', 'max', 'min']
df = df.copy()
+# Use shared calculator for consistent rolling calculation
+df = self.feature_calculator.calculate_rolling_features(
+df,
+windows=windows,
+statistics=features,
+mode='training'
+)
+# Update feature columns list
for window in windows:
for feature in features:
col_name = f'rolling_{feature}_{window}d'
+if col_name not in self.feature_columns:
+self.feature_columns.append(col_name)
-if feature == 'mean':
-df[col_name] = df['quantity'].rolling(window=window, min_periods=max(1, window // 2)).mean()
-elif feature == 'std':
-df[col_name] = df['quantity'].rolling(window=window, min_periods=max(1, window // 2)).std()
-elif feature == 'max':
-df[col_name] = df['quantity'].rolling(window=window, min_periods=max(1, window // 2)).max()
-elif feature == 'min':
-df[col_name] = df['quantity'].rolling(window=window, min_periods=max(1, window // 2)).min()
-self.feature_columns.append(col_name)
-logger.info(f"Added rolling features", windows=windows, features=features)
+logger.info(f"Added rolling features (using shared calculator)", windows=windows, features=features)
return df
def add_day_of_week_features(self, df: pd.DataFrame, date_column: str = 'date') -> pd.DataFrame:
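
The deleted inline code above is a good record of what calculate_rolling_features is now expected to produce. A sketch reproducing that behavior behind the shared call; the column names and the min_periods heuristic come from the removed lines, the function itself is an assumption:

    import pandas as pd

    def calculate_rolling_features(df, windows, statistics, mode='training'):
        """Hypothetical sketch mirroring the deleted inline implementation."""
        df = df.copy()
        for window in windows:
            # min_periods of half the window (at least 1) tolerates short
            # histories at the start of the series instead of yielding NaN.
            roll = df['quantity'].rolling(window=window, min_periods=max(1, window // 2))
            for stat in statistics:
                # Dispatches to roll.mean(), roll.std(), roll.max(), roll.min().
                df[f'rolling_{stat}_{window}d'] = getattr(roll, stat)()
        return df
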
@@ -203,6 +211,7 @@ class AdvancedFeatureEngineer:
def add_trend_features(self, df: pd.DataFrame, date_column: str = 'date') -> pd.DataFrame:
"""
Add trend-based features.
+Uses shared feature calculator for consistency with prediction service.
Args:
df: DataFrame with date and quantity
@@ -211,27 +220,18 @@ class AdvancedFeatureEngineer:
Returns:
DataFrame with trend features
"""
df = df.copy()
+# Use shared calculator for consistent trend calculation
+df = self.feature_calculator.calculate_trend_features(
+df,
+mode='training'
+)
-# Days since start (linear trend proxy)
-df['days_since_start'] = (df[date_column] - df[date_column].min()).dt.days
-# Momentum indicators (recent change vs. older change)
-if 'lag_1_day' in df.columns and 'lag_7_day' in df.columns:
-df['momentum_1_7'] = df['lag_1_day'] - df['lag_7_day']
-self.feature_columns.append('momentum_1_7')
-if 'rolling_mean_7d' in df.columns and 'rolling_mean_30d' in df.columns:
-df['trend_7_30'] = df['rolling_mean_7d'] - df['rolling_mean_30d']
-self.feature_columns.append('trend_7_30')
-# Velocity (rate of change)
-if 'lag_1_day' in df.columns and 'lag_7_day' in df.columns:
-df['velocity_week'] = (df['lag_1_day'] - df['lag_7_day']) / 7
-self.feature_columns.append('velocity_week')
-self.feature_columns.append('days_since_start')
+# Update feature columns list
+for feature_name in ['days_since_start', 'momentum_1_7', 'trend_7_30', 'velocity_week']:
+if feature_name in df.columns and feature_name not in self.feature_columns:
+self.feature_columns.append(feature_name)
+logger.debug("Added trend features (using shared calculator)")
return df
def add_cyclical_encoding(self, df: pd.DataFrame) -> pd.DataFrame:
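
Likewise, the removed trend code documents the columns calculate_trend_features must now emit: days_since_start, momentum_1_7, trend_7_30 and velocity_week. A sketch consistent with those deleted lines; the real shared body is not in this diff:

    import pandas as pd

    def calculate_trend_features(df, date_column='date', mode='training'):
        """Hypothetical sketch reconstructed from the deleted inline code."""
        df = df.copy()
        # Linear trend proxy: days elapsed since the first observation.
        df['days_since_start'] = (df[date_column] - df[date_column].min()).dt.days
        # Momentum and velocity require the lag features to exist already,
        # so add_lagged_features must run before add_trend_features.
        if 'lag_1_day' in df.columns and 'lag_7_day' in df.columns:
            df['momentum_1_7'] = df['lag_1_day'] - df['lag_7_day']
            df['velocity_week'] = (df['lag_1_day'] - df['lag_7_day']) / 7
        if 'rolling_mean_7d' in df.columns and 'rolling_mean_30d' in df.columns:
            df['trend_7_30'] = df['rolling_mean_7d'] - df['rolling_mean_30d']
        return df
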

View File

@@ -7,7 +7,7 @@ import pandas as pd
import numpy as np
from typing import Dict, List, Any, Optional, Tuple
import structlog
-from datetime import datetime
+from datetime import datetime, timezone
import joblib
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import TimeSeriesSplit
@@ -408,7 +408,7 @@ class HybridProphetXGBoost:
},
'tenant_id': tenant_id,
'inventory_product_id': inventory_product_id,
-'trained_at': datetime.utcnow().isoformat()
+'trained_at': datetime.now(timezone.utc).isoformat()
}
async def predict(

View File

@@ -844,6 +844,9 @@ class EnhancedBakeryMLTrainer:
# Extract training period from the processed data
training_start_date = None
training_end_date = None
+data_freshness_days = None
+data_coverage_days = None
if 'ds' in processed_data.columns and not processed_data.empty:
# Ensure ds column is datetime64 before extracting dates (prevents object dtype issues)
ds_datetime = pd.to_datetime(processed_data['ds'])
@@ -857,6 +860,15 @@ class EnhancedBakeryMLTrainer:
training_start_date = pd.Timestamp(min_ts).to_pydatetime().replace(tzinfo=None)
if pd.notna(max_ts):
training_end_date = pd.Timestamp(max_ts).to_pydatetime().replace(tzinfo=None)
+# Calculate data freshness metrics
+if training_end_date:
+from datetime import datetime
+data_freshness_days = (datetime.now() - training_end_date).days
+# Calculate data coverage period
+if training_start_date and training_end_date:
+data_coverage_days = (training_end_date - training_start_date).days
# Ensure features are clean string list
try:
@@ -864,6 +876,13 @@ class EnhancedBakeryMLTrainer:
except Exception:
features_used = []
+# Prepare hyperparameters with data freshness metrics
+hyperparameters = model_info.get("hyperparameters", {})
+if data_freshness_days is not None:
+hyperparameters["data_freshness_days"] = data_freshness_days
+if data_coverage_days is not None:
+hyperparameters["data_coverage_days"] = data_coverage_days
model_data = {
"tenant_id": tenant_id,
"inventory_product_id": inventory_product_id,
@@ -876,7 +895,7 @@ class EnhancedBakeryMLTrainer:
"rmse": float(model_info.get("training_metrics", {}).get("rmse", 0)) if model_info.get("training_metrics", {}).get("rmse") is not None else 0,
"r2_score": float(model_info.get("training_metrics", {}).get("r2", 0)) if model_info.get("training_metrics", {}).get("r2") is not None else 0,
"training_samples": int(len(processed_data)),
"hyperparameters": self._serialize_scalers(model_info.get("hyperparameters", {})),
"hyperparameters": self._serialize_scalers(hyperparameters),
"features_used": [str(f) for f in features_used] if features_used else [],
"normalization_params": self._serialize_scalers(self.enhanced_data_processor.get_scalers()) or {}, # Include scalers for prediction consistency
"product_category": model_info.get("product_category", "unknown"), # Store product category
@@ -890,7 +909,9 @@ class EnhancedBakeryMLTrainer:
model_record = await repos['model'].create_model(model_data)
logger.info("Created enhanced model record",
inventory_product_id=inventory_product_id,
-model_id=model_record.id)
+model_id=model_record.id,
+data_freshness_days=data_freshness_days,
+data_coverage_days=data_coverage_days)
# Create artifacts for model files
if model_info.get("model_path"):
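
The new data_freshness_days and data_coverage_days values are stored inside the hyperparameters JSON rather than in dedicated columns, which avoids a schema migration at the cost of SQL queryability. A sketch of how a consumer might read them back to flag stale models; model_record, the thresholds and the helper name are all hypothetical:

    MAX_FRESHNESS_DAYS = 30  # hypothetical retraining threshold
    MIN_COVERAGE_DAYS = 14   # hypothetical minimum history

    def needs_retraining(model_record) -> bool:
        """Hypothetical consumer of the metrics this commit records."""
        params = model_record.hyperparameters or {}
        freshness = params.get("data_freshness_days")
        coverage = params.get("data_coverage_days", 0)
        if freshness is None:
            return True  # metrics absent: model predates this commit, retrain
        # Stale training data, or too little history to trust the fit.
        return freshness > MAX_FRESHNESS_DAYS or coverage < MIN_COVERAGE_DAYS
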

View File

@@ -6,7 +6,7 @@ Service-specific repository base class with training service utilities
from typing import Optional, List, Dict, Any, Type
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import text
-from datetime import datetime, timedelta
+from datetime import datetime, timezone, timedelta
import structlog
from shared.database.repository import BaseRepository
@@ -73,7 +73,7 @@ class TrainingBaseRepository(BaseRepository):
async def cleanup_old_records(self, days_old: int = 90, status_filter: str = None) -> int:
"""Clean up old training records"""
try:
-cutoff_date = datetime.utcnow() - timedelta(days=days_old)
+cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_old)
table_name = self.model.__tablename__
# Build query based on available fields

View File

@@ -6,7 +6,7 @@ Repository for trained model operations
from typing import Optional, List, Dict, Any
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select, and_, text, desc
-from datetime import datetime, timedelta
+from datetime import datetime, timezone, timedelta
import structlog
from .base import TrainingBaseRepository
@@ -144,7 +144,7 @@ class ModelRepository(TrainingBaseRepository):
# Promote this model
updated_model = await self.update(model_id, {
"is_production": True,
"last_used_at": datetime.utcnow()
"last_used_at": datetime.now(timezone.utc)
})
logger.info("Model promoted to production",
@@ -164,7 +164,7 @@ class ModelRepository(TrainingBaseRepository):
"""Update model last used timestamp"""
try:
return await self.update(model_id, {
"last_used_at": datetime.utcnow()
"last_used_at": datetime.now(timezone.utc)
})
except Exception as e:
logger.error("Failed to update model usage",
@@ -176,7 +176,7 @@ class ModelRepository(TrainingBaseRepository):
async def archive_old_models(self, tenant_id: str, days_old: int = 90) -> int:
"""Archive old non-production models"""
try:
-cutoff_date = datetime.utcnow() - timedelta(days=days_old)
+cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_old)
query = text("""
UPDATE trained_models
@@ -235,7 +235,7 @@ class ModelRepository(TrainingBaseRepository):
product_stats = {row.inventory_product_id: row.count for row in result.fetchall()}
# Recent activity (models created in last 30 days)
-thirty_days_ago = datetime.utcnow() - timedelta(days=30)
+thirty_days_ago = datetime.now(timezone.utc) - timedelta(days=30)
recent_models_query = text("""
SELECT COUNT(*) as count
FROM trained_models