Improve features

Replace deprecated naive datetime.utcnow() with timezone-aware datetime.now(timezone.utc)
throughout the training service, delegate lag/rolling/trend feature engineering to the
shared HistoricalFeatureCalculator so training and prediction compute identical features,
and record data freshness/coverage metrics with each trained model.
@@ -11,7 +11,7 @@ from sqlalchemy import text
 from app.core.database import get_db
 from app.schemas.training import TrainedModelResponse, ModelMetricsResponse
 from app.services.training_service import EnhancedTrainingService
-from datetime import datetime
+from datetime import datetime, timezone
 from sqlalchemy import select, delete, func
 import uuid
 import shutil
@@ -79,13 +79,13 @@ async def get_active_model(

     # ✅ FIX: Wrap update query with text() too
     update_query = text("""
         UPDATE trained_models
         SET last_used_at = :now
         WHERE id = :model_id
     """)

     await db.execute(update_query, {
-        "now": datetime.utcnow(),
+        "now": datetime.now(timezone.utc),
         "model_id": model_record.id
     })
     await db.commit()
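Note on the recurring datetime change in this commit: datetime.utcnow() returns a naive timestamp (tzinfo is None) and is deprecated since Python 3.12, while datetime.now(timezone.utc) returns an aware UTC timestamp. A minimal sketch of the difference:

```python
from datetime import datetime, timezone

naive = datetime.utcnow()           # naive UTC; deprecated since Python 3.12
aware = datetime.now(timezone.utc)  # aware UTC; tzinfo is timezone.utc

print(naive.tzinfo, aware.tzinfo)   # None UTC

# Mixing the two fails at comparison time:
try:
    naive < aware
except TypeError as err:
    print(err)  # can't compare offset-naive and offset-aware datetimes
```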
@@ -300,7 +300,7 @@ async def delete_tenant_models_complete(

     deletion_stats = {
         "tenant_id": tenant_id,
-        "deleted_at": datetime.utcnow().isoformat(),
+        "deleted_at": datetime.now(timezone.utc).isoformat(),
         "jobs_cancelled": 0,
         "models_deleted": 0,
         "artifacts_deleted": 0,
@@ -322,7 +322,7 @@ async def delete_tenant_models_complete(

     for job in active_jobs:
         job.status = "cancelled"
-        job.updated_at = datetime.utcnow()
+        job.updated_at = datetime.now(timezone.utc)
         deletion_stats["jobs_cancelled"] += 1

     if active_jobs:
@@ -17,7 +17,7 @@ from shared.database.base import create_database_manager
 from shared.database.transactions import transactional
 from shared.database.exceptions import DatabaseError
 from app.core.config import settings
-from app.ml.enhanced_features import AdvancedFeatureEngineer
+from shared.ml.enhanced_features import AdvancedFeatureEngineer
 import holidays

 logger = structlog.get_logger()
@@ -7,6 +7,7 @@ import pandas as pd
 import numpy as np
 from typing import Dict, List, Optional
 import structlog
+from shared.ml.feature_calculator import HistoricalFeatureCalculator

 logger = structlog.get_logger()
@@ -19,10 +20,12 @@ class AdvancedFeatureEngineer:

     def __init__(self):
         self.feature_columns = []
+        self.feature_calculator = HistoricalFeatureCalculator()

     def add_lagged_features(self, df: pd.DataFrame, lag_days: List[int] = None) -> pd.DataFrame:
         """
         Add lagged demand features for capturing recent trends.
+        Uses shared feature calculator for consistency with prediction service.

         Args:
             df: DataFrame with 'quantity' column
@@ -34,14 +37,20 @@ class AdvancedFeatureEngineer:
         if lag_days is None:
             lag_days = [1, 7, 14]

         df = df.copy()
+        # Use shared calculator for consistent lag calculation
+        df = self.feature_calculator.calculate_lag_features(
+            df,
+            lag_days=lag_days,
+            mode='training'
+        )
+
+        # Update feature columns list
         for lag in lag_days:
             col_name = f'lag_{lag}_day'
-            df[col_name] = df['quantity'].shift(lag)
-            self.feature_columns.append(col_name)
+            if col_name not in self.feature_columns:
+                self.feature_columns.append(col_name)

-        logger.info(f"Added {len(lag_days)} lagged features", lags=lag_days)
+        logger.info(f"Added {len(lag_days)} lagged features (using shared calculator)", lags=lag_days)
         return df

     def add_rolling_features(
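This hunk moves the shift-based lag computation into the shared HistoricalFeatureCalculator; the engineer now only tracks column names. The calculator's internals are not part of this diff — a minimal sketch of what calculate_lag_features(mode='training') presumably does, assuming it mirrors the removed inline code:

```python
import pandas as pd
from typing import List

def calculate_lag_features(df: pd.DataFrame, lag_days: List[int],
                           mode: str = 'training') -> pd.DataFrame:
    # Hypothetical equivalent of HistoricalFeatureCalculator.calculate_lag_features
    df = df.copy()
    for lag in lag_days:
        # Same shift-based lag the removed inline code computed
        df[f'lag_{lag}_day'] = df['quantity'].shift(lag)
    return df
```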
@@ -52,6 +61,7 @@ class AdvancedFeatureEngineer:
     ) -> pd.DataFrame:
         """
         Add rolling statistics (mean, std, max, min).
+        Uses shared feature calculator for consistency with prediction service.

         Args:
             df: DataFrame with 'quantity' column
@@ -67,24 +77,22 @@ class AdvancedFeatureEngineer:
         if features is None:
             features = ['mean', 'std', 'max', 'min']

         df = df.copy()
+        # Use shared calculator for consistent rolling calculation
+        df = self.feature_calculator.calculate_rolling_features(
+            df,
+            windows=windows,
+            statistics=features,
+            mode='training'
+        )
+
+        # Update feature columns list
         for window in windows:
             for feature in features:
                 col_name = f'rolling_{feature}_{window}d'
-
-                if feature == 'mean':
-                    df[col_name] = df['quantity'].rolling(window=window, min_periods=max(1, window // 2)).mean()
-                elif feature == 'std':
-                    df[col_name] = df['quantity'].rolling(window=window, min_periods=max(1, window // 2)).std()
-                elif feature == 'max':
-                    df[col_name] = df['quantity'].rolling(window=window, min_periods=max(1, window // 2)).max()
-                elif feature == 'min':
-                    df[col_name] = df['quantity'].rolling(window=window, min_periods=max(1, window // 2)).min()
-
-                self.feature_columns.append(col_name)
+                if col_name not in self.feature_columns:
+                    self.feature_columns.append(col_name)

-        logger.info(f"Added rolling features", windows=windows, features=features)
+        logger.info(f"Added rolling features (using shared calculator)", windows=windows, features=features)
         return df

     def add_day_of_week_features(self, df: pd.DataFrame, date_column: str = 'date') -> pd.DataFrame:
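Same pattern for rolling statistics: the per-statistic if/elif chain collapses into one call to the shared calculator. Assuming the shared implementation preserves the removed behaviour (including min_periods=max(1, window // 2)), the equivalent computation is roughly:

```python
import pandas as pd
from typing import List

def calculate_rolling_features(df: pd.DataFrame, windows: List[int],
                               statistics: List[str], mode: str = 'training') -> pd.DataFrame:
    # Hypothetical equivalent of HistoricalFeatureCalculator.calculate_rolling_features
    df = df.copy()
    for window in windows:
        # Require at least half a window of observations, matching the removed code
        roll = df['quantity'].rolling(window=window, min_periods=max(1, window // 2))
        for stat in statistics:  # 'mean', 'std', 'max', 'min'
            df[f'rolling_{stat}_{window}d'] = getattr(roll, stat)()
    return df
```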
@@ -203,6 +211,7 @@ class AdvancedFeatureEngineer:
     def add_trend_features(self, df: pd.DataFrame, date_column: str = 'date') -> pd.DataFrame:
         """
         Add trend-based features.
+        Uses shared feature calculator for consistency with prediction service.

         Args:
             df: DataFrame with date and quantity
@@ -211,27 +220,18 @@ class AdvancedFeatureEngineer:
         Returns:
             DataFrame with trend features
         """
         df = df.copy()
+        # Use shared calculator for consistent trend calculation
+        df = self.feature_calculator.calculate_trend_features(
+            df,
+            mode='training'
+        )

-        # Days since start (linear trend proxy)
-        df['days_since_start'] = (df[date_column] - df[date_column].min()).dt.days
-
-        # Momentum indicators (recent change vs. older change)
-        if 'lag_1_day' in df.columns and 'lag_7_day' in df.columns:
-            df['momentum_1_7'] = df['lag_1_day'] - df['lag_7_day']
-            self.feature_columns.append('momentum_1_7')
-
-        if 'rolling_mean_7d' in df.columns and 'rolling_mean_30d' in df.columns:
-            df['trend_7_30'] = df['rolling_mean_7d'] - df['rolling_mean_30d']
-            self.feature_columns.append('trend_7_30')
-
-        # Velocity (rate of change)
-        if 'lag_1_day' in df.columns and 'lag_7_day' in df.columns:
-            df['velocity_week'] = (df['lag_1_day'] - df['lag_7_day']) / 7
-            self.feature_columns.append('velocity_week')
-
-        self.feature_columns.append('days_since_start')
+        # Update feature columns list
+        for feature_name in ['days_since_start', 'momentum_1_7', 'trend_7_30', 'velocity_week']:
+            if feature_name in df.columns and feature_name not in self.feature_columns:
+                self.feature_columns.append(feature_name)

+        logger.debug("Added trend features (using shared calculator)")
         return df

     def add_cyclical_encoding(self, df: pd.DataFrame) -> pd.DataFrame:
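The removed block documents exactly which trend features the shared calculator must now produce: days_since_start, momentum_1_7, trend_7_30, and velocity_week. Note the new call passes only df and mode, so the shared implementation presumably infers or hard-codes the date column. A sketch under the assumption that it mirrors the removed definitions:

```python
import pandas as pd

def calculate_trend_features(df: pd.DataFrame, mode: str = 'training',
                             date_column: str = 'date') -> pd.DataFrame:
    # Hypothetical equivalent of HistoricalFeatureCalculator.calculate_trend_features
    df = df.copy()
    # Linear trend proxy: days elapsed since the first observation
    df['days_since_start'] = (df[date_column] - df[date_column].min()).dt.days
    # Momentum and velocity depend on lag features having been added first
    if 'lag_1_day' in df.columns and 'lag_7_day' in df.columns:
        df['momentum_1_7'] = df['lag_1_day'] - df['lag_7_day']
        df['velocity_week'] = (df['lag_1_day'] - df['lag_7_day']) / 7  # per-day rate of change
    # Short- vs. long-horizon trend depends on rolling features
    if 'rolling_mean_7d' in df.columns and 'rolling_mean_30d' in df.columns:
        df['trend_7_30'] = df['rolling_mean_7d'] - df['rolling_mean_30d']
    return df
```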
@@ -7,7 +7,7 @@ import pandas as pd
 import numpy as np
 from typing import Dict, List, Any, Optional, Tuple
 import structlog
-from datetime import datetime
+from datetime import datetime, timezone
 import joblib
 from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
 from sklearn.model_selection import TimeSeriesSplit
@@ -408,7 +408,7 @@ class HybridProphetXGBoost:
             },
             'tenant_id': tenant_id,
             'inventory_product_id': inventory_product_id,
-            'trained_at': datetime.utcnow().isoformat()
+            'trained_at': datetime.now(timezone.utc).isoformat()
         }

     async def predict(
@@ -844,6 +844,9 @@ class EnhancedBakeryMLTrainer:
         # Extract training period from the processed data
         training_start_date = None
         training_end_date = None
+        data_freshness_days = None
+        data_coverage_days = None

         if 'ds' in processed_data.columns and not processed_data.empty:
             # Ensure ds column is datetime64 before extracting dates (prevents object dtype issues)
             ds_datetime = pd.to_datetime(processed_data['ds'])
@@ -857,6 +860,15 @@ class EnhancedBakeryMLTrainer:
                 training_start_date = pd.Timestamp(min_ts).to_pydatetime().replace(tzinfo=None)
             if pd.notna(max_ts):
                 training_end_date = pd.Timestamp(max_ts).to_pydatetime().replace(tzinfo=None)

+        # Calculate data freshness metrics
+        if training_end_date:
+            from datetime import datetime
+            data_freshness_days = (datetime.now() - training_end_date).days
+
+        # Calculate data coverage period
+        if training_start_date and training_end_date:
+            data_coverage_days = (training_end_date - training_start_date).days
+
         # Ensure features are clean string list
         try:
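One subtlety here: training_end_date was stripped to a naive datetime via .replace(tzinfo=None) above, so this block uses naive datetime.now() rather than the aware variant adopted elsewhere in the commit — subtracting across naive/aware would raise a TypeError. The arithmetic, with illustrative dates:

```python
from datetime import datetime

# Illustrative naive timestamps, as produced by .replace(tzinfo=None)
training_start_date = datetime(2024, 1, 1)
training_end_date = datetime(2024, 3, 1)

data_freshness_days = (datetime.now() - training_end_date).days      # days since last observation
data_coverage_days = (training_end_date - training_start_date).days  # 60 days of history
```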
@@ -864,6 +876,13 @@ class EnhancedBakeryMLTrainer:
         except Exception:
             features_used = []

+        # Prepare hyperparameters with data freshness metrics
+        hyperparameters = model_info.get("hyperparameters", {})
+        if data_freshness_days is not None:
+            hyperparameters["data_freshness_days"] = data_freshness_days
+        if data_coverage_days is not None:
+            hyperparameters["data_coverage_days"] = data_coverage_days
+
         model_data = {
             "tenant_id": tenant_id,
             "inventory_product_id": inventory_product_id,
@@ -876,7 +895,7 @@ class EnhancedBakeryMLTrainer:
             "rmse": float(model_info.get("training_metrics", {}).get("rmse", 0)) if model_info.get("training_metrics", {}).get("rmse") is not None else 0,
             "r2_score": float(model_info.get("training_metrics", {}).get("r2", 0)) if model_info.get("training_metrics", {}).get("r2") is not None else 0,
             "training_samples": int(len(processed_data)),
-            "hyperparameters": self._serialize_scalers(model_info.get("hyperparameters", {})),
+            "hyperparameters": self._serialize_scalers(hyperparameters),
             "features_used": [str(f) for f in features_used] if features_used else [],
             "normalization_params": self._serialize_scalers(self.enhanced_data_processor.get_scalers()) or {},  # Include scalers for prediction consistency
             "product_category": model_info.get("product_category", "unknown"),  # Store product category
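Rather than new columns, the freshness metrics ride along in the existing hyperparameters JSON field. Assuming _serialize_scalers passes plain scalars through untouched, the stored payload would look roughly like:

```python
# Hypothetical model_data["hyperparameters"] after this change
{
    "n_estimators": 200,        # illustrative original hyperparameters
    "learning_rate": 0.05,
    "data_freshness_days": 3,   # injected: days since the last training observation
    "data_coverage_days": 365,  # injected: span of the training window in days
}
```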
@@ -890,7 +909,9 @@ class EnhancedBakeryMLTrainer:
         model_record = await repos['model'].create_model(model_data)
         logger.info("Created enhanced model record",
                     inventory_product_id=inventory_product_id,
-                    model_id=model_record.id)
+                    model_id=model_record.id,
+                    data_freshness_days=data_freshness_days,
+                    data_coverage_days=data_coverage_days)

         # Create artifacts for model files
         if model_info.get("model_path"):
@@ -6,7 +6,7 @@ Service-specific repository base class with training service utilities
 from typing import Optional, List, Dict, Any, Type
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy import text
-from datetime import datetime, timedelta
+from datetime import datetime, timezone, timedelta
 import structlog

 from shared.database.repository import BaseRepository
@@ -73,7 +73,7 @@ class TrainingBaseRepository(BaseRepository):
     async def cleanup_old_records(self, days_old: int = 90, status_filter: str = None) -> int:
         """Clean up old training records"""
         try:
-            cutoff_date = datetime.utcnow() - timedelta(days=days_old)
+            cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_old)
             table_name = self.model.__tablename__

             # Build query based on available fields
@@ -6,7 +6,7 @@ Repository for trained model operations
 from typing import Optional, List, Dict, Any
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy import select, and_, text, desc
-from datetime import datetime, timedelta
+from datetime import datetime, timezone, timedelta
 import structlog

 from .base import TrainingBaseRepository
@@ -144,7 +144,7 @@ class ModelRepository(TrainingBaseRepository):
         # Promote this model
         updated_model = await self.update(model_id, {
             "is_production": True,
-            "last_used_at": datetime.utcnow()
+            "last_used_at": datetime.now(timezone.utc)
         })

         logger.info("Model promoted to production",
@@ -164,7 +164,7 @@ class ModelRepository(TrainingBaseRepository):
         """Update model last used timestamp"""
         try:
             return await self.update(model_id, {
-                "last_used_at": datetime.utcnow()
+                "last_used_at": datetime.now(timezone.utc)
             })
         except Exception as e:
             logger.error("Failed to update model usage",
@@ -176,7 +176,7 @@ class ModelRepository(TrainingBaseRepository):
     async def archive_old_models(self, tenant_id: str, days_old: int = 90) -> int:
         """Archive old non-production models"""
         try:
-            cutoff_date = datetime.utcnow() - timedelta(days=days_old)
+            cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_old)

             query = text("""
                 UPDATE trained_models
@@ -235,7 +235,7 @@ class ModelRepository(TrainingBaseRepository):
         product_stats = {row.inventory_product_id: row.count for row in result.fetchall()}

         # Recent activity (models created in last 30 days)
-        thirty_days_ago = datetime.utcnow() - timedelta(days=30)
+        thirty_days_ago = datetime.now(timezone.utc) - timedelta(days=30)
         recent_models_query = text("""
             SELECT COUNT(*) as count
             FROM trained_models