Improve features

Urtzi Alfaro
2025-11-14 07:23:56 +01:00
parent 9bc048d360
commit a8d8828935
32 changed files with 5436 additions and 271 deletions

View File

@@ -213,17 +213,17 @@ async def check_is_school_holiday(
response_model=TenantLocationContextResponse
)
async def get_tenant_location_context(
tenant_id: UUID = Depends(get_current_user_dep),
tenant_id: str = Path(..., description="Tenant ID"),
current_user: dict = Depends(get_current_user_dep),
db: AsyncSession = Depends(get_db)
):
"""Get location context for a tenant including school calendar assignment (cached)"""
try:
tenant_id_str = str(tenant_id)
# Check cache first
cached = await cache.get_cached_tenant_context(tenant_id_str)
cached = await cache.get_cached_tenant_context(tenant_id)
if cached:
logger.debug("Returning cached tenant context", tenant_id=tenant_id_str)
logger.debug("Returning cached tenant context", tenant_id=tenant_id)
return TenantLocationContextResponse(**cached)
# Cache miss - fetch from database
@@ -261,11 +261,16 @@ async def get_tenant_location_context(
)
async def create_or_update_tenant_location_context(
request: TenantLocationContextCreateRequest,
tenant_id: UUID = Depends(get_current_user_dep),
tenant_id: str = Path(..., description="Tenant ID"),
current_user: dict = Depends(get_current_user_dep),
db: AsyncSession = Depends(get_db)
):
"""Create or update tenant location context"""
try:
# Convert to UUID for use with repository
tenant_uuid = UUID(tenant_id)
repo = CalendarRepository(db)
# Validate calendar_id if provided
@@ -279,7 +284,7 @@ async def create_or_update_tenant_location_context(
# Create or update context
context_obj = await repo.create_or_update_tenant_location_context(
tenant_id=tenant_id,
tenant_id=tenant_uuid,
city_id=request.city_id,
school_calendar_id=request.school_calendar_id,
neighborhood=request.neighborhood,
@@ -288,13 +293,13 @@ async def create_or_update_tenant_location_context(
)
# Invalidate cache since context was updated
await cache.invalidate_tenant_context(str(tenant_id))
await cache.invalidate_tenant_context(tenant_id)
# Get full context with calendar details
context = await repo.get_tenant_with_calendar(tenant_id)
context = await repo.get_tenant_with_calendar(tenant_uuid)
# Cache the new context
await cache.set_cached_tenant_context(str(tenant_id), context)
await cache.set_cached_tenant_context(tenant_id, context)
return TenantLocationContextResponse(**context)
@@ -317,13 +322,18 @@ async def create_or_update_tenant_location_context(
status_code=204
)
async def delete_tenant_location_context(
tenant_id: UUID = Depends(get_current_user_dep),
tenant_id: str = Path(..., description="Tenant ID"),
current_user: dict = Depends(get_current_user_dep),
db: AsyncSession = Depends(get_db)
):
"""Delete tenant location context"""
try:
# Convert to UUID for use with repository
tenant_uuid = UUID(tenant_id)
repo = CalendarRepository(db)
deleted = await repo.delete_tenant_location_context(tenant_id)
deleted = await repo.delete_tenant_location_context(tenant_uuid)
if not deleted:
raise HTTPException(
@@ -347,6 +357,97 @@ async def delete_tenant_location_context(
)
# ===== Calendar Suggestion Endpoint =====
@router.post(
route_builder.build_base_route("location-context/suggest-calendar")
)
async def suggest_calendar_for_tenant(
tenant_id: str = Path(..., description="Tenant ID"),
current_user: dict = Depends(get_current_user_dep),
db: AsyncSession = Depends(get_db)
):
"""
Suggest an appropriate school calendar for a tenant based on location and POI data.
This endpoint analyzes:
- Tenant's city location
- Detected schools nearby (from POI detection)
- Available calendars for the city
- Bakery-specific heuristics (primary schools = stronger morning rush)
Returns a suggestion with confidence score and reasoning.
Does NOT automatically assign - requires admin approval.
"""
try:
from app.utils.calendar_suggester import CalendarSuggester
from app.repositories.poi_context_repository import POIContextRepository
tenant_uuid = UUID(tenant_id)
# Get tenant's location context
calendar_repo = CalendarRepository(db)
location_context = await calendar_repo.get_tenant_location_context(tenant_uuid)
if not location_context:
raise HTTPException(
status_code=404,
detail="Location context not found. Create location context first."
)
city_id = location_context.city_id
# Get available calendars for city
calendars_result = await calendar_repo.get_calendars_by_city(city_id, enabled_only=True)
calendars = calendars_result.get("calendars", []) if calendars_result else []
# Get POI context if available
poi_repo = POIContextRepository(db)
poi_context = await poi_repo.get_by_tenant_id(tenant_uuid)
poi_data = poi_context.to_dict() if poi_context else None
# Generate suggestion
suggester = CalendarSuggester()
suggestion = suggester.suggest_calendar_for_tenant(
city_id=city_id,
available_calendars=calendars,
poi_context=poi_data,
tenant_data=None # Could include tenant info if needed
)
# Format for admin display
admin_message = suggester.format_suggestion_for_admin(suggestion)
logger.info(
"Calendar suggestion generated",
tenant_id=tenant_id,
city_id=city_id,
suggested_calendar=suggestion.get("suggested_calendar_id"),
confidence=suggestion.get("confidence")
)
return {
**suggestion,
"admin_message": admin_message,
"tenant_id": tenant_id,
"current_calendar_id": str(location_context.school_calendar_id) if location_context.school_calendar_id else None
}
except HTTPException:
raise
except Exception as e:
logger.error(
"Error generating calendar suggestion",
tenant_id=tenant_id,
error=str(e),
exc_info=True
)
raise HTTPException(
status_code=500,
detail=f"Error generating calendar suggestion: {str(e)}"
)
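For reference, a minimal sketch of how this endpoint might be called; the base URL, the exact path layout, and the bearer-token header are assumptions, since the real prefix comes from route_builder.build_base_route("location-context/suggest-calendar") and the auth setup is not shown in this diff:
import httpx
# Hypothetical URL; only the "location-context/suggest-calendar" suffix and the
# tenant_id path parameter are taken from the handler above.
tenant_id = "00000000-0000-0000-0000-000000000000"
url = f"http://localhost:8000/tenants/{tenant_id}/location-context/suggest-calendar"
resp = httpx.post(url, headers={"Authorization": "Bearer <token>"})
data = resp.json()
# Keys returned by the handler: the CalendarSuggester suggestion fields plus
# admin_message, tenant_id and current_calendar_id.
print(data["suggested_calendar_id"], data["confidence"], data["should_auto_assign"])
print(data["admin_message"])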
# ===== Helper Endpoints =====
@router.get(

View File

@@ -21,10 +21,10 @@ from app.core.redis_client import get_redis_client
logger = structlog.get_logger()
router = APIRouter(prefix="/poi-context", tags=["POI Context"])
router = APIRouter(prefix="/tenants", tags=["POI Context"])
@router.post("/{tenant_id}/detect")
@router.post("/{tenant_id}/poi-context/detect")
async def detect_pois_for_tenant(
tenant_id: str,
latitude: float = Query(..., description="Bakery latitude"),
@@ -209,13 +209,79 @@ async def detect_pois_for_tenant(
relevant_categories=len(feature_selection.get("relevant_categories", []))
)
# Phase 3: Auto-trigger calendar suggestion after POI detection
# This helps admins by providing intelligent calendar recommendations
calendar_suggestion = None
try:
from app.utils.calendar_suggester import CalendarSuggester
from app.repositories.calendar_repository import CalendarRepository
# Get tenant's location context
calendar_repo = CalendarRepository(db)
location_context = await calendar_repo.get_tenant_location_context(tenant_uuid)
if location_context and location_context.school_calendar_id is None:
# Only suggest if no calendar assigned yet
city_id = location_context.city_id
# Get available calendars for city
calendars_result = await calendar_repo.get_calendars_by_city(city_id, enabled_only=True)
calendars = calendars_result.get("calendars", []) if calendars_result else []
if calendars:
# Generate suggestion using POI data
suggester = CalendarSuggester()
calendar_suggestion = suggester.suggest_calendar_for_tenant(
city_id=city_id,
available_calendars=calendars,
poi_context=poi_context.to_dict(),
tenant_data=None
)
logger.info(
"Calendar suggestion auto-generated after POI detection",
tenant_id=tenant_id,
suggested_calendar=calendar_suggestion.get("calendar_name"),
confidence=calendar_suggestion.get("confidence_percentage"),
should_auto_assign=calendar_suggestion.get("should_auto_assign")
)
# TODO: Send notification to admin about available suggestion
# This will be implemented when notification service is integrated
else:
logger.info(
"No calendars available for city, skipping suggestion",
tenant_id=tenant_id,
city_id=city_id
)
elif location_context and location_context.school_calendar_id:
logger.info(
"Calendar already assigned, skipping suggestion",
tenant_id=tenant_id,
calendar_id=str(location_context.school_calendar_id)
)
else:
logger.warning(
"No location context found, skipping calendar suggestion",
tenant_id=tenant_id
)
except Exception as e:
# Non-blocking: POI detection should succeed even if suggestion fails
logger.warning(
"Failed to auto-generate calendar suggestion (non-blocking)",
tenant_id=tenant_id,
error=str(e)
)
return {
"status": "success",
"source": "detection",
"poi_context": poi_context.to_dict(),
"feature_selection": feature_selection,
"competitor_analysis": competitor_analysis,
"competitive_insights": competitive_insights
"competitive_insights": competitive_insights,
"calendar_suggestion": calendar_suggestion # Include suggestion in response
}
except Exception as e:
@@ -231,7 +297,7 @@ async def detect_pois_for_tenant(
)
@router.get("/{tenant_id}")
@router.get("/{tenant_id}/poi-context")
async def get_poi_context(
tenant_id: str,
db: AsyncSession = Depends(get_db)
@@ -265,7 +331,7 @@ async def get_poi_context(
}
@router.post("/{tenant_id}/refresh")
@router.post("/{tenant_id}/poi-context/refresh")
async def refresh_poi_context(
tenant_id: str,
db: AsyncSession = Depends(get_db)
@@ -299,7 +365,7 @@ async def refresh_poi_context(
)
@router.delete("/{tenant_id}")
@router.delete("/{tenant_id}/poi-context")
async def delete_poi_context(
tenant_id: str,
db: AsyncSession = Depends(get_db)
@@ -327,7 +393,7 @@ async def delete_poi_context(
}
@router.get("/{tenant_id}/feature-importance")
@router.get("/{tenant_id}/poi-context/feature-importance")
async def get_feature_importance(
tenant_id: str,
db: AsyncSession = Depends(get_db)
@@ -364,7 +430,7 @@ async def get_feature_importance(
}
@router.get("/{tenant_id}/competitor-analysis")
@router.get("/{tenant_id}/poi-context/competitor-analysis")
async def get_competitor_analysis(
tenant_id: str,
db: AsyncSession = Depends(get_db)
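With the router prefix changed from "/poi-context" to "/tenants", the POI context routes are now nested under the tenant, as the hunks above show:
POST   /tenants/{tenant_id}/poi-context/detect
GET    /tenants/{tenant_id}/poi-context
POST   /tenants/{tenant_id}/poi-context/refresh
DELETE /tenants/{tenant_id}/poi-context
GET    /tenants/{tenant_id}/poi-context/feature-importance
GET    /tenants/{tenant_id}/poi-context/competitor-analysis
A minimal sketch of triggering detection; the base URL and the longitude parameter are assumptions (only the latitude query parameter is visible in this diff):
import httpx
tenant_id = "00000000-0000-0000-0000-000000000000"
resp = httpx.post(
    f"http://localhost:8000/tenants/{tenant_id}/poi-context/detect",
    params={"latitude": 40.4168, "longitude": -3.7038},  # longitude assumed
)
# The detection response now also carries the auto-generated calendar suggestion.
print(resp.json().get("calendar_suggestion"))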

View File

@@ -0,0 +1,342 @@
"""
Calendar Suggester Utility
Provides intelligent school calendar suggestions based on POI detection data,
tenant location, and heuristics optimized for bakery demand forecasting.
"""
from typing import Optional, Dict, List, Any, Tuple
from datetime import datetime, date, timezone
import structlog
logger = structlog.get_logger()
class CalendarSuggester:
"""
Suggests appropriate school calendars for tenants based on location context.
Uses POI detection data, proximity analysis, and bakery-specific heuristics
to provide intelligent calendar recommendations with confidence scores.
"""
def __init__(self):
self.logger = logger
def suggest_calendar_for_tenant(
self,
city_id: str,
available_calendars: List[Dict[str, Any]],
poi_context: Optional[Dict[str, Any]] = None,
tenant_data: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
"""
Suggest the most appropriate calendar for a tenant.
Args:
city_id: Normalized city ID (e.g., "madrid")
available_calendars: List of available school calendars for the city
poi_context: Optional POI detection results including school data
tenant_data: Optional tenant information (location, etc.)
Returns:
Dict with:
- suggested_calendar_id: UUID of suggested calendar or None
- calendar_name: Name of suggested calendar
- confidence: Float 0.0-1.0 confidence score
- reasoning: List of reasoning steps
- fallback_calendars: Alternative suggestions
- should_auto_assign: Boolean recommendation to auto-assign
"""
if not available_calendars:
return self._no_calendars_available(city_id)
# Get current academic year
academic_year = self._get_current_academic_year()
# Filter calendars for current academic year
current_year_calendars = [
cal for cal in available_calendars
if cal.get("academic_year") == academic_year
]
if not current_year_calendars:
# Fallback to any calendar if current year not available
current_year_calendars = available_calendars
self.logger.warning(
"No calendars for current academic year, using all available",
city_id=city_id,
academic_year=academic_year
)
# Analyze POI context if available
school_analysis = self._analyze_schools_from_poi(poi_context) if poi_context else None
# Apply bakery-specific heuristics
suggestion = self._apply_suggestion_heuristics(
current_year_calendars,
school_analysis,
city_id
)
return suggestion
def _get_current_academic_year(self) -> str:
"""
Determine current academic year based on date.
Academic year runs September to June (Spain):
- Jan-Aug: Previous year (e.g., 2024-2025)
- Sep-Dec: Current year (e.g., 2025-2026)
Returns:
Academic year string (e.g., "2024-2025")
"""
today = date.today()
year = today.year
# Academic year starts in September
if today.month >= 9: # September onwards
return f"{year}-{year + 1}"
else: # January-August
return f"{year - 1}-{year}"
def _analyze_schools_from_poi(
self,
poi_context: Dict[str, Any]
) -> Optional[Dict[str, Any]]:
"""
Analyze school POIs to infer school type preferences.
Args:
poi_context: POI detection results
Returns:
Dict with:
- has_schools_nearby: Boolean
- school_count: Int count of schools
- nearest_distance: Float distance to nearest school (meters)
- proximity_score: Float proximity score
- school_names: List of detected school names
"""
try:
poi_results = poi_context.get("poi_detection_results", {})
schools_data = poi_results.get("schools", {})
if not schools_data:
return None
school_pois = schools_data.get("pois", [])
school_count = len(school_pois)
if school_count == 0:
return None
# Extract school details
school_names = [
poi.get("name", "Unknown School")
for poi in school_pois
if poi.get("name")
]
# Get proximity metrics
features = schools_data.get("features", {})
proximity_score = features.get("proximity_score", 0.0)
# Calculate nearest distance (approximate from POI data)
nearest_distance = None
if school_pois:
# If we have POIs, estimate nearest distance
# This is approximate - exact calculation would require tenant coords
nearest_distance = 100.0 # Default assumption if schools detected
return {
"has_schools_nearby": True,
"school_count": school_count,
"nearest_distance": nearest_distance,
"proximity_score": proximity_score,
"school_names": school_names
}
except Exception as e:
self.logger.warning(
"Failed to analyze schools from POI",
error=str(e)
)
return None
def _apply_suggestion_heuristics(
self,
calendars: List[Dict[str, Any]],
school_analysis: Optional[Dict[str, Any]],
city_id: str
) -> Dict[str, Any]:
"""
Apply heuristics to suggest best calendar.
Bakery-specific heuristics:
1. If schools detected nearby -> Prefer primary (stronger morning rush)
2. If no schools detected -> Still suggest primary (more common, safer default)
3. Primary schools have a stronger impact on bakery traffic
Args:
calendars: List of available calendars
school_analysis: Analysis of nearby schools
city_id: City identifier
Returns:
Suggestion dict with confidence and reasoning
"""
reasoning = []
confidence = 0.0
# Separate calendars by type
primary_calendars = [c for c in calendars if c.get("school_type") == "primary"]
secondary_calendars = [c for c in calendars if c.get("school_type") == "secondary"]
other_calendars = [c for c in calendars if c.get("school_type") not in ["primary", "secondary"]]
# Heuristic 1: Schools detected nearby
if school_analysis and school_analysis.get("has_schools_nearby"):
school_count = school_analysis.get("school_count", 0)
proximity_score = school_analysis.get("proximity_score", 0.0)
reasoning.append(f"Detected {school_count} schools nearby (proximity score: {proximity_score:.2f})")
if primary_calendars:
suggested = primary_calendars[0]
confidence = min(0.85, 0.65 + (proximity_score * 0.1)) # 65-85% confidence
reasoning.append("Primary schools create strong morning rush (7:30-9am drop-off)")
reasoning.append("Primary calendars recommended for bakeries near schools")
elif secondary_calendars:
suggested = secondary_calendars[0]
confidence = 0.70
reasoning.append("Secondary school calendars available (later morning start)")
else:
suggested = calendars[0]
confidence = 0.50
reasoning.append("Using available calendar (school type not specified)")
# Heuristic 2: No schools detected
else:
reasoning.append("No schools detected within 500m radius")
if primary_calendars:
suggested = primary_calendars[0]
confidence = 0.60 # Lower confidence without detected schools
reasoning.append("Defaulting to primary calendar (more common, safer choice)")
reasoning.append("Primary school holidays still affect general foot traffic")
elif secondary_calendars:
suggested = secondary_calendars[0]
confidence = 0.55
reasoning.append("Secondary calendar available as default")
elif other_calendars:
suggested = other_calendars[0]
confidence = 0.50
reasoning.append("Using available calendar")
else:
suggested = calendars[0]
confidence = 0.45
reasoning.append("No preferred calendar type available")
# Confidence adjustment based on school analysis quality
if school_analysis:
if school_analysis.get("school_count", 0) >= 3:
confidence = min(1.0, confidence + 0.05) # Boost for multiple schools
reasoning.append("High confidence: Multiple schools detected")
proximity = school_analysis.get("proximity_score", 0.0)
if proximity > 2.0:
confidence = min(1.0, confidence + 0.05) # Boost for close proximity
reasoning.append("High confidence: Schools very close to bakery")
# Determine if we should auto-assign
# Only auto-assign if confidence >= 75% AND schools detected
should_auto_assign = (
confidence >= 0.75 and
school_analysis is not None and
school_analysis.get("has_schools_nearby", False)
)
# Build fallback suggestions
fallback_calendars = []
for cal in calendars:
if cal.get("id") != suggested.get("id"):
fallback_calendars.append({
"calendar_id": str(cal.get("id")),
"calendar_name": cal.get("name"),
"school_type": cal.get("school_type"),
"academic_year": cal.get("academic_year")
})
return {
"suggested_calendar_id": str(suggested.get("id")),
"calendar_name": suggested.get("name"),
"school_type": suggested.get("school_type"),
"academic_year": suggested.get("academic_year"),
"confidence": round(confidence, 2),
"confidence_percentage": round(confidence * 100, 1),
"reasoning": reasoning,
"fallback_calendars": fallback_calendars[:2], # Top 2 alternatives
"should_auto_assign": should_auto_assign,
"school_analysis": school_analysis,
"city_id": city_id
}
def _no_calendars_available(self, city_id: str) -> Dict[str, Any]:
"""Return response when no calendars available for city."""
return {
"suggested_calendar_id": None,
"calendar_name": None,
"school_type": None,
"academic_year": None,
"confidence": 0.0,
"confidence_percentage": 0.0,
"reasoning": [
f"No school calendars configured for city: {city_id}",
"Calendar assignment not possible at this time",
"Location context created without calendar (can be added later)"
],
"fallback_calendars": [],
"should_auto_assign": False,
"school_analysis": None,
"city_id": city_id
}
def format_suggestion_for_admin(self, suggestion: Dict[str, Any]) -> str:
"""
Format suggestion as human-readable text for admin UI.
Args:
suggestion: Suggestion dict from suggest_calendar_for_tenant
Returns:
Formatted string for display
"""
if not suggestion.get("suggested_calendar_id"):
return f"⚠️ No calendars available for {suggestion.get('city_id', 'this city')}"
confidence_pct = suggestion.get("confidence_percentage", 0)
calendar_name = suggestion.get("calendar_name", "Unknown")
school_type = suggestion.get("school_type", "").capitalize()
# Confidence emoji
if confidence_pct >= 80:
emoji = ""
elif confidence_pct >= 60:
emoji = "📊"
else:
emoji = "💡"
text = f"{emoji} **Suggested**: {calendar_name}\n"
text += f"**Type**: {school_type} | **Confidence**: {confidence_pct}%\n\n"
text += "**Reasoning**:\n"
for reason in suggestion.get("reasoning", []):
text += f"{reason}\n"
if suggestion.get("fallback_calendars"):
text += "\n**Alternatives**:\n"
for alt in suggestion.get("fallback_calendars", [])[:2]:
text += f"{alt.get('calendar_name')} ({alt.get('school_type')})\n"
return text
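A minimal usage sketch of the class above; the calendar dicts carry only the keys the heuristics actually read (id, name, school_type, academic_year), and poi_context mirrors the structure _analyze_schools_from_poi expects. The concrete values are illustrative assumptions:
from app.utils.calendar_suggester import CalendarSuggester
suggester = CalendarSuggester()
calendars = [
    {"id": "c1", "name": "Madrid Primary 2025-2026", "school_type": "primary", "academic_year": "2025-2026"},
    {"id": "c2", "name": "Madrid Secondary 2025-2026", "school_type": "secondary", "academic_year": "2025-2026"},
]
poi_context = {
    "poi_detection_results": {
        "schools": {
            "pois": [{"name": "CEIP Ejemplo"}],
            "features": {"proximity_score": 1.2},
        }
    }
}
suggestion = suggester.suggest_calendar_for_tenant(
    city_id="madrid",
    available_calendars=calendars,
    poi_context=poi_context,
)
# With one nearby school and a primary calendar available, confidence is
# min(0.85, 0.65 + 1.2 * 0.1) = 0.77, so should_auto_assign is True (>= 0.75).
print(suggestion["calendar_name"], suggestion["confidence"], suggestion["should_auto_assign"])
print(suggester.format_suggestion_for_admin(suggestion))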

View File

@@ -56,21 +56,17 @@ class BakeryForecaster:
from app.services.poi_feature_service import POIFeatureService
self.poi_feature_service = POIFeatureService()
# Initialize enhanced data processor from shared module
if use_enhanced_features:
# Import enhanced data processor from training service
import sys
import os
# Add training service to path
training_path = os.path.join(os.path.dirname(__file__), '../../../training')
if training_path not in sys.path:
sys.path.insert(0, training_path)
try:
from app.ml.data_processor import EnhancedBakeryDataProcessor
self.data_processor = EnhancedBakeryDataProcessor(database_manager)
logger.info("Enhanced features enabled for forecasting")
from shared.ml.data_processor import EnhancedBakeryDataProcessor
self.data_processor = EnhancedBakeryDataProcessor(region='MD')
logger.info("Enhanced features enabled using shared data processor")
except ImportError as e:
logger.warning(f"Could not import EnhancedBakeryDataProcessor: {e}, falling back to basic features")
logger.warning(
f"Could not import EnhancedBakeryDataProcessor from shared module: {e}. "
"Falling back to basic features."
)
self.use_enhanced_features = False
self.data_processor = None
else:

View File

@@ -1056,13 +1056,13 @@ class EnhancedForecastingService:
- External service is unavailable
"""
try:
# Get tenant's calendar ID
calendar_id = await self.data_client.get_tenant_calendar(tenant_id)
# Get tenant's calendar information
calendar_info = await self.data_client.fetch_tenant_calendar(tenant_id)
if calendar_id:
if calendar_info:
# Check school holiday via external service
is_school_holiday = await self.data_client.check_school_holiday(
calendar_id=calendar_id,
calendar_id=calendar_info["calendar_id"],
check_date=date_obj.isoformat(),
tenant_id=tenant_id
)

View File

@@ -206,13 +206,39 @@ class PredictionService:
# Calculate confidence interval
confidence_interval = upper_bound - lower_bound
# Adjust confidence based on data freshness if historical features were calculated
adjusted_confidence_level = confidence_level
data_availability_score = features.get('historical_data_availability_score', 1.0) # Default to 1.0 if not available
# Reduce confidence if historical data is significantly old
if data_availability_score < 0.5:
# For data availability score < 0.5 (more than 90 days old), reduce confidence
adjusted_confidence_level = max(0.6, confidence_level * data_availability_score)
# Increase confidence interval to reflect uncertainty
adjustment_factor = 1.0 + (0.5 * (1.0 - data_availability_score)) # Up to 50% wider interval
adjusted_lower_bound = prediction_value - (prediction_value - lower_bound) * adjustment_factor
adjusted_upper_bound = prediction_value + (upper_bound - prediction_value) * adjustment_factor
logger.info("Adjusted prediction confidence due to stale historical data",
original_confidence=confidence_level,
adjusted_confidence=adjusted_confidence_level,
data_availability_score=data_availability_score,
original_interval=confidence_interval,
adjusted_interval=adjusted_upper_bound - adjusted_lower_bound)
lower_bound = max(0, adjusted_lower_bound)
upper_bound = adjusted_upper_bound
confidence_interval = upper_bound - lower_bound
result = {
"prediction": max(0, prediction_value), # Ensure non-negative
"lower_bound": max(0, lower_bound),
"upper_bound": max(0, upper_bound),
"confidence_interval": confidence_interval,
"confidence_level": confidence_level
"confidence_level": adjusted_confidence_level,
"data_freshness_score": data_availability_score # Include data freshness in result
}
# Record metrics
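A small numeric sketch of the data-freshness adjustment applied above, assuming a prediction of 100 with bounds [80, 120], a requested confidence_level of 0.8, and a stale-data availability score of 0.4:
score = 0.4
confidence_level = 0.8
prediction, lower, upper = 100.0, 80.0, 120.0
adjusted_confidence = max(0.6, confidence_level * score)    # max(0.6, 0.32) -> 0.6
factor = 1.0 + 0.5 * (1.0 - score)                          # 1.3 -> interval widened by 30%
lower = max(0, prediction - (prediction - lower) * factor)  # 100 - 20 * 1.3 = 74.0
upper = prediction + (upper - prediction) * factor          # 100 + 20 * 1.3 = 126.0
print(adjusted_confidence, lower, upper)                    # 0.6 74.0 126.0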
@@ -222,35 +248,45 @@ class PredictionService:
# Register metrics if not already registered
if "prediction_processing_time" not in metrics._histograms:
metrics.register_histogram(
"prediction_processing_time",
"Time taken to process predictions",
"prediction_processing_time",
"Time taken to process predictions",
labels=['service', 'model_type']
)
if "predictions_served_total" not in metrics._counters:
try:
metrics.register_counter(
"predictions_served_total",
"Total number of predictions served",
"predictions_served_total",
"Total number of predictions served",
labels=['service', 'status']
)
except Exception as reg_error:
# Metric might already exist in global registry
logger.debug("Counter already exists in registry", error=str(reg_error))
# Now record the metrics
metrics.observe_histogram(
"prediction_processing_time",
processing_time,
labels={'service': 'forecasting-service', 'model_type': 'prophet'}
)
metrics.increment_counter(
"predictions_served_total",
labels={'service': 'forecasting-service', 'status': 'success'}
)
# Now record the metrics - try with expected labels, fallback if needed
try:
metrics.observe_histogram(
"prediction_processing_time",
processing_time,
labels={'service': 'forecasting-service', 'model_type': 'prophet'}
)
metrics.increment_counter(
"predictions_served_total",
labels={'service': 'forecasting-service', 'status': 'success'}
)
except Exception as label_error:
# If specific labels fail, try without labels to avoid breaking predictions
logger.warning("Failed to record metrics with labels, trying without", error=str(label_error))
try:
metrics.observe_histogram("prediction_processing_time", processing_time)
metrics.increment_counter("predictions_served_total")
except Exception as no_label_error:
logger.warning("Failed to record metrics even without labels", error=str(no_label_error))
except Exception as metrics_error:
# Log metrics error but don't fail the prediction
logger.warning("Failed to record metrics", error=str(metrics_error))
logger.warning("Failed to register or record metrics", error=str(metrics_error))
logger.info("Prediction generated successfully",
model_id=model_id,
@@ -260,22 +296,32 @@ class PredictionService:
return result
except Exception as e:
logger.error("Error generating prediction",
error=str(e),
logger.error("Error generating prediction",
error=str(e),
model_id=model_id)
# Record error metrics with robust error handling
try:
if "prediction_errors_total" not in metrics._counters:
metrics.register_counter(
"prediction_errors_total",
"Total number of prediction errors",
"prediction_errors_total",
"Total number of prediction errors",
labels=['service', 'error_type']
)
metrics.increment_counter(
"prediction_errors_total",
labels={'service': 'forecasting-service', 'error_type': 'prediction_failed'}
)
except Exception:
pass # Don't fail on metrics errors
# Try with labels first, then without if that fails
try:
metrics.increment_counter(
"prediction_errors_total",
labels={'service': 'forecasting-service', 'error_type': 'prediction_failed'}
)
except Exception as label_error:
logger.debug("Failed to record error metrics with labels", error=str(label_error))
try:
metrics.increment_counter("prediction_errors_total")
except Exception as no_label_error:
logger.warning("Failed to record error metrics even without labels", error=str(no_label_error))
except Exception as registration_error:
logger.warning("Failed to register error metrics", error=str(registration_error))
raise
async def predict_with_weather_forecast(
@@ -353,6 +399,33 @@ class PredictionService:
'weather_description': day_weather.get('description', 'Clear')
})
# CRITICAL FIX: Fetch historical sales data and calculate historical features
# This populates lag, rolling, and trend features for better predictions
# Using 90 days for better trend analysis and more robust rolling statistics
if 'tenant_id' in enriched_features and 'inventory_product_id' in enriched_features and 'date' in enriched_features:
try:
forecast_date = pd.to_datetime(enriched_features['date'])
historical_sales = await self._fetch_historical_sales(
tenant_id=enriched_features['tenant_id'],
inventory_product_id=enriched_features['inventory_product_id'],
forecast_date=forecast_date,
days_back=90 # Changed from 30 to 90 for better historical context
)
# Calculate historical features and merge into features dict
historical_features = self._calculate_historical_features(
historical_sales, forecast_date
)
enriched_features.update(historical_features)
logger.info("Historical features enriched",
lag_1_day=historical_features.get('lag_1_day'),
rolling_mean_7d=historical_features.get('rolling_mean_7d'))
except Exception as e:
logger.warning("Failed to enrich with historical features, using defaults",
error=str(e))
# Features dict will use defaults (0.0) from _prepare_prophet_features
# Prepare Prophet dataframe with weather features
prophet_df = self._prepare_prophet_features(enriched_features)
@@ -363,6 +436,29 @@ class PredictionService:
lower_bound = float(forecast['yhat_lower'].iloc[0])
upper_bound = float(forecast['yhat_upper'].iloc[0])
# Calculate confidence adjustment based on data freshness
current_confidence_level = confidence_level
data_availability_score = enriched_features.get('historical_data_availability_score', 1.0) # Default to 1.0 if not available
# Adjust confidence based on data freshness if historical features were calculated
# Reduce confidence if historical data is significantly old
if data_availability_score < 0.5:
# For data availability score < 0.5 (more than 90 days old), reduce confidence
current_confidence_level = max(0.6, confidence_level * data_availability_score)
# Increase confidence interval to reflect uncertainty
adjustment_factor = 1.0 + (0.5 * (1.0 - data_availability_score)) # Up to 50% wider interval
adjusted_lower_bound = prediction_value - (prediction_value - lower_bound) * adjustment_factor
adjusted_upper_bound = prediction_value + (upper_bound - prediction_value) * adjustment_factor
logger.info("Adjusted weather prediction confidence due to stale historical data",
original_confidence=confidence_level,
adjusted_confidence=current_confidence_level,
data_availability_score=data_availability_score)
lower_bound = max(0, adjusted_lower_bound)
upper_bound = adjusted_upper_bound
# Apply weather-based adjustments (business rules)
adjusted_prediction = self._apply_weather_adjustments(
prediction_value,
@@ -375,7 +471,8 @@ class PredictionService:
"prediction": max(0, adjusted_prediction),
"lower_bound": max(0, lower_bound),
"upper_bound": max(0, upper_bound),
"confidence_level": confidence_level,
"confidence_level": current_confidence_level,
"data_freshness_score": data_availability_score, # Include data freshness in result
"weather": {
"temperature": enriched_features['temperature'],
"precipitation": enriched_features['precipitation'],
@@ -567,6 +664,8 @@ class PredictionService:
) -> pd.Series:
"""
Fetch historical sales data for calculating lagged and rolling features.
Enhanced to handle cases where recent data is not available by extending
the search for the most recent data if needed.
Args:
tenant_id: Tenant UUID
@@ -578,7 +677,7 @@ class PredictionService:
pandas Series with sales quantities indexed by date
"""
try:
# Calculate date range
# Calculate initial date range for recent data
end_date = forecast_date - pd.Timedelta(days=1) # Day before forecast
start_date = end_date - pd.Timedelta(days=days_back)
@@ -589,7 +688,7 @@ class PredictionService:
end_date=end_date.date(),
days_back=days_back)
# Fetch sales data from sales service
# First, try to fetch sales data from the recent period
sales_data = await self.sales_client.get_sales_data(
tenant_id=tenant_id,
start_date=start_date.strftime("%Y-%m-%d"),
@@ -598,15 +697,72 @@ class PredictionService:
aggregation="daily"
)
# If no recent data found, search for the most recent available data
if not sales_data:
logger.warning("No historical sales data found",
logger.info("No recent sales data found, expanding search to find most recent data",
tenant_id=tenant_id,
product_id=inventory_product_id)
# Search for available data in larger time windows (up to 2 years back)
search_windows = [365, 730] # 1 year, 2 years
for window_days in search_windows:
extended_start_date = forecast_date - pd.Timedelta(days=window_days)
logger.debug("Expanding search window for historical data",
start_date=extended_start_date.date(),
end_date=end_date.date(),
window_days=window_days)
sales_data = await self.sales_client.get_sales_data(
tenant_id=tenant_id,
start_date=extended_start_date.strftime("%Y-%m-%d"),
end_date=end_date.strftime("%Y-%m-%d"),
product_id=inventory_product_id,
aggregation="daily"
)
if sales_data:
logger.info("Found historical data in expanded search window",
tenant_id=tenant_id,
product_id=inventory_product_id,
data_start=sales_data[0]['sale_date'] if sales_data else "None",
data_end=sales_data[-1]['sale_date'] if sales_data else "None",
window_days=window_days)
break
if not sales_data:
logger.warning("No historical sales data found in any search window",
tenant_id=tenant_id,
product_id=inventory_product_id)
return pd.Series(dtype=float)
# Convert to pandas Series indexed by date
# Convert to pandas DataFrame and check if it has the expected structure
df = pd.DataFrame(sales_data)
df['sale_date'] = pd.to_datetime(df['sale_date'])
# Check if the expected 'sale_date' column exists
if df.empty:
logger.warning("No historical sales data returned from API")
return pd.Series(dtype=float)
# Check for available columns and find date column
available_columns = list(df.columns)
logger.debug(f"Available sales data columns: {available_columns}")
# Check for alternative date column names
date_columns = ['sale_date', 'date', 'forecast_date', 'datetime', 'timestamp']
date_column = None
for col in date_columns:
if col in df.columns:
date_column = col
break
if date_column is None:
logger.error(f"Sales data missing expected date column. Available columns: {available_columns}")
logger.debug(f"Sample of sales data: {df.head()}")
return pd.Series(dtype=float)
df['sale_date'] = pd.to_datetime(df[date_column])
df = df.set_index('sale_date')
# Extract quantity column (could be 'quantity' or 'total_quantity')
@@ -639,6 +795,10 @@ class PredictionService:
) -> Dict[str, float]:
"""
Calculate lagged, rolling, and trend features from historical sales data.
Enhanced to handle cases where recent data is not available by using
available historical data with appropriate temporal adjustments.
Now uses shared feature calculator for consistency with training service.
Args:
historical_sales: Series of sales quantities indexed by date
@@ -647,117 +807,26 @@ class PredictionService:
Returns:
Dictionary of calculated features
"""
features = {}
try:
if len(historical_sales) == 0:
logger.warning("No historical data available, using default values")
# Return all features with default values (0.0)
return {
# Lagged features
'lag_1_day': 0.0,
'lag_7_day': 0.0,
'lag_14_day': 0.0,
# Rolling statistics (7-day window)
'rolling_mean_7d': 0.0,
'rolling_std_7d': 0.0,
'rolling_max_7d': 0.0,
'rolling_min_7d': 0.0,
# Rolling statistics (14-day window)
'rolling_mean_14d': 0.0,
'rolling_std_14d': 0.0,
'rolling_max_14d': 0.0,
'rolling_min_14d': 0.0,
# Rolling statistics (30-day window)
'rolling_mean_30d': 0.0,
'rolling_std_30d': 0.0,
'rolling_max_30d': 0.0,
'rolling_min_30d': 0.0,
# Trend features
'days_since_start': 0,
'momentum_1_7': 0.0,
'trend_7_30': 0.0,
'velocity_week': 0.0,
}
# Use shared feature calculator for consistency
from shared.ml.feature_calculator import HistoricalFeatureCalculator
# Calculate lagged features
features['lag_1_day'] = float(historical_sales.iloc[-1]) if len(historical_sales) >= 1 else 0.0
features['lag_7_day'] = float(historical_sales.iloc[-7]) if len(historical_sales) >= 7 else features['lag_1_day']
features['lag_14_day'] = float(historical_sales.iloc[-14]) if len(historical_sales) >= 14 else features['lag_7_day']
calculator = HistoricalFeatureCalculator()
# Calculate rolling statistics (7-day window)
if len(historical_sales) >= 7:
window_7d = historical_sales.iloc[-7:]
features['rolling_mean_7d'] = float(window_7d.mean())
features['rolling_std_7d'] = float(window_7d.std())
features['rolling_max_7d'] = float(window_7d.max())
features['rolling_min_7d'] = float(window_7d.min())
else:
features['rolling_mean_7d'] = features['lag_1_day']
features['rolling_std_7d'] = 0.0
features['rolling_max_7d'] = features['lag_1_day']
features['rolling_min_7d'] = features['lag_1_day']
# Calculate all features using shared calculator
features = calculator.calculate_all_features(
sales_data=historical_sales,
reference_date=forecast_date,
mode='prediction'
)
# Calculate rolling statistics (14-day window)
if len(historical_sales) >= 14:
window_14d = historical_sales.iloc[-14:]
features['rolling_mean_14d'] = float(window_14d.mean())
features['rolling_std_14d'] = float(window_14d.std())
features['rolling_max_14d'] = float(window_14d.max())
features['rolling_min_14d'] = float(window_14d.min())
else:
features['rolling_mean_14d'] = features['rolling_mean_7d']
features['rolling_std_14d'] = features['rolling_std_7d']
features['rolling_max_14d'] = features['rolling_max_7d']
features['rolling_min_14d'] = features['rolling_min_7d']
# Calculate rolling statistics (30-day window)
if len(historical_sales) >= 30:
window_30d = historical_sales.iloc[-30:]
features['rolling_mean_30d'] = float(window_30d.mean())
features['rolling_std_30d'] = float(window_30d.std())
features['rolling_max_30d'] = float(window_30d.max())
features['rolling_min_30d'] = float(window_30d.min())
else:
features['rolling_mean_30d'] = features['rolling_mean_14d']
features['rolling_std_30d'] = features['rolling_std_14d']
features['rolling_max_30d'] = features['rolling_max_14d']
features['rolling_min_30d'] = features['rolling_min_14d']
# Calculate trend features
if len(historical_sales) > 0:
# Days since first sale
features['days_since_start'] = (forecast_date - historical_sales.index[0]).days
# Momentum (difference between recent lag_1_day and lag_7_day)
if len(historical_sales) >= 7:
features['momentum_1_7'] = features['lag_1_day'] - features['lag_7_day']
else:
features['momentum_1_7'] = 0.0
# Trend (difference between recent 7-day and 30-day averages)
if len(historical_sales) >= 30:
features['trend_7_30'] = features['rolling_mean_7d'] - features['rolling_mean_30d']
else:
features['trend_7_30'] = 0.0
# Velocity (rate of change over the last week)
if len(historical_sales) >= 7:
week_change = historical_sales.iloc[-1] - historical_sales.iloc[-7]
features['velocity_week'] = float(week_change / 7.0)
else:
features['velocity_week'] = 0.0
else:
features['days_since_start'] = 0
features['momentum_1_7'] = 0.0
features['trend_7_30'] = 0.0
features['velocity_week'] = 0.0
logger.debug("Historical features calculated",
lag_1_day=features['lag_1_day'],
rolling_mean_7d=features['rolling_mean_7d'],
rolling_mean_30d=features['rolling_mean_30d'],
momentum=features['momentum_1_7'])
logger.debug("Historical features calculated (using shared calculator)",
lag_1_day=features.get('lag_1_day', 0.0),
rolling_mean_7d=features.get('rolling_mean_7d', 0.0),
rolling_mean_30d=features.get('rolling_mean_30d', 0.0),
momentum=features.get('momentum_1_7', 0.0),
days_since_last_sale=features.get('days_since_last_sale', 0),
data_availability_score=features.get('historical_data_availability_score', 0.0))
return features
@@ -770,8 +839,9 @@ class PredictionService:
'rolling_mean_7d', 'rolling_std_7d', 'rolling_max_7d', 'rolling_min_7d',
'rolling_mean_14d', 'rolling_std_14d', 'rolling_max_14d', 'rolling_min_14d',
'rolling_mean_30d', 'rolling_std_30d', 'rolling_max_30d', 'rolling_min_30d',
'momentum_1_7', 'trend_7_30', 'velocity_week'
]} | {'days_since_start': 0}
'momentum_1_7', 'trend_7_30', 'velocity_week',
'days_since_last_sale', 'historical_data_availability_score'
]}
def _prepare_prophet_features(self, features: Dict[str, Any]) -> pd.DataFrame:
"""Convert features to Prophet-compatible DataFrame - COMPLETE FEATURE MATCHING"""
@@ -962,6 +1032,9 @@ class PredictionService:
'momentum_1_7': float(features.get('momentum_1_7', 0.0)),
'trend_7_30': float(features.get('trend_7_30', 0.0)),
'velocity_week': float(features.get('velocity_week', 0.0)),
# Data freshness metrics to help model understand data recency
'days_since_last_sale': int(features.get('days_since_last_sale', 0)),
'historical_data_availability_score': float(features.get('historical_data_availability_score', 0.0)),
}
# Calculate interaction features

View File

@@ -92,7 +92,7 @@ class InventoryAlertRepository:
JOIN ingredients i ON s.ingredient_id = i.id
WHERE i.tenant_id = :tenant_id
AND s.is_available = true
AND s.expiration_date <= CURRENT_DATE + INTERVAL ':days_threshold days'
AND s.expiration_date <= CURRENT_DATE + (INTERVAL '1 day' * :days_threshold)
ORDER BY s.expiration_date ASC, total_value DESC
""")
@@ -134,7 +134,7 @@ class InventoryAlertRepository:
FROM temperature_logs tl
WHERE tl.tenant_id = :tenant_id
AND tl.is_within_range = false
AND tl.recorded_at > NOW() - INTERVAL ':hours_back hours'
AND tl.recorded_at > NOW() - (INTERVAL '1 hour' * :hours_back)
AND tl.alert_triggered = false
ORDER BY deviation DESC, tl.recorded_at DESC
""")

View File

@@ -227,9 +227,9 @@ class InventoryAlertService(BaseAlertService, AlertServiceMixin):
"""Process expiring items for a tenant"""
try:
# Group by urgency
expired = [i for i in items if i['days_to_expiry'] <= 0]
urgent = [i for i in items if 0 < i['days_to_expiry'] <= 2]
warning = [i for i in items if 2 < i['days_to_expiry'] <= 7]
expired = [i for i in items if i['days_until_expiry'] <= 0]
urgent = [i for i in items if 0 < i['days_until_expiry'] <= 2]
warning = [i for i in items if 2 < i['days_until_expiry'] <= 7]
# Process expired products (urgent alerts)
if expired:
@@ -257,7 +257,7 @@ class InventoryAlertService(BaseAlertService, AlertServiceMixin):
'name': item['name'],
'stock_id': str(item['stock_id']),
'quantity': float(item['current_quantity']),
'days_expired': abs(item['days_to_expiry'])
'days_expired': abs(item['days_until_expiry'])
} for item in expired
]
}
@@ -270,12 +270,12 @@ class InventoryAlertService(BaseAlertService, AlertServiceMixin):
'type': 'urgent_expiry',
'severity': 'high',
'title': f'⏰ Caducidad Urgente: {item["name"]}',
'message': f'{item["name"]} caduca en {item["days_to_expiry"]} día(s). Usar prioritariamente.',
'message': f'{item["name"]} caduca en {item["days_until_expiry"]} día(s). Usar prioritariamente.',
'actions': ['Usar inmediatamente', 'Promoción especial', 'Revisar recetas', 'Documentar'],
'metadata': {
'ingredient_id': str(item['id']),
'stock_id': str(item['stock_id']),
'days_to_expiry': item['days_to_expiry'],
'days_to_expiry': item['days_until_expiry'],
'quantity': float(item['current_quantity'])
}
}, item_type='alert')

View File

@@ -18,18 +18,44 @@ depends_on = None
def upgrade():
"""Rename metadata columns to additional_data to avoid SQLAlchemy reserved attribute conflict"""
# Rename metadata column in equipment_connection_logs
op.execute('ALTER TABLE equipment_connection_logs RENAME COLUMN metadata TO additional_data')
# Check if columns need to be renamed (they may already be named additional_data in migration 002)
from sqlalchemy import inspect
from alembic import op
# Rename metadata column in equipment_iot_alerts
op.execute('ALTER TABLE equipment_iot_alerts RENAME COLUMN metadata TO additional_data')
connection = op.get_bind()
inspector = inspect(connection)
# Check equipment_connection_logs table
if 'equipment_connection_logs' in inspector.get_table_names():
columns = [col['name'] for col in inspector.get_columns('equipment_connection_logs')]
if 'metadata' in columns and 'additional_data' not in columns:
op.execute('ALTER TABLE equipment_connection_logs RENAME COLUMN metadata TO additional_data')
# Check equipment_iot_alerts table
if 'equipment_iot_alerts' in inspector.get_table_names():
columns = [col['name'] for col in inspector.get_columns('equipment_iot_alerts')]
if 'metadata' in columns and 'additional_data' not in columns:
op.execute('ALTER TABLE equipment_iot_alerts RENAME COLUMN metadata TO additional_data')
def downgrade():
"""Revert column names back to metadata"""
# Revert metadata column in equipment_iot_alerts
op.execute('ALTER TABLE equipment_iot_alerts RENAME COLUMN additional_data TO metadata')
# Check if columns need to be renamed back
from sqlalchemy import inspect
from alembic import op
# Revert metadata column in equipment_connection_logs
op.execute('ALTER TABLE equipment_connection_logs RENAME COLUMN additional_data TO metadata')
connection = op.get_bind()
inspector = inspect(connection)
# Check equipment_iot_alerts table
if 'equipment_iot_alerts' in inspector.get_table_names():
columns = [col['name'] for col in inspector.get_columns('equipment_iot_alerts')]
if 'additional_data' in columns and 'metadata' not in columns:
op.execute('ALTER TABLE equipment_iot_alerts RENAME COLUMN additional_data TO metadata')
# Check equipment_connection_logs table
if 'equipment_connection_logs' in inspector.get_table_names():
columns = [col['name'] for col in inspector.get_columns('equipment_connection_logs')]
if 'additional_data' in columns and 'metadata' not in columns:
op.execute('ALTER TABLE equipment_connection_logs RENAME COLUMN additional_data TO metadata')

View File

@@ -170,13 +170,49 @@ class EnhancedTenantService:
await publish_tenant_created(str(tenant.id), owner_id, bakery_data.name)
except Exception as e:
logger.warning("Failed to publish tenant created event", error=str(e))
# Automatically create location-context with city information
# This is non-blocking - failure won't prevent tenant creation
try:
from shared.clients.external_client import ExternalServiceClient
from shared.utils.city_normalization import normalize_city_id
from app.core.config import settings
external_client = ExternalServiceClient(settings, "tenant-service")
city_id = normalize_city_id(bakery_data.city)
if city_id:
await external_client.create_tenant_location_context(
tenant_id=str(tenant.id),
city_id=city_id,
notes="Auto-created during tenant registration"
)
logger.info(
"Automatically created location-context",
tenant_id=str(tenant.id),
city_id=city_id
)
else:
logger.warning(
"Could not normalize city for location-context",
tenant_id=str(tenant.id),
city=bakery_data.city
)
except Exception as e:
logger.warning(
"Failed to auto-create location-context (non-blocking)",
tenant_id=str(tenant.id),
city=bakery_data.city,
error=str(e)
)
# Don't fail tenant creation if location-context creation fails
logger.info("Bakery created successfully",
tenant_id=tenant.id,
name=bakery_data.name,
owner_id=owner_id,
subdomain=tenant.subdomain)
return TenantResponse.from_orm(tenant)
except (ValidationError, DuplicateRecordError) as e:

View File

@@ -11,7 +11,7 @@ from sqlalchemy import text
from app.core.database import get_db
from app.schemas.training import TrainedModelResponse, ModelMetricsResponse
from app.services.training_service import EnhancedTrainingService
from datetime import datetime
from datetime import datetime, timezone
from sqlalchemy import select, delete, func
import uuid
import shutil
@@ -79,13 +79,13 @@ async def get_active_model(
# ✅ FIX: Wrap update query with text() too
update_query = text("""
UPDATE trained_models
SET last_used_at = :now
UPDATE trained_models
SET last_used_at = :now
WHERE id = :model_id
""")
await db.execute(update_query, {
"now": datetime.utcnow(),
"now": datetime.now(timezone.utc),
"model_id": model_record.id
})
await db.commit()
@@ -300,7 +300,7 @@ async def delete_tenant_models_complete(
deletion_stats = {
"tenant_id": tenant_id,
"deleted_at": datetime.utcnow().isoformat(),
"deleted_at": datetime.now(timezone.utc).isoformat(),
"jobs_cancelled": 0,
"models_deleted": 0,
"artifacts_deleted": 0,
@@ -322,7 +322,7 @@ async def delete_tenant_models_complete(
for job in active_jobs:
job.status = "cancelled"
job.updated_at = datetime.utcnow()
job.updated_at = datetime.now(timezone.utc)
deletion_stats["jobs_cancelled"] += 1
if active_jobs:

View File

@@ -17,7 +17,7 @@ from shared.database.base import create_database_manager
from shared.database.transactions import transactional
from shared.database.exceptions import DatabaseError
from app.core.config import settings
from app.ml.enhanced_features import AdvancedFeatureEngineer
from shared.ml.enhanced_features import AdvancedFeatureEngineer
import holidays
logger = structlog.get_logger()

View File

@@ -7,6 +7,7 @@ import pandas as pd
import numpy as np
from typing import Dict, List, Optional
import structlog
from shared.ml.feature_calculator import HistoricalFeatureCalculator
logger = structlog.get_logger()
@@ -19,10 +20,12 @@ class AdvancedFeatureEngineer:
def __init__(self):
self.feature_columns = []
self.feature_calculator = HistoricalFeatureCalculator()
def add_lagged_features(self, df: pd.DataFrame, lag_days: List[int] = None) -> pd.DataFrame:
"""
Add lagged demand features for capturing recent trends.
Uses shared feature calculator for consistency with prediction service.
Args:
df: DataFrame with 'quantity' column
@@ -34,14 +37,20 @@ class AdvancedFeatureEngineer:
if lag_days is None:
lag_days = [1, 7, 14]
df = df.copy()
# Use shared calculator for consistent lag calculation
df = self.feature_calculator.calculate_lag_features(
df,
lag_days=lag_days,
mode='training'
)
# Update feature columns list
for lag in lag_days:
col_name = f'lag_{lag}_day'
df[col_name] = df['quantity'].shift(lag)
self.feature_columns.append(col_name)
if col_name not in self.feature_columns:
self.feature_columns.append(col_name)
logger.info(f"Added {len(lag_days)} lagged features", lags=lag_days)
logger.info(f"Added {len(lag_days)} lagged features (using shared calculator)", lags=lag_days)
return df
def add_rolling_features(
@@ -52,6 +61,7 @@ class AdvancedFeatureEngineer:
) -> pd.DataFrame:
"""
Add rolling statistics (mean, std, max, min).
Uses shared feature calculator for consistency with prediction service.
Args:
df: DataFrame with 'quantity' column
@@ -67,24 +77,22 @@ class AdvancedFeatureEngineer:
if features is None:
features = ['mean', 'std', 'max', 'min']
df = df.copy()
# Use shared calculator for consistent rolling calculation
df = self.feature_calculator.calculate_rolling_features(
df,
windows=windows,
statistics=features,
mode='training'
)
# Update feature columns list
for window in windows:
for feature in features:
col_name = f'rolling_{feature}_{window}d'
if col_name not in self.feature_columns:
self.feature_columns.append(col_name)
if feature == 'mean':
df[col_name] = df['quantity'].rolling(window=window, min_periods=max(1, window // 2)).mean()
elif feature == 'std':
df[col_name] = df['quantity'].rolling(window=window, min_periods=max(1, window // 2)).std()
elif feature == 'max':
df[col_name] = df['quantity'].rolling(window=window, min_periods=max(1, window // 2)).max()
elif feature == 'min':
df[col_name] = df['quantity'].rolling(window=window, min_periods=max(1, window // 2)).min()
self.feature_columns.append(col_name)
logger.info(f"Added rolling features", windows=windows, features=features)
logger.info(f"Added rolling features (using shared calculator)", windows=windows, features=features)
return df
def add_day_of_week_features(self, df: pd.DataFrame, date_column: str = 'date') -> pd.DataFrame:
@@ -203,6 +211,7 @@ class AdvancedFeatureEngineer:
def add_trend_features(self, df: pd.DataFrame, date_column: str = 'date') -> pd.DataFrame:
"""
Add trend-based features.
Uses shared feature calculator for consistency with prediction service.
Args:
df: DataFrame with date and quantity
@@ -211,27 +220,18 @@ class AdvancedFeatureEngineer:
Returns:
DataFrame with trend features
"""
df = df.copy()
# Use shared calculator for consistent trend calculation
df = self.feature_calculator.calculate_trend_features(
df,
mode='training'
)
# Days since start (linear trend proxy)
df['days_since_start'] = (df[date_column] - df[date_column].min()).dt.days
# Momentum indicators (recent change vs. older change)
if 'lag_1_day' in df.columns and 'lag_7_day' in df.columns:
df['momentum_1_7'] = df['lag_1_day'] - df['lag_7_day']
self.feature_columns.append('momentum_1_7')
if 'rolling_mean_7d' in df.columns and 'rolling_mean_30d' in df.columns:
df['trend_7_30'] = df['rolling_mean_7d'] - df['rolling_mean_30d']
self.feature_columns.append('trend_7_30')
# Velocity (rate of change)
if 'lag_1_day' in df.columns and 'lag_7_day' in df.columns:
df['velocity_week'] = (df['lag_1_day'] - df['lag_7_day']) / 7
self.feature_columns.append('velocity_week')
self.feature_columns.append('days_since_start')
# Update feature columns list
for feature_name in ['days_since_start', 'momentum_1_7', 'trend_7_30', 'velocity_week']:
if feature_name in df.columns and feature_name not in self.feature_columns:
self.feature_columns.append(feature_name)
logger.debug("Added trend features (using shared calculator)")
return df
def add_cyclical_encoding(self, df: pd.DataFrame) -> pd.DataFrame:

View File

@@ -7,7 +7,7 @@ import pandas as pd
import numpy as np
from typing import Dict, List, Any, Optional, Tuple
import structlog
from datetime import datetime
from datetime import datetime, timezone
import joblib
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import TimeSeriesSplit
@@ -408,7 +408,7 @@ class HybridProphetXGBoost:
},
'tenant_id': tenant_id,
'inventory_product_id': inventory_product_id,
'trained_at': datetime.utcnow().isoformat()
'trained_at': datetime.now(timezone.utc).isoformat()
}
async def predict(

View File

@@ -844,6 +844,9 @@ class EnhancedBakeryMLTrainer:
# Extract training period from the processed data
training_start_date = None
training_end_date = None
data_freshness_days = None
data_coverage_days = None
if 'ds' in processed_data.columns and not processed_data.empty:
# Ensure ds column is datetime64 before extracting dates (prevents object dtype issues)
ds_datetime = pd.to_datetime(processed_data['ds'])
@@ -857,6 +860,15 @@ class EnhancedBakeryMLTrainer:
training_start_date = pd.Timestamp(min_ts).to_pydatetime().replace(tzinfo=None)
if pd.notna(max_ts):
training_end_date = pd.Timestamp(max_ts).to_pydatetime().replace(tzinfo=None)
# Calculate data freshness metrics
if training_end_date:
from datetime import datetime
data_freshness_days = (datetime.now() - training_end_date).days
# Calculate data coverage period
if training_start_date and training_end_date:
data_coverage_days = (training_end_date - training_start_date).days
# Ensure features are clean string list
try:
@@ -864,6 +876,13 @@ class EnhancedBakeryMLTrainer:
except Exception:
features_used = []
# Prepare hyperparameters with data freshness metrics
hyperparameters = model_info.get("hyperparameters", {})
if data_freshness_days is not None:
hyperparameters["data_freshness_days"] = data_freshness_days
if data_coverage_days is not None:
hyperparameters["data_coverage_days"] = data_coverage_days
model_data = {
"tenant_id": tenant_id,
"inventory_product_id": inventory_product_id,
@@ -876,7 +895,7 @@ class EnhancedBakeryMLTrainer:
"rmse": float(model_info.get("training_metrics", {}).get("rmse", 0)) if model_info.get("training_metrics", {}).get("rmse") is not None else 0,
"r2_score": float(model_info.get("training_metrics", {}).get("r2", 0)) if model_info.get("training_metrics", {}).get("r2") is not None else 0,
"training_samples": int(len(processed_data)),
"hyperparameters": self._serialize_scalers(model_info.get("hyperparameters", {})),
"hyperparameters": self._serialize_scalers(hyperparameters),
"features_used": [str(f) for f in features_used] if features_used else [],
"normalization_params": self._serialize_scalers(self.enhanced_data_processor.get_scalers()) or {}, # Include scalers for prediction consistency
"product_category": model_info.get("product_category", "unknown"), # Store product category
@@ -890,7 +909,9 @@ class EnhancedBakeryMLTrainer:
model_record = await repos['model'].create_model(model_data)
logger.info("Created enhanced model record",
inventory_product_id=inventory_product_id,
model_id=model_record.id)
model_id=model_record.id,
data_freshness_days=data_freshness_days,
data_coverage_days=data_coverage_days)
# Create artifacts for model files
if model_info.get("model_path"):

View File

@@ -6,7 +6,7 @@ Service-specific repository base class with training service utilities
from typing import Optional, List, Dict, Any, Type
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import text
from datetime import datetime, timedelta
from datetime import datetime, timezone, timedelta
import structlog
from shared.database.repository import BaseRepository
@@ -73,7 +73,7 @@ class TrainingBaseRepository(BaseRepository):
async def cleanup_old_records(self, days_old: int = 90, status_filter: str = None) -> int:
"""Clean up old training records"""
try:
cutoff_date = datetime.utcnow() - timedelta(days=days_old)
cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_old)
table_name = self.model.__tablename__
# Build query based on available fields

View File

@@ -6,7 +6,7 @@ Repository for trained model operations
from typing import Optional, List, Dict, Any
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select, and_, text, desc
from datetime import datetime, timedelta
from datetime import datetime, timezone, timedelta
import structlog
from .base import TrainingBaseRepository
@@ -144,7 +144,7 @@ class ModelRepository(TrainingBaseRepository):
# Promote this model
updated_model = await self.update(model_id, {
"is_production": True,
"last_used_at": datetime.utcnow()
"last_used_at": datetime.now(timezone.utc)
})
logger.info("Model promoted to production",
@@ -164,7 +164,7 @@ class ModelRepository(TrainingBaseRepository):
"""Update model last used timestamp"""
try:
return await self.update(model_id, {
"last_used_at": datetime.utcnow()
"last_used_at": datetime.now(timezone.utc)
})
except Exception as e:
logger.error("Failed to update model usage",
@@ -176,7 +176,7 @@ class ModelRepository(TrainingBaseRepository):
async def archive_old_models(self, tenant_id: str, days_old: int = 90) -> int:
"""Archive old non-production models"""
try:
cutoff_date = datetime.utcnow() - timedelta(days=days_old)
cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_old)
query = text("""
UPDATE trained_models
@@ -235,7 +235,7 @@ class ModelRepository(TrainingBaseRepository):
product_stats = {row.inventory_product_id: row.count for row in result.fetchall()}
# Recent activity (models created in last 30 days)
thirty_days_ago = datetime.utcnow() - timedelta(days=30)
thirty_days_ago = datetime.now(timezone.utc) - timedelta(days=30)
recent_models_query = text("""
SELECT COUNT(*) as count
FROM trained_models