Files
bakery-ia/services/forecasting/app/services/historical_validation_service.py
2025-11-18 07:17:17 +01:00

481 lines
17 KiB
Python

# ================================================================
# services/forecasting/app/services/historical_validation_service.py
# ================================================================
"""
Historical Validation Service
Handles validation backfill when historical sales data is uploaded late.
Detects gaps in validation coverage and automatically triggers validation
for periods where forecasts exist but haven't been validated yet.
"""
from typing import Dict, Any, List, Optional
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select, and_, func, Date, or_
from datetime import datetime, timedelta, timezone, date
import structlog
import uuid
from app.models.forecasts import Forecast
from app.models.validation_run import ValidationRun
from app.models.sales_data_update import SalesDataUpdate
from app.services.validation_service import ValidationService
from shared.database.exceptions import DatabaseError
# Module-level structlog logger used by the service methods defined below.
logger = structlog.get_logger()
class HistoricalValidationService:
    """Service for backfilling historical validation when sales data arrives late.

    The service finds days that have forecasts but no completed validation
    run, and triggers ``ValidationService.validate_date_range`` for those
    periods. It also records sales data updates (``SalesDataUpdate`` rows)
    and links them to the validation run they triggered.
    """

    def __init__(self, db_session: "AsyncSession"):
        """Bind the service to a database session.

        Args:
            db_session: Async SQLAlchemy session used for every query and
                also handed to the underlying ``ValidationService``.
        """
        self.db = db_session
        self.validation_service = ValidationService(db_session)

    # ------------------------------------------------------------------
    # Pure helpers (no I/O) used by detect_validation_gaps
    # ------------------------------------------------------------------

    @staticmethod
    def _to_date(value: Any) -> Optional[date]:
        """Normalize a ``datetime``/``date``/``None`` value to a plain ``date``."""
        if value is None:
            return None
        # datetime is a subclass of date, so it must be checked first.
        if isinstance(value, datetime):
            return value.date()
        return value

    @staticmethod
    def _expand_validated_dates(
        runs: List[Any],
        window_start: date,
        window_end: date
    ) -> set:
        """Expand validation run ranges into the set of covered calendar days.

        Args:
            runs: Iterable of ``(run_start, run_end)`` pairs (dates or
                datetimes). A missing or inverted ``run_end`` is treated as
                a single-day run on ``run_start``.
            window_start: First day of the lookback window (inclusive).
            window_end: Last day of the lookback window (inclusive).

        Returns:
            Set of ``date`` objects covered by at least one run, clipped to
            the window so the expansion stays bounded.
        """
        covered = set()
        one_day = timedelta(days=1)
        for run_start, run_end in runs:
            first = HistoricalValidationService._to_date(run_start)
            if first is None:
                continue
            last = HistoricalValidationService._to_date(run_end)
            if last is None or last < first:
                last = first
            # Clip to the window; a run entirely outside yields no days.
            day = max(first, window_start)
            last = min(last, window_end)
            while day <= last:
                covered.add(day)
                day += one_day
        return covered

    @staticmethod
    def _group_consecutive_dates(gap_dates: List[date]) -> List[Dict[str, Any]]:
        """Group sorted dates into ranges of consecutive days.

        Args:
            gap_dates: Dates sorted in ascending order without duplicates.

        Returns:
            List of dicts with ``start_date``, ``end_date`` and
            ``days_count`` (inclusive day count), in ascending order.
        """
        gaps: List[Dict[str, Any]] = []
        if not gap_dates:
            return gaps

        def make_gap(first: date, last: date) -> Dict[str, Any]:
            return {
                "start_date": first,
                "end_date": last,
                "days_count": (last - first).days + 1
            }

        range_start = range_end = gap_dates[0]
        for current in gap_dates[1:]:
            if (current - range_end).days == 1:
                # Consecutive date, extend the current range
                range_end = current
            else:
                # Break in the sequence: close the range and start a new one
                gaps.append(make_gap(range_start, range_end))
                range_start = range_end = current
        # Don't forget the last open range
        gaps.append(make_gap(range_start, range_end))
        return gaps

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    async def detect_validation_gaps(
        self,
        tenant_id: uuid.UUID,
        lookback_days: int = 90
    ) -> List[Dict[str, Any]]:
        """
        Detect date ranges where forecasts exist but haven't been validated

        Every day covered by a completed validation run (from its start
        date through its end date) counts as validated, so multi-day
        backfill runs are not reported as gaps again.

        Args:
            tenant_id: Tenant identifier
            lookback_days: How far back to check (default 90 days)

        Returns:
            List of gap periods with date ranges

        Raises:
            DatabaseError: If any query or processing step fails
        """
        try:
            end_date = datetime.now(timezone.utc)
            start_date = end_date - timedelta(days=lookback_days)

            logger.info(
                "Detecting validation gaps",
                tenant_id=tenant_id,
                start_date=start_date.isoformat(),
                end_date=end_date.isoformat()
            )

            # Get all dates with forecasts
            forecast_day = func.cast(Forecast.forecast_date, Date)
            forecast_query = (
                select(forecast_day.label('forecast_date'))
                .where(
                    and_(
                        Forecast.tenant_id == tenant_id,
                        Forecast.forecast_date >= start_date,
                        Forecast.forecast_date <= end_date
                    )
                )
                .group_by(forecast_day)
                .order_by(forecast_day)
            )
            forecast_result = await self.db.execute(forecast_query)
            forecast_dates = {
                self._to_date(row.forecast_date)
                for row in forecast_result.fetchall()
            }

            if not forecast_dates:
                logger.info("No forecasts found in lookback period", tenant_id=tenant_id)
                return []

            # Get the date range of every completed validation run that
            # overlaps the lookback window. An overlap test is used (rather
            # than full containment) so that runs straddling a window edge,
            # e.g. a backfill ending today at 23:59:59, are still counted.
            validation_query = (
                select(
                    func.cast(ValidationRun.validation_start_date, Date).label('run_start'),
                    func.cast(ValidationRun.validation_end_date, Date).label('run_end')
                )
                .where(
                    and_(
                        ValidationRun.tenant_id == tenant_id,
                        ValidationRun.status == "completed",
                        ValidationRun.validation_end_date >= start_date,
                        ValidationRun.validation_start_date <= end_date
                    )
                )
                .distinct()
            )
            validation_result = await self.db.execute(validation_query)
            validated_dates = self._expand_validated_dates(
                validation_result.fetchall(),
                start_date.date(),
                end_date.date()
            )

            # Find gaps (dates with forecasts but no validation)
            gap_dates = sorted(forecast_dates - validated_dates)

            if not gap_dates:
                logger.info("No validation gaps found", tenant_id=tenant_id)
                return []

            # Group consecutive dates into ranges
            gaps = self._group_consecutive_dates(gap_dates)

            logger.info(
                "Validation gaps detected",
                tenant_id=tenant_id,
                gaps_count=len(gaps),
                total_days=len(gap_dates)
            )
            return gaps

        except Exception as e:
            logger.error(
                "Failed to detect validation gaps",
                tenant_id=tenant_id,
                error=str(e)
            )
            raise DatabaseError(f"Failed to detect validation gaps: {str(e)}") from e

    async def backfill_validation(
        self,
        tenant_id: uuid.UUID,
        start_date: date,
        end_date: date,
        triggered_by: str = "manual",
        sales_data_update_id: Optional[uuid.UUID] = None
    ) -> Dict[str, Any]:
        """
        Backfill validation for a historical date range

        Args:
            tenant_id: Tenant identifier
            start_date: Start date for backfill
            end_date: End date for backfill
            triggered_by: How this backfill was triggered
            sales_data_update_id: Optional link to sales data update record

        Returns:
            Backfill results with validation summary

        Raises:
            DatabaseError: If the validation run fails
        """
        try:
            logger.info(
                "Starting validation backfill",
                tenant_id=tenant_id,
                start_date=start_date.isoformat(),
                end_date=end_date.isoformat(),
                triggered_by=triggered_by
            )

            # Convert dates to UTC datetimes spanning the whole days
            start_datetime = datetime.combine(start_date, datetime.min.time()).replace(tzinfo=timezone.utc)
            end_datetime = datetime.combine(end_date, datetime.max.time()).replace(tzinfo=timezone.utc)

            # Run validation for the date range
            validation_result = await self.validation_service.validate_date_range(
                tenant_id=tenant_id,
                start_date=start_datetime,
                end_date=end_datetime,
                orchestration_run_id=None,
                triggered_by=triggered_by
            )

            # Update sales data update record if provided
            if sales_data_update_id:
                # The run id may be absent, a string, or already a UUID.
                raw_run_id = validation_result.get("validation_run_id")
                if raw_run_id is None or isinstance(raw_run_id, uuid.UUID):
                    run_id = raw_run_id
                else:
                    run_id = uuid.UUID(str(raw_run_id))

                await self._update_sales_data_record(
                    sales_data_update_id=sales_data_update_id,
                    validation_run_id=run_id,
                    status="completed" if validation_result.get("status") == "completed" else "failed"
                )

            logger.info(
                "Validation backfill completed",
                tenant_id=tenant_id,
                validation_run_id=validation_result.get("validation_run_id"),
                forecasts_evaluated=validation_result.get("forecasts_evaluated")
            )

            return {
                **validation_result,
                "backfill_date_range": {
                    "start": start_date.isoformat(),
                    "end": end_date.isoformat()
                }
            }

        except Exception as e:
            logger.error(
                "Validation backfill failed",
                tenant_id=tenant_id,
                start_date=start_date.isoformat(),
                end_date=end_date.isoformat(),
                error=str(e)
            )

            # Best effort: record the failure on the linked update record.
            if sales_data_update_id:
                await self._update_sales_data_record(
                    sales_data_update_id=sales_data_update_id,
                    validation_run_id=None,
                    status="failed",
                    error_message=str(e)
                )

            raise DatabaseError(f"Validation backfill failed: {str(e)}") from e

    async def auto_backfill_gaps(
        self,
        tenant_id: uuid.UUID,
        lookback_days: int = 90,
        max_gaps_to_process: int = 10
    ) -> Dict[str, Any]:
        """
        Automatically detect and backfill validation gaps

        Args:
            tenant_id: Tenant identifier
            lookback_days: How far back to check
            max_gaps_to_process: Maximum number of gaps to process in one run

        Returns:
            Summary of backfill operations. The same keys are present
            whether or not any gap was found.

        Raises:
            DatabaseError: If gap detection fails. A failure while
                backfilling a single gap is recorded in ``results`` and
                does not abort the remaining gaps.
        """
        try:
            logger.info(
                "Starting auto backfill",
                tenant_id=tenant_id,
                lookback_days=lookback_days
            )

            # Detect gaps
            gaps = await self.detect_validation_gaps(tenant_id, lookback_days)

            if not gaps:
                return {
                    "gaps_found": 0,
                    "gaps_processed": 0,
                    "validations_completed": 0,
                    "validations_failed": 0,
                    "results": [],
                    "message": "No validation gaps found"
                }

            # Limit number of gaps to process (oldest gaps first)
            gaps_to_process = gaps[:max_gaps_to_process]

            results = []
            for gap in gaps_to_process:
                try:
                    result = await self.backfill_validation(
                        tenant_id=tenant_id,
                        start_date=gap["start_date"],
                        end_date=gap["end_date"],
                        triggered_by="auto_backfill"
                    )
                    results.append({
                        "gap": gap,
                        "result": result,
                        "status": "success"
                    })
                except Exception as e:
                    # One failing gap must not stop the remaining ones.
                    logger.error(
                        "Failed to backfill gap",
                        gap=gap,
                        error=str(e)
                    )
                    results.append({
                        "gap": gap,
                        "error": str(e),
                        "status": "failed"
                    })

            successful = sum(1 for r in results if r["status"] == "success")

            logger.info(
                "Auto backfill completed",
                tenant_id=tenant_id,
                gaps_found=len(gaps),
                gaps_processed=len(results),
                successful=successful
            )

            return {
                "gaps_found": len(gaps),
                "gaps_processed": len(results),
                "validations_completed": successful,
                "validations_failed": len(results) - successful,
                "results": results
            }

        except Exception as e:
            logger.error(
                "Auto backfill failed",
                tenant_id=tenant_id,
                error=str(e)
            )
            raise DatabaseError(f"Auto backfill failed: {str(e)}") from e

    async def register_sales_data_update(
        self,
        tenant_id: uuid.UUID,
        start_date: date,
        end_date: date,
        records_affected: int,
        update_source: str = "import",
        import_job_id: Optional[str] = None,
        auto_trigger_validation: bool = True
    ) -> Dict[str, Any]:
        """
        Register a sales data update and optionally trigger validation

        Args:
            tenant_id: Tenant identifier
            start_date: Start date of updated data
            end_date: End date of updated data
            records_affected: Number of sales records affected
            update_source: Source of update (import, manual, pos_sync)
            import_job_id: Optional import job ID
            auto_trigger_validation: Whether to automatically trigger validation

        Returns:
            Update record and validation result if triggered. The
            ``update_record`` entry is a snapshot taken right after the
            record is created, before validation runs.

        Raises:
            DatabaseError: If the update record cannot be persisted
        """
        try:
            # Create sales data update record
            update_record = SalesDataUpdate(
                tenant_id=tenant_id,
                update_date_start=start_date,
                update_date_end=end_date,
                records_affected=records_affected,
                update_source=update_source,
                import_job_id=import_job_id,
                requires_validation=auto_trigger_validation,
                validation_status="pending" if auto_trigger_validation else "not_required"
            )

            self.db.add(update_record)
            await self.db.flush()

            # Capture the id once so later logging does not touch the ORM
            # instance again after intermediate commits.
            update_id = update_record.id

            logger.info(
                "Registered sales data update",
                tenant_id=tenant_id,
                update_id=update_id,
                date_range=f"{start_date} to {end_date}",
                records_affected=records_affected
            )

            result = {
                "update_id": str(update_id),
                "update_record": update_record.to_dict(),
                "validation_triggered": False
            }

            # Trigger validation if requested
            if auto_trigger_validation:
                try:
                    validation_result = await self.backfill_validation(
                        tenant_id=tenant_id,
                        start_date=start_date,
                        end_date=end_date,
                        triggered_by="sales_data_update",
                        sales_data_update_id=update_id
                    )

                    result["validation_triggered"] = True
                    result["validation_result"] = validation_result

                    logger.info(
                        "Validation triggered for sales data update",
                        update_id=update_id,
                        validation_run_id=validation_result.get("validation_run_id")
                    )

                except Exception as e:
                    # A validation failure does not fail the registration;
                    # it is recorded on the update record instead.
                    logger.error(
                        "Failed to trigger validation for sales data update",
                        update_id=update_id,
                        error=str(e)
                    )
                    update_record.validation_status = "failed"
                    update_record.validation_error = str(e)[:500]

            await self.db.commit()

            return result

        except Exception as e:
            logger.error(
                "Failed to register sales data update",
                tenant_id=tenant_id,
                error=str(e)
            )
            await self.db.rollback()
            raise DatabaseError(f"Failed to register sales data update: {str(e)}") from e

    async def _update_sales_data_record(
        self,
        sales_data_update_id: uuid.UUID,
        validation_run_id: Optional[uuid.UUID],
        status: str,
        error_message: Optional[str] = None
    ):
        """Update sales data update record with validation results

        Best effort: any failure is logged and swallowed so that it never
        masks the outcome of the validation itself.

        Args:
            sales_data_update_id: Id of the ``SalesDataUpdate`` row to update
            validation_run_id: Linked validation run, or None on failure
            status: New ``validation_status`` value
            error_message: Optional error text, truncated to 500 characters
        """
        try:
            query = select(SalesDataUpdate).where(SalesDataUpdate.id == sales_data_update_id)
            result = await self.db.execute(query)
            update_record = result.scalar_one_or_none()

            if update_record is None:
                logger.warning(
                    "Sales data update record not found",
                    sales_data_update_id=sales_data_update_id
                )
                return

            update_record.validation_status = status
            update_record.validation_run_id = validation_run_id
            update_record.validated_at = datetime.now(timezone.utc)
            if error_message:
                update_record.validation_error = error_message[:500]

            await self.db.commit()

        except Exception as e:
            logger.error(
                "Failed to update sales data record",
                sales_data_update_id=sales_data_update_id,
                error=str(e)
            )

    async def get_pending_validations(
        self,
        tenant_id: uuid.UUID,
        limit: int = 50
    ) -> "List[SalesDataUpdate]":
        """Get pending sales data updates that need validation

        Args:
            tenant_id: Tenant identifier
            limit: Maximum number of records to return

        Returns:
            Pending update records ordered by creation time (oldest first)

        Raises:
            DatabaseError: If the query fails
        """
        try:
            query = (
                select(SalesDataUpdate)
                .where(
                    and_(
                        SalesDataUpdate.tenant_id == tenant_id,
                        SalesDataUpdate.validation_status == "pending",
                        # SQLAlchemy column comparison, not a Python bool test
                        SalesDataUpdate.requires_validation == True  # noqa: E712
                    )
                )
                .order_by(SalesDataUpdate.created_at)
                .limit(limit)
            )

            result = await self.db.execute(query)
            return list(result.scalars().all())

        except Exception as e:
            logger.error(
                "Failed to get pending validations",
                tenant_id=tenant_id,
                error=str(e)
            )
            raise DatabaseError(f"Failed to get pending validations: {str(e)}") from e