Files
bakery-ia/services/forecasting/scripts/demo/seed_demo_forecasts.py
2025-11-27 15:52:40 +01:00

517 lines
19 KiB
Python
Executable File

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Demo Forecasting Seeding Script for Forecasting Service
Creates demand forecasts and prediction batches for demo template tenants
This script runs as a Kubernetes init job inside the forecasting-service container.
"""
import asyncio
import uuid
import sys
import os
import json
import random
from datetime import datetime, timezone, timedelta
from pathlib import Path
# Add app to path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy import select
import structlog
from app.models.forecasts import Forecast, PredictionBatch
# Add shared path for demo utilities
sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent))
from shared.utils.demo_dates import BASE_REFERENCE_DATE
# Configure logging
logger = structlog.get_logger()
# Fixed UUIDs identifying the two demo template tenants seeded by this script.
DEMO_TENANT_SAN_PABLO = uuid.UUID("a1b2c3d4-e5f6-47a8-b9c0-d1e2f3a4b5c6") # Individual bakery
DEMO_TENANT_LA_ESPIGA = uuid.UUID("b2c3d4e5-f6a7-48b9-c0d1-e2f3a4b5c6d7") # Central bakery
# Maps Python's weekday() index (0 = Monday) to the Spanish day names used as
# keys in the JSON config's "tendencia_semanal" (weekly trend) sections.
DAYS_OF_WEEK = {
    0: "lunes",
    1: "martes",
    2: "miercoles",
    3: "jueves",
    4: "viernes",
    5: "sabado",
    6: "domingo"
}
def load_forecasting_config():
    """Load the demo forecasting configuration shipped next to this script.

    Returns:
        dict: Parsed contents of ``previsiones_config_es.json``.

    Raises:
        FileNotFoundError: If the config file is not present beside the script.
    """
    config_file = Path(__file__).parent / "previsiones_config_es.json"
    if not config_file.exists():
        raise FileNotFoundError(f"Forecasting config file not found: {config_file}")
    return json.loads(config_file.read_text(encoding="utf-8"))
def calculate_datetime_from_offset(offset_days: int) -> datetime:
    """Return BASE_REFERENCE_DATE shifted by ``offset_days`` days.

    Negative offsets produce dates in the demo's past; positive offsets
    produce future dates.
    """
    shift = timedelta(days=offset_days)
    return BASE_REFERENCE_DATE + shift
def weighted_choice(choices: list) -> dict:
    """Pick one entry from ``choices`` at random, weighted by its 'peso' key.

    Entries missing 'peso' get a default weight of 1.0. The last entry is
    returned as a fallback if floating-point accumulation never crosses the
    drawn threshold.
    """
    weights = [entry.get("peso", 1.0) for entry in choices]
    threshold = random.uniform(0, sum(weights))
    running_total = 0
    for entry, weight in zip(choices, weights):
        running_total += weight
        if threshold <= running_total:
            return entry
    return choices[-1]
def calculate_demand(
    product: dict,
    day_of_week: int,
    is_weekend: bool,
    weather_temp: float,
    weather_precip: float,
    traffic_volume: int,
    config: dict
) -> float:
    """Estimate one product's demand for one day from configured factors.

    Combines the product's base daily demand with its weekly trend factor,
    a seasonality bump, weather impact, and traffic correlation, then adds
    random noise within the product's variability band. Never negative.

    Note: ``is_weekend`` is accepted for interface compatibility but is not
    read here — the weekend effect is already encoded in the per-day
    weekly trend factors.
    """
    externals = config["configuracion_previsiones"]["factores_externos"]
    temp_impact = externals["temperatura"]["impacto_demanda"]
    precip_impact = externals["precipitacion"]["impacto_demanda"]

    # Per-day multiplier from the product's configured weekly trend.
    weekly_factor = product["tendencia_semanal"][DAYS_OF_WEEK[day_of_week]]

    # Simple growth bump for products marked as trending upward.
    seasonality_factor = 1.05 if product["estacionalidad"] == "creciente" else 1.0

    # Weather: warm days (above 22C) scale demand by the configured impact
    # per 10 degrees; any precipitation adds a flat configured impact.
    weather_factor = 1.0
    if weather_temp > 22.0:
        weather_factor += temp_impact * (weather_temp - 22.0) / 10.0
    if weather_precip > 0:
        weather_factor += precip_impact

    # Foot-traffic correlation, centered at a volume of 1000.
    traffic_correlation = externals["volumen_trafico"]["correlacion_demanda"]
    traffic_factor = 1.0 + (traffic_volume / 1000.0 - 1.0) * traffic_correlation

    estimate = product["demanda_base_diaria"] * weekly_factor * seasonality_factor * weather_factor * traffic_factor

    # Random noise within +/- the product's variability fraction.
    variability = product["variabilidad"]
    estimate = estimate * random.uniform(1.0 - variability, 1.0 + variability)
    return max(0.0, estimate)
async def generate_forecasts_for_tenant(
    db: AsyncSession,
    tenant_id: uuid.UUID,
    tenant_name: str,
    business_type: str,
    config: dict
) -> dict:
    """Generate demo prediction batches and demand forecasts for one tenant.

    Idempotent: if any Forecast row already exists for the tenant, seeding
    is skipped entirely. Otherwise creates, in order:
      1. A configured number of PredictionBatch rows dated in the past.
      2. Historical forecasts: one per demo product per past day.
      3. Future forecasts: one per demo product per upcoming day, with
         wider confidence intervals.

    Args:
        db: Open async session; committed by this function on success.
        tenant_id: Tenant to seed.
        tenant_name: Human-readable name, used only for logging.
        business_type: "individual_bakery" or "central_bakery"; selects the
            location and (for central bakeries) a demand multiplier.
        config: Parsed previsiones_config_es.json contents.

    Returns:
        Summary dict with created counts and a "skipped" flag.
    """
    logger.info(f"Generating forecasts for: {tenant_name}", tenant_id=str(tenant_id))
    # Idempotency guard: check if forecasts already exist for this tenant.
    result = await db.execute(
        select(Forecast).where(Forecast.tenant_id == tenant_id).limit(1)
    )
    existing = result.scalar_one_or_none()
    if existing:
        logger.info(f"Forecasts already exist for {tenant_name}, skipping seed")
        return {"tenant_id": str(tenant_id), "forecasts_created": 0, "batches_created": 0, "skipped": True}
    forecast_config = config["configuracion_previsiones"]
    batches_config = config["lotes_prediccion"]
    # Location label for this business type.
    location = forecast_config["ubicaciones"][business_type]
    # Central bakeries get their demand scaled up by a configured multiplier.
    multiplier = forecast_config["multiplicador_central_bakery"] if business_type == "central_bakery" else 1.0
    forecasts_created = 0
    batches_created = 0
    # --- Prediction batches -------------------------------------------------
    num_batches = batches_config["lotes_por_tenant"]
    for batch_idx in range(num_batches):
        # Pick a batch status from the configured weight distribution;
        # "completed" is the fallback if weights do not sum to >= the draw.
        status_rand = random.random()
        cumulative = 0
        batch_status = "completed"
        for status, weight in batches_config["distribucion_estados"].items():
            cumulative += weight
            if status_rand <= cumulative:
                batch_status = status
                break
        # Forecast horizon (days) for this batch.
        forecast_days = random.choice(batches_config["dias_prevision_lotes"])
        # Space batches 10 days apart going backwards in time.
        requested_offset = -(batch_idx + 1) * 10
        requested_at = calculate_datetime_from_offset(requested_offset)
        completed_at = None
        processing_time = None
        if batch_status == "completed":
            processing_time = random.randint(5000, 25000)  # 5-25 seconds
            completed_at = requested_at + timedelta(milliseconds=processing_time)
        batch = PredictionBatch(
            id=uuid.uuid4(),
            tenant_id=tenant_id,
            batch_name=f"Previsión {forecast_days} días - {requested_at.strftime('%Y%m%d')}",
            requested_at=requested_at,
            completed_at=completed_at,
            status=batch_status,
            total_products=forecast_config["productos_por_tenant"],
            completed_products=forecast_config["productos_por_tenant"] if batch_status == "completed" else 0,
            failed_products=0 if batch_status != "failed" else random.randint(1, 3),
            forecast_days=forecast_days,
            business_type=business_type,
            error_message="Error de conexión con servicio de clima" if batch_status == "failed" else None,
            processing_time_ms=processing_time
        )
        db.add(batch)
        batches_created += 1
    await db.flush()
    # --- Historical forecasts (past dias_historico days) --------------------
    dias_historico = forecast_config["dias_historico"]
    for product in forecast_config["productos_demo"]:
        product_id = uuid.UUID(product["id"])
        product_name = product["nombre"]
        for day_offset in range(-dias_historico, 0):
            forecast_date = calculate_datetime_from_offset(day_offset)
            day_of_week = forecast_date.weekday()
            is_weekend = day_of_week >= 5
            # Synthesize weather observations within configured bounds.
            weather_temp = random.uniform(
                forecast_config["factores_externos"]["temperatura"]["min"],
                forecast_config["factores_externos"]["temperatura"]["max"]
            )
            weather_precip = 0.0
            if random.random() < forecast_config["factores_externos"]["precipitacion"]["probabilidad_lluvia"]:
                weather_precip = random.uniform(0.5, forecast_config["factores_externos"]["precipitacion"]["mm_promedio"])
            weather_descriptions = ["Despejado", "Parcialmente nublado", "Nublado", "Lluvia ligera", "Lluvia"]
            weather_desc = random.choice(weather_descriptions)
            # Synthesize a street-traffic volume within configured bounds.
            traffic_volume = random.randint(
                forecast_config["factores_externos"]["volumen_trafico"]["min"],
                forecast_config["factores_externos"]["volumen_trafico"]["max"]
            )
            # Demand estimate from base demand plus external factors.
            predicted_demand = calculate_demand(
                product, day_of_week, is_weekend,
                weather_temp, weather_precip, traffic_volume, config
            )
            # Scale up for central bakeries.
            predicted_demand *= multiplier
            # Confidence band as configured percentages around the estimate.
            lower_pct = forecast_config["precision_modelo"]["intervalo_confianza_porcentaje"]["inferior"] / 100.0
            upper_pct = forecast_config["precision_modelo"]["intervalo_confianza_porcentaje"]["superior"] / 100.0
            confidence_lower = predicted_demand * (1.0 - lower_pct)
            confidence_upper = predicted_demand * (1.0 + upper_pct)
            # Pick a forecasting algorithm by configured weight.
            algorithm_choice = weighted_choice(forecast_config["algoritmos"])
            algorithm = algorithm_choice["algoritmo"]
            # Simulated model processing time.
            processing_time = random.randint(
                forecast_config["tiempo_procesamiento_ms"]["min"],
                forecast_config["tiempo_procesamiento_ms"]["max"]
            )
            # Fabricated model identity.
            model_version = f"v{random.randint(1, 3)}.{random.randint(0, 9)}"
            model_id = f"{algorithm}_{business_type}_{model_version}"
            forecast = Forecast(
                id=uuid.uuid4(),
                tenant_id=tenant_id,
                inventory_product_id=product_id,
                product_name=product_name,
                location=location,
                forecast_date=forecast_date,
                created_at=forecast_date - timedelta(days=1),  # Created day before
                predicted_demand=predicted_demand,
                confidence_lower=confidence_lower,
                confidence_upper=confidence_upper,
                confidence_level=forecast_config["nivel_confianza"],
                model_id=model_id,
                model_version=model_version,
                algorithm=algorithm,
                business_type=business_type,
                day_of_week=day_of_week,
                is_holiday=False,  # Could add holiday logic
                is_weekend=is_weekend,
                weather_temperature=weather_temp,
                weather_precipitation=weather_precip,
                weather_description=weather_desc,
                traffic_volume=traffic_volume,
                processing_time_ms=processing_time,
                features_used={
                    "day_of_week": True,
                    "weather": True,
                    "traffic": True,
                    "historical_demand": True,
                    "seasonality": True
                }
            )
            db.add(forecast)
            forecasts_created += 1
    # --- Future forecasts (next dias_prevision_futuro days) -----------------
    dias_futuro = forecast_config["dias_prevision_futuro"]
    for product in forecast_config["productos_demo"]:
        product_id = uuid.UUID(product["id"])
        product_name = product["nombre"]
        for day_offset in range(1, dias_futuro + 1):
            forecast_date = calculate_datetime_from_offset(day_offset)
            day_of_week = forecast_date.weekday()
            is_weekend = day_of_week >= 5
            # Generate weather forecast data (slightly less certain).
            weather_temp = random.uniform(
                forecast_config["factores_externos"]["temperatura"]["min"],
                forecast_config["factores_externos"]["temperatura"]["max"]
            )
            weather_precip = 0.0
            if random.random() < forecast_config["factores_externos"]["precipitacion"]["probabilidad_lluvia"]:
                weather_precip = random.uniform(0.5, forecast_config["factores_externos"]["precipitacion"]["mm_promedio"])
            weather_desc = random.choice(["Despejado", "Parcialmente nublado", "Nublado"])
            traffic_volume = random.randint(
                forecast_config["factores_externos"]["volumen_trafico"]["min"],
                forecast_config["factores_externos"]["volumen_trafico"]["max"]
            )
            # Demand estimate, scaled for central bakeries.
            predicted_demand = calculate_demand(
                product, day_of_week, is_weekend,
                weather_temp, weather_precip, traffic_volume, config
            )
            predicted_demand *= multiplier
            # Wider confidence intervals (+5 points) for future predictions.
            lower_pct = (forecast_config["precision_modelo"]["intervalo_confianza_porcentaje"]["inferior"] + 5.0) / 100.0
            upper_pct = (forecast_config["precision_modelo"]["intervalo_confianza_porcentaje"]["superior"] + 5.0) / 100.0
            confidence_lower = predicted_demand * (1.0 - lower_pct)
            confidence_upper = predicted_demand * (1.0 + upper_pct)
            algorithm_choice = weighted_choice(forecast_config["algoritmos"])
            algorithm = algorithm_choice["algoritmo"]
            processing_time = random.randint(
                forecast_config["tiempo_procesamiento_ms"]["min"],
                forecast_config["tiempo_procesamiento_ms"]["max"]
            )
            model_version = f"v{random.randint(1, 3)}.{random.randint(0, 9)}"
            model_id = f"{algorithm}_{business_type}_{model_version}"
            forecast = Forecast(
                id=uuid.uuid4(),
                tenant_id=tenant_id,
                inventory_product_id=product_id,
                product_name=product_name,
                location=location,
                forecast_date=forecast_date,
                created_at=BASE_REFERENCE_DATE,  # Created today
                predicted_demand=predicted_demand,
                confidence_lower=confidence_lower,
                confidence_upper=confidence_upper,
                confidence_level=forecast_config["nivel_confianza"],
                model_id=model_id,
                model_version=model_version,
                algorithm=algorithm,
                business_type=business_type,
                day_of_week=day_of_week,
                is_holiday=False,
                is_weekend=is_weekend,
                weather_temperature=weather_temp,
                weather_precipitation=weather_precip,
                weather_description=weather_desc,
                traffic_volume=traffic_volume,
                processing_time_ms=processing_time,
                features_used={
                    "day_of_week": True,
                    "weather": True,
                    "traffic": True,
                    "historical_demand": True,
                    "seasonality": True
                }
            )
            db.add(forecast)
            forecasts_created += 1
    await db.commit()
    logger.info(f"Successfully created {forecasts_created} forecasts and {batches_created} batches for {tenant_name}")
    return {
        "tenant_id": str(tenant_id),
        "forecasts_created": forecasts_created,
        "batches_created": batches_created,
        "skipped": False
    }
async def seed_all(db: AsyncSession):
    """Seed both demo template tenants with forecasting data.

    Seeds San Pablo (individual bakery) first, then La Espiga (central
    bakery), and returns a summary with per-tenant results plus totals.
    """
    logger.info("Starting demo forecasting seed process")
    config = load_forecasting_config()

    tenant_specs = [
        (DEMO_TENANT_SAN_PABLO, "San Pablo - Individual Bakery", "individual_bakery"),
        (DEMO_TENANT_LA_ESPIGA, "La Espiga - Central Bakery", "central_bakery"),
    ]
    results = []
    for tenant_id, tenant_name, business_type in tenant_specs:
        outcome = await generate_forecasts_for_tenant(
            db, tenant_id, tenant_name, business_type, config
        )
        results.append(outcome)

    return {
        "results": results,
        "total_forecasts_created": sum(r["forecasts_created"] for r in results),
        "total_batches_created": sum(r["batches_created"] for r in results),
        "status": "completed"
    }
def validate_base_reference_date():
    """Warn if the shared BASE_REFERENCE_DATE differs from the expected value.

    Intentional changes to the reference date are allowed, so a mismatch
    only logs a warning rather than failing the seed. The current date is
    logged either way.
    """
    expected_date = datetime(2025, 1, 8, 6, 0, 0, tzinfo=timezone.utc)
    if BASE_REFERENCE_DATE != expected_date:
        # Don't fail - just warn. Allow intentional changes.
        logger.warning(
            "BASE_REFERENCE_DATE has changed! This may cause date inconsistencies.",
            current=BASE_REFERENCE_DATE.isoformat(),
            expected=expected_date.isoformat()
        )
    logger.info("BASE_REFERENCE_DATE validation", date=BASE_REFERENCE_DATE.isoformat())
async def main():
    """Run the full seed: validate dates, connect, seed, print a summary.

    Returns:
        int: Process exit code — 0 on success, 1 on failure or when the
        FORECASTING_DATABASE_URL environment variable is missing.
    """
    validate_base_reference_date()

    database_url = os.getenv("FORECASTING_DATABASE_URL")
    if not database_url:
        logger.error("FORECASTING_DATABASE_URL environment variable must be set")
        return 1

    # SQLAlchemy's async engine needs the asyncpg driver in the URL scheme.
    if database_url.startswith("postgresql://"):
        database_url = database_url.replace("postgresql://", "postgresql+asyncpg://", 1)

    engine = create_async_engine(database_url, echo=False)
    session_factory = sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)
    try:
        async with session_factory() as session:
            summary = await seed_all(session)
            logger.info(
                "Forecasting seed completed successfully!",
                total_forecasts=summary["total_forecasts_created"],
                total_batches=summary["total_batches_created"],
                status=summary["status"]
            )
            # Human-readable summary for job logs.
            banner = "=" * 60
            print("\n" + banner)
            print("DEMO FORECASTING SEED SUMMARY")
            print(banner)
            for entry in summary["results"]:
                if entry.get("skipped", False):
                    status_text = "SKIPPED (already exists)"
                else:
                    status_text = f"CREATED {entry['forecasts_created']} forecasts, {entry['batches_created']} batches"
                print(f"Tenant {entry['tenant_id']}: {status_text}")
            print(f"\nTotal Forecasts: {summary['total_forecasts_created']}")
            print(f"Total Batches: {summary['total_batches_created']}")
            print(banner + "\n")
            return 0
    except Exception as e:
        logger.error(f"Forecasting seed failed: {str(e)}", exc_info=True)
        return 1
    finally:
        await engine.dispose()
if __name__ == "__main__":
    # Run the async entry point and propagate its exit code to the shell.
    sys.exit(asyncio.run(main()))