409 lines
14 KiB
Python
409 lines
14 KiB
Python
# services/external/app/ingestion/ingestion_manager.py
|
|
"""
|
|
Data Ingestion Manager - Coordinates multi-city data collection
|
|
"""
|
|
|
|
from typing import List, Dict, Any
|
|
from datetime import datetime, timedelta
|
|
import structlog
|
|
import asyncio
|
|
|
|
from app.registry.city_registry import CityRegistry
|
|
from app.registry.calendar_registry import CalendarRegistry
|
|
from .adapters import get_adapter
|
|
from app.repositories.city_data_repository import CityDataRepository
|
|
from app.repositories.calendar_repository import CalendarRepository
|
|
from app.core.database import database_manager
|
|
|
|
logger = structlog.get_logger()
|
|
|
|
|
|
class DataIngestionManager:
    """Orchestrates data ingestion across all cities"""

    def __init__(self):
        # City metadata/config lookup (enabled cities, adapter configs).
        self.registry = CityRegistry()
        # Shared module-level async DB manager; kept as an attribute so
        # methods can open sessions via self.database_manager.get_session().
        self.database_manager = database_manager
|
|
|
|
async def initialize_all_cities(self, months: int = 24):
|
|
"""
|
|
Initialize historical data for all enabled cities
|
|
Called by Kubernetes Init Job
|
|
"""
|
|
enabled_cities = self.registry.get_enabled_cities()
|
|
|
|
logger.info(
|
|
"Starting full data initialization",
|
|
cities=len(enabled_cities),
|
|
months=months
|
|
)
|
|
|
|
end_date = datetime.now()
|
|
start_date = end_date - timedelta(days=months * 30)
|
|
|
|
tasks = [
|
|
self.initialize_city(city.city_id, start_date, end_date)
|
|
for city in enabled_cities
|
|
]
|
|
|
|
results = await asyncio.gather(*tasks, return_exceptions=True)
|
|
|
|
successes = sum(1 for r in results if r is True)
|
|
failures = len(results) - successes
|
|
|
|
logger.info(
|
|
"Data initialization complete",
|
|
total=len(results),
|
|
successes=successes,
|
|
failures=failures
|
|
)
|
|
|
|
# Consider success if we have at least some cities initialized (majority success)
|
|
# This allows the system to continue even if some external APIs are temporarily unavailable
|
|
if successes > 0:
|
|
logger.info(
|
|
"Partial success achieved - continuing with available data",
|
|
success_ratio=f"{successes}/{len(results)}"
|
|
)
|
|
return True
|
|
else:
|
|
logger.error("All city initializations failed - system cannot proceed")
|
|
return False
|
|
|
|
async def initialize_city(
|
|
self,
|
|
city_id: str,
|
|
start_date: datetime,
|
|
end_date: datetime
|
|
) -> bool:
|
|
"""Initialize historical data for a single city (idempotent)"""
|
|
try:
|
|
city = self.registry.get_city(city_id)
|
|
if not city:
|
|
logger.error("City not found", city_id=city_id)
|
|
return False
|
|
|
|
logger.info(
|
|
"Initializing city data",
|
|
city=city.name,
|
|
start=start_date.date(),
|
|
end=end_date.date()
|
|
)
|
|
|
|
# Check if data already exists (idempotency)
|
|
async with self.database_manager.get_session() as session:
|
|
repo = CityDataRepository(session)
|
|
coverage = await repo.get_data_coverage(city_id, start_date, end_date)
|
|
|
|
days_in_range = (end_date - start_date).days
|
|
expected_records = days_in_range # One record per day minimum
|
|
|
|
# If we have >= 90% coverage, skip initialization
|
|
threshold = expected_records * 0.9
|
|
weather_sufficient = coverage['weather'] >= threshold
|
|
traffic_sufficient = coverage['traffic'] >= threshold
|
|
|
|
if weather_sufficient and traffic_sufficient:
|
|
logger.info(
|
|
"City data already initialized, skipping",
|
|
city=city.name,
|
|
weather_records=coverage['weather'],
|
|
traffic_records=coverage['traffic'],
|
|
threshold=int(threshold)
|
|
)
|
|
return True
|
|
|
|
logger.info(
|
|
"Insufficient data coverage, proceeding with initialization",
|
|
city=city.name,
|
|
existing_weather=coverage['weather'],
|
|
existing_traffic=coverage['traffic'],
|
|
expected=expected_records
|
|
)
|
|
|
|
adapter = get_adapter(
|
|
city_id,
|
|
{
|
|
"weather_config": city.weather_config,
|
|
"traffic_config": city.traffic_config
|
|
}
|
|
)
|
|
|
|
if not await adapter.validate_connection():
|
|
logger.error("Adapter validation failed", city=city.name)
|
|
return False
|
|
|
|
# Fetch data with error handling to allow partial success
|
|
weather_data = []
|
|
traffic_data = []
|
|
|
|
# Fetch weather data
|
|
try:
|
|
weather_data = await adapter.fetch_historical_weather(
|
|
start_date, end_date
|
|
)
|
|
logger.info("Weather data fetched successfully",
|
|
records=len(weather_data), city=city.name)
|
|
except Exception as weather_error:
|
|
logger.error("Failed to fetch weather data",
|
|
city=city.name, error=str(weather_error))
|
|
# Don't return False here - continue with whatever data we can get
|
|
|
|
# Fetch traffic data
|
|
try:
|
|
traffic_data = await adapter.fetch_historical_traffic(
|
|
start_date, end_date
|
|
)
|
|
logger.info("Traffic data fetched successfully",
|
|
records=len(traffic_data), city=city.name)
|
|
except Exception as traffic_error:
|
|
logger.error("Failed to fetch traffic data",
|
|
city=city.name, error=str(traffic_error))
|
|
# Don't return False here - continue with weather data only if available
|
|
|
|
# Store available data (at least one type should be available for partial success)
|
|
async with self.database_manager.get_session() as session:
|
|
repo = CityDataRepository(session)
|
|
|
|
weather_stored = 0
|
|
traffic_stored = 0
|
|
|
|
if weather_data:
|
|
weather_stored = await repo.bulk_store_weather(
|
|
city_id, weather_data
|
|
)
|
|
|
|
if traffic_data:
|
|
traffic_stored = await repo.bulk_store_traffic(
|
|
city_id, traffic_data
|
|
)
|
|
|
|
# Only fail if both data types failed to fetch
|
|
if not weather_data and not traffic_data:
|
|
logger.error("Both weather and traffic data fetch failed", city=city.name)
|
|
return False
|
|
|
|
logger.info(
|
|
"City initialization complete",
|
|
city=city.name,
|
|
weather_records=weather_stored,
|
|
traffic_records=traffic_stored
|
|
)
|
|
|
|
return True
|
|
|
|
except Exception as e:
|
|
logger.error(
|
|
"City initialization failed",
|
|
city_id=city_id,
|
|
error=str(e)
|
|
)
|
|
return False
|
|
|
|
async def rotate_monthly_data(self):
|
|
"""
|
|
Rotate 24-month window: delete old, ingest new
|
|
Called by Kubernetes CronJob monthly
|
|
"""
|
|
enabled_cities = self.registry.get_enabled_cities()
|
|
|
|
logger.info("Starting monthly data rotation", cities=len(enabled_cities))
|
|
|
|
now = datetime.now()
|
|
cutoff_date = now - timedelta(days=24 * 30)
|
|
|
|
last_month_end = now.replace(day=1) - timedelta(days=1)
|
|
last_month_start = last_month_end.replace(day=1)
|
|
|
|
tasks = []
|
|
for city in enabled_cities:
|
|
tasks.append(
|
|
self._rotate_city_data(
|
|
city.city_id,
|
|
cutoff_date,
|
|
last_month_start,
|
|
last_month_end
|
|
)
|
|
)
|
|
|
|
results = await asyncio.gather(*tasks, return_exceptions=True)
|
|
|
|
successes = sum(1 for r in results if r is True)
|
|
logger.info(
|
|
"Monthly rotation complete",
|
|
total=len(results),
|
|
successes=successes
|
|
)
|
|
|
|
async def _rotate_city_data(
|
|
self,
|
|
city_id: str,
|
|
cutoff_date: datetime,
|
|
new_start: datetime,
|
|
new_end: datetime
|
|
) -> bool:
|
|
"""Rotate data for a single city"""
|
|
try:
|
|
city = self.registry.get_city(city_id)
|
|
if not city:
|
|
return False
|
|
|
|
logger.info(
|
|
"Rotating city data",
|
|
city=city.name,
|
|
cutoff=cutoff_date.date(),
|
|
new_month=new_start.strftime("%Y-%m")
|
|
)
|
|
|
|
async with self.database_manager.get_session() as session:
|
|
repo = CityDataRepository(session)
|
|
|
|
deleted_weather = await repo.delete_weather_before(
|
|
city_id, cutoff_date
|
|
)
|
|
deleted_traffic = await repo.delete_traffic_before(
|
|
city_id, cutoff_date
|
|
)
|
|
|
|
logger.info(
|
|
"Old data deleted",
|
|
city=city.name,
|
|
weather_deleted=deleted_weather,
|
|
traffic_deleted=deleted_traffic
|
|
)
|
|
|
|
adapter = get_adapter(city_id, {
|
|
"weather_config": city.weather_config,
|
|
"traffic_config": city.traffic_config
|
|
})
|
|
|
|
new_weather = await adapter.fetch_historical_weather(
|
|
new_start, new_end
|
|
)
|
|
new_traffic = await adapter.fetch_historical_traffic(
|
|
new_start, new_end
|
|
)
|
|
|
|
async with self.database_manager.get_session() as session:
|
|
repo = CityDataRepository(session)
|
|
|
|
weather_stored = await repo.bulk_store_weather(
|
|
city_id, new_weather
|
|
)
|
|
traffic_stored = await repo.bulk_store_traffic(
|
|
city_id, new_traffic
|
|
)
|
|
|
|
logger.info(
|
|
"New data ingested",
|
|
city=city.name,
|
|
weather_added=weather_stored,
|
|
traffic_added=traffic_stored
|
|
)
|
|
|
|
return True
|
|
|
|
except Exception as e:
|
|
logger.error(
|
|
"City rotation failed",
|
|
city_id=city_id,
|
|
error=str(e)
|
|
)
|
|
return False
|
|
|
|
async def seed_school_calendars(self) -> bool:
|
|
"""
|
|
Seed school calendars from CalendarRegistry into database
|
|
Called during initialization - idempotent
|
|
"""
|
|
try:
|
|
logger.info("Starting school calendar seeding...")
|
|
|
|
# Get all calendars from registry
|
|
calendars = CalendarRegistry.get_all_calendars()
|
|
logger.info(f"Found {len(calendars)} calendars in registry")
|
|
|
|
async with self.database_manager.get_session() as session:
|
|
repo = CalendarRepository(session)
|
|
|
|
seeded_count = 0
|
|
skipped_count = 0
|
|
|
|
for cal_def in calendars:
|
|
logger.info(
|
|
"Processing calendar",
|
|
calendar_id=cal_def.calendar_id,
|
|
city=cal_def.city_id,
|
|
type=cal_def.school_type.value,
|
|
year=cal_def.academic_year
|
|
)
|
|
|
|
# Check if calendar already exists (idempotency)
|
|
existing = await repo.get_calendar_by_city_type_year(
|
|
city_id=cal_def.city_id,
|
|
school_type=cal_def.school_type.value,
|
|
academic_year=cal_def.academic_year
|
|
)
|
|
|
|
if existing:
|
|
logger.info(
|
|
"Calendar already exists, skipping",
|
|
calendar_id=cal_def.calendar_id
|
|
)
|
|
skipped_count += 1
|
|
continue
|
|
|
|
# Convert holiday periods to dict format
|
|
holiday_periods = [
|
|
{
|
|
"name": hp.name,
|
|
"start_date": hp.start_date,
|
|
"end_date": hp.end_date,
|
|
"description": hp.description
|
|
}
|
|
for hp in cal_def.holiday_periods
|
|
]
|
|
|
|
# Convert school hours to dict format
|
|
school_hours = {
|
|
"morning_start": cal_def.school_hours.morning_start,
|
|
"morning_end": cal_def.school_hours.morning_end,
|
|
"has_afternoon_session": cal_def.school_hours.has_afternoon_session,
|
|
"afternoon_start": cal_def.school_hours.afternoon_start,
|
|
"afternoon_end": cal_def.school_hours.afternoon_end
|
|
}
|
|
|
|
# Create calendar in database
|
|
created_calendar = await repo.create_school_calendar(
|
|
city_id=cal_def.city_id,
|
|
calendar_name=cal_def.calendar_name,
|
|
school_type=cal_def.school_type.value,
|
|
academic_year=cal_def.academic_year,
|
|
holiday_periods=holiday_periods,
|
|
school_hours=school_hours,
|
|
source=cal_def.source,
|
|
enabled=cal_def.enabled
|
|
)
|
|
|
|
logger.info(
|
|
"Calendar seeded successfully",
|
|
calendar_id=str(created_calendar.id),
|
|
city=cal_def.city_id,
|
|
type=cal_def.school_type.value,
|
|
year=cal_def.academic_year
|
|
)
|
|
seeded_count += 1
|
|
|
|
logger.info(
|
|
"School calendar seeding completed",
|
|
seeded=seeded_count,
|
|
skipped=skipped_count,
|
|
total=len(calendars)
|
|
)
|
|
|
|
return True
|
|
|
|
except Exception as e:
|
|
logger.error("Error seeding school calendars", error=str(e))
|
|
return False
|