Initial commit - production deployment
This commit is contained in:
1
services/external/app/ingestion/__init__.py
vendored
Normal file
1
services/external/app/ingestion/__init__.py
vendored
Normal file
@@ -0,0 +1 @@
|
||||
"""Data ingestion module for multi-city external data"""
|
||||
20
services/external/app/ingestion/adapters/__init__.py
vendored
Normal file
20
services/external/app/ingestion/adapters/__init__.py
vendored
Normal file
@@ -0,0 +1,20 @@
|
||||
# services/external/app/ingestion/adapters/__init__.py
|
||||
"""
|
||||
Adapter registry - Maps city IDs to adapter implementations
|
||||
"""
|
||||
|
||||
from typing import Dict, Type
|
||||
from ..base_adapter import CityDataAdapter
|
||||
from .madrid_adapter import MadridAdapter
|
||||
|
||||
# Registry of all known city adapters, keyed by city identifier.
ADAPTER_REGISTRY: Dict[str, Type[CityDataAdapter]] = {
    "madrid": MadridAdapter,
}


def get_adapter(city_id: str, config: Dict) -> CityDataAdapter:
    """Instantiate the adapter registered for *city_id*.

    Args:
        city_id: Registry key identifying the city (e.g. ``"madrid"``).
        config: City-specific configuration passed to the adapter.

    Raises:
        ValueError: If no adapter is registered for the city.
    """
    try:
        adapter_cls = ADAPTER_REGISTRY[city_id]
    except KeyError:
        raise ValueError(f"No adapter registered for city: {city_id}") from None
    return adapter_cls(city_id, config)
|
||||
152
services/external/app/ingestion/adapters/madrid_adapter.py
vendored
Normal file
152
services/external/app/ingestion/adapters/madrid_adapter.py
vendored
Normal file
@@ -0,0 +1,152 @@
|
||||
# services/external/app/ingestion/adapters/madrid_adapter.py
|
||||
"""
|
||||
Madrid city data adapter - Uses existing AEMET and Madrid OpenData clients
|
||||
"""
|
||||
|
||||
from typing import List, Dict, Any
|
||||
from datetime import datetime
|
||||
import structlog
|
||||
|
||||
from ..base_adapter import CityDataAdapter
|
||||
from app.external.aemet import AEMETClient
|
||||
from app.external.apis.madrid_traffic_client import MadridTrafficClient
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
class MadridAdapter(CityDataAdapter):
    """Adapter for Madrid using AEMET + Madrid OpenData.

    Weather is fetched from the AEMET client and traffic from the Madrid
    OpenData traffic client, both at fixed city-centre coordinates.
    Fetch methods are best-effort: on any error they log and return ``[]``
    so callers can continue with partial data.
    """

    def __init__(self, city_id: str, config: Dict[str, Any]):
        super().__init__(city_id, config)
        self.aemet_client = AEMETClient()
        self.traffic_client = MadridTrafficClient()

        # Fixed reference coordinates used for all queries.
        # NOTE(review): presumably Madrid city centre — confirm both clients
        # expect (lat, lon) in this order.
        self.madrid_lat = 40.4168
        self.madrid_lon = -3.7038

    def _tag_records(self, records: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Annotate records in place with this adapter's city identity.

        Factored out of the fetch methods, which previously duplicated the
        same tagging loop. Returns the same list for convenience.
        """
        for record in records:
            record['city_id'] = self.city_id
            record['city_name'] = 'Madrid'
        return records

    async def fetch_historical_weather(
        self,
        start_date: datetime,
        end_date: datetime
    ) -> List[Dict[str, Any]]:
        """Fetch historical weather from AEMET.

        Returns:
            Weather record dicts tagged with ``city_id``/``city_name``,
            or ``[]`` on any error (best-effort contract).
        """
        try:
            logger.info(
                "Fetching Madrid historical weather",
                start=start_date.isoformat(),
                end=end_date.isoformat()
            )

            weather_data = await self.aemet_client.get_historical_weather(
                self.madrid_lat,
                self.madrid_lon,
                start_date,
                end_date
            )

            self._tag_records(weather_data)

            logger.info(
                "Madrid weather data fetched",
                records=len(weather_data)
            )

            return weather_data

        except Exception as e:
            # Best-effort: never propagate — callers treat [] as "no data".
            logger.error("Error fetching Madrid weather", error=str(e))
            return []

    async def fetch_historical_traffic(
        self,
        start_date: datetime,
        end_date: datetime
    ) -> List[Dict[str, Any]]:
        """Fetch historical traffic from Madrid OpenData.

        Returns:
            Traffic record dicts tagged with ``city_id``/``city_name``,
            or ``[]`` on any error (best-effort contract).
        """
        try:
            logger.info(
                "Fetching Madrid historical traffic",
                start=start_date.isoformat(),
                end=end_date.isoformat()
            )

            traffic_data = await self.traffic_client.get_historical_traffic(
                self.madrid_lat,
                self.madrid_lon,
                start_date,
                end_date
            )

            self._tag_records(traffic_data)

            logger.info(
                "Madrid traffic data fetched",
                records=len(traffic_data)
            )

            return traffic_data

        except Exception as e:
            # Best-effort: never propagate — callers treat [] as "no data".
            logger.error("Error fetching Madrid traffic", error=str(e))
            return []

    async def validate_connection(self) -> bool:
        """Validate connection to AEMET and Madrid OpenData

        Note: Validation is lenient - allows partial failures for temporary API issues.
        AEMET rate limits may cause weather validation to fail during initialization.
        Madrid traffic API outages should not block validation entirely.

        Returns:
            True if at least one of the two APIs is reachable, else False.
        """
        try:
            traffic_validation_passed = False
            weather_validation_passed = False

            # Try traffic API first
            try:
                test_traffic = await self.traffic_client.get_current_traffic(
                    self.madrid_lat,
                    self.madrid_lon
                )

                # Traffic counts as valid only if a non-empty payload came back.
                if test_traffic is not None and len(test_traffic) > 0:
                    traffic_validation_passed = True
                    logger.info("Traffic API validation successful")
                else:
                    logger.warning("Traffic API validation failed - temporary unavailability (proceeding anyway)")
            except Exception as traffic_error:
                logger.warning("Traffic API validation error (temporary unavailability) - proceeding anyway", error=str(traffic_error))

            # Try weather API
            try:
                test_weather = await self.aemet_client.get_current_weather(
                    self.madrid_lat,
                    self.madrid_lon
                )

                # Weather is lenient: any non-None response passes (rate limits
                # may return sparse payloads).
                if test_weather is not None:
                    weather_validation_passed = True
                    logger.info("Weather API validation successful")
                else:
                    logger.warning("Weather API validation failed (likely rate limited) - proceeding anyway")
            except Exception as weather_error:
                logger.warning("Weather API validation error - proceeding anyway", error=str(weather_error))

            # At least one validation should pass for basic connectivity
            if not traffic_validation_passed and not weather_validation_passed:
                logger.error("Both traffic and weather API validations failed - no connectivity")
                return False

            # Return success if at least one API is accessible
            logger.info("Adapter connection validation passed",
                        traffic_valid=traffic_validation_passed,
                        weather_valid=weather_validation_passed)
            return True

        except Exception as e:
            logger.error("Madrid adapter connection validation failed", error=str(e))
            return False
|
||||
43
services/external/app/ingestion/base_adapter.py
vendored
Normal file
43
services/external/app/ingestion/base_adapter.py
vendored
Normal file
@@ -0,0 +1,43 @@
|
||||
# services/external/app/ingestion/base_adapter.py
|
||||
"""
|
||||
Base adapter interface for city-specific data sources
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import List, Dict, Any
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
class CityDataAdapter(ABC):
    """Contract for city-specific data-source adapters.

    A concrete adapter wraps whatever weather/traffic APIs a given city
    exposes and yields lists of record dicts for the requested date range.
    """

    def __init__(self, city_id: str, config: Dict[str, Any]):
        # Identifier of the city this adapter serves.
        self.city_id = city_id
        # City-specific source configuration (endpoints, credentials, ...).
        self.config = config

    @abstractmethod
    async def fetch_historical_weather(
        self,
        start_date: datetime,
        end_date: datetime
    ) -> List[Dict[str, Any]]:
        """Fetch historical weather records covering [start_date, end_date]."""
        ...

    @abstractmethod
    async def fetch_historical_traffic(
        self,
        start_date: datetime,
        end_date: datetime
    ) -> List[Dict[str, Any]]:
        """Fetch historical traffic records covering [start_date, end_date]."""
        ...

    @abstractmethod
    async def validate_connection(self) -> bool:
        """Return True when the underlying data source is reachable."""
        ...

    def get_city_id(self) -> str:
        """Return the identifier of the city this adapter serves."""
        return self.city_id
|
||||
408
services/external/app/ingestion/ingestion_manager.py
vendored
Normal file
408
services/external/app/ingestion/ingestion_manager.py
vendored
Normal file
@@ -0,0 +1,408 @@
|
||||
# services/external/app/ingestion/ingestion_manager.py
|
||||
"""
|
||||
Data Ingestion Manager - Coordinates multi-city data collection
|
||||
"""
|
||||
|
||||
from typing import List, Dict, Any
|
||||
from datetime import datetime, timedelta
|
||||
import structlog
|
||||
import asyncio
|
||||
|
||||
from app.registry.city_registry import CityRegistry
|
||||
from app.registry.calendar_registry import CalendarRegistry
|
||||
from .adapters import get_adapter
|
||||
from app.repositories.city_data_repository import CityDataRepository
|
||||
from app.repositories.calendar_repository import CalendarRepository
|
||||
from app.core.database import database_manager
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
class DataIngestionManager:
    """Orchestrates data ingestion across all cities.

    Coordinates per-city adapters, the city/calendar registries and the
    repositories for three workflows: bulk historical initialization
    (Kubernetes init job), monthly rolling-window rotation (CronJob) and
    school-calendar seeding. All entry points are designed to be idempotent
    and tolerant of partial external-API failures.
    """

    def __init__(self):
        # Registry of configured cities and their adapter configs.
        self.registry = CityRegistry()
        # Shared module-level async database manager (session factory).
        self.database_manager = database_manager

    async def initialize_all_cities(self, months: int = 24):
        """
        Initialize historical data for all enabled cities
        Called by Kubernetes Init Job

        Args:
            months: Size of the historical window; approximated as
                months * 30 days (slightly short of calendar months).

        Returns:
            True if at least one city initialized successfully, else False.
        """
        enabled_cities = self.registry.get_enabled_cities()

        logger.info(
            "Starting full data initialization",
            cities=len(enabled_cities),
            months=months
        )

        # NOTE(review): datetime.now() is naive local time — confirm the
        # repositories/DB also store naive timestamps.
        end_date = datetime.now()
        start_date = end_date - timedelta(days=months * 30)

        # Initialize all cities concurrently; return_exceptions=True keeps
        # one city's failure from cancelling the others.
        tasks = [
            self.initialize_city(city.city_id, start_date, end_date)
            for city in enabled_cities
        ]

        results = await asyncio.gather(*tasks, return_exceptions=True)

        # Exceptions captured by gather count as failures (r is not True).
        successes = sum(1 for r in results if r is True)
        failures = len(results) - successes

        logger.info(
            "Data initialization complete",
            total=len(results),
            successes=successes,
            failures=failures
        )

        # Consider success if we have at least some cities initialized (majority success)
        # This allows the system to continue even if some external APIs are temporarily unavailable
        if successes > 0:
            logger.info(
                "Partial success achieved - continuing with available data",
                success_ratio=f"{successes}/{len(results)}"
            )
            return True
        else:
            logger.error("All city initializations failed - system cannot proceed")
            return False

    async def initialize_city(
        self,
        city_id: str,
        start_date: datetime,
        end_date: datetime
    ) -> bool:
        """Initialize historical data for a single city (idempotent).

        Skips work when existing coverage is already >= 90% of the expected
        record count for both data types. Partial success (only one of
        weather/traffic fetched) still returns True; only a total fetch
        failure — or an unexpected exception — returns False.
        """
        try:
            city = self.registry.get_city(city_id)
            if not city:
                logger.error("City not found", city_id=city_id)
                return False

            logger.info(
                "Initializing city data",
                city=city.name,
                start=start_date.date(),
                end=end_date.date()
            )

            # Check if data already exists (idempotency)
            async with self.database_manager.get_session() as session:
                repo = CityDataRepository(session)
                coverage = await repo.get_data_coverage(city_id, start_date, end_date)

            days_in_range = (end_date - start_date).days
            expected_records = days_in_range  # One record per day minimum

            # If we have >= 90% coverage, skip initialization
            threshold = expected_records * 0.9
            weather_sufficient = coverage['weather'] >= threshold
            traffic_sufficient = coverage['traffic'] >= threshold

            if weather_sufficient and traffic_sufficient:
                logger.info(
                    "City data already initialized, skipping",
                    city=city.name,
                    weather_records=coverage['weather'],
                    traffic_records=coverage['traffic'],
                    threshold=int(threshold)
                )
                return True

            logger.info(
                "Insufficient data coverage, proceeding with initialization",
                city=city.name,
                existing_weather=coverage['weather'],
                existing_traffic=coverage['traffic'],
                expected=expected_records
            )

            adapter = get_adapter(
                city_id,
                {
                    "weather_config": city.weather_config,
                    "traffic_config": city.traffic_config
                }
            )

            # Lenient connectivity probe: fails only when no source at all
            # is reachable (see MadridAdapter.validate_connection).
            if not await adapter.validate_connection():
                logger.error("Adapter validation failed", city=city.name)
                return False

            # Fetch data with error handling to allow partial success
            weather_data = []
            traffic_data = []

            # Fetch weather data
            try:
                weather_data = await adapter.fetch_historical_weather(
                    start_date, end_date
                )
                logger.info("Weather data fetched successfully",
                            records=len(weather_data), city=city.name)
            except Exception as weather_error:
                logger.error("Failed to fetch weather data",
                             city=city.name, error=str(weather_error))
                # Don't return False here - continue with whatever data we can get

            # Fetch traffic data
            try:
                traffic_data = await adapter.fetch_historical_traffic(
                    start_date, end_date
                )
                logger.info("Traffic data fetched successfully",
                            records=len(traffic_data), city=city.name)
            except Exception as traffic_error:
                logger.error("Failed to fetch traffic data",
                             city=city.name, error=str(traffic_error))
                # Don't return False here - continue with weather data only if available

            # Store available data (at least one type should be available for partial success)
            async with self.database_manager.get_session() as session:
                repo = CityDataRepository(session)

                weather_stored = 0
                traffic_stored = 0

                if weather_data:
                    weather_stored = await repo.bulk_store_weather(
                        city_id, weather_data
                    )

                if traffic_data:
                    traffic_stored = await repo.bulk_store_traffic(
                        city_id, traffic_data
                    )

            # Only fail if both data types failed to fetch
            if not weather_data and not traffic_data:
                logger.error("Both weather and traffic data fetch failed", city=city.name)
                return False

            logger.info(
                "City initialization complete",
                city=city.name,
                weather_records=weather_stored,
                traffic_records=traffic_stored
            )

            return True

        except Exception as e:
            # Catch-all boundary: a single city's failure is reported to the
            # caller as False so sibling cities can still proceed.
            logger.error(
                "City initialization failed",
                city_id=city_id,
                error=str(e)
            )
            return False

    async def rotate_monthly_data(self) -> None:
        """
        Rotate 24-month window: delete old, ingest new
        Called by Kubernetes CronJob monthly
        """
        enabled_cities = self.registry.get_enabled_cities()

        logger.info("Starting monthly data rotation", cities=len(enabled_cities))

        # 24 * 30 days approximates the 24-month retention window.
        now = datetime.now()
        cutoff_date = now - timedelta(days=24 * 30)

        # Previous calendar month: last day = day before the 1st of this
        # month; first day = that date snapped back to the 1st.
        last_month_end = now.replace(day=1) - timedelta(days=1)
        last_month_start = last_month_end.replace(day=1)

        tasks = []
        for city in enabled_cities:
            tasks.append(
                self._rotate_city_data(
                    city.city_id,
                    cutoff_date,
                    last_month_start,
                    last_month_end
                )
            )

        # Rotate cities concurrently; exceptions are captured, not raised.
        results = await asyncio.gather(*tasks, return_exceptions=True)

        successes = sum(1 for r in results if r is True)
        logger.info(
            "Monthly rotation complete",
            total=len(results),
            successes=successes
        )

    async def _rotate_city_data(
        self,
        city_id: str,
        cutoff_date: datetime,
        new_start: datetime,
        new_end: datetime
    ) -> bool:
        """Rotate data for a single city.

        Deletes records older than *cutoff_date*, then fetches and stores
        the [new_start, new_end] month from the city's adapter.

        NOTE(review): deletion happens before the new fetch — if the fetch
        then fails, the old rows are already gone for this cycle; confirm
        this ordering is intentional.
        """
        try:
            city = self.registry.get_city(city_id)
            if not city:
                return False

            logger.info(
                "Rotating city data",
                city=city.name,
                cutoff=cutoff_date.date(),
                new_month=new_start.strftime("%Y-%m")
            )

            # Phase 1: purge everything older than the retention cutoff.
            async with self.database_manager.get_session() as session:
                repo = CityDataRepository(session)

                deleted_weather = await repo.delete_weather_before(
                    city_id, cutoff_date
                )
                deleted_traffic = await repo.delete_traffic_before(
                    city_id, cutoff_date
                )

                logger.info(
                    "Old data deleted",
                    city=city.name,
                    weather_deleted=deleted_weather,
                    traffic_deleted=deleted_traffic
                )

            adapter = get_adapter(city_id, {
                "weather_config": city.weather_config,
                "traffic_config": city.traffic_config
            })

            # Phase 2: fetch last month's data from the external sources.
            new_weather = await adapter.fetch_historical_weather(
                new_start, new_end
            )
            new_traffic = await adapter.fetch_historical_traffic(
                new_start, new_end
            )

            # Phase 3: store the new month in a fresh session.
            async with self.database_manager.get_session() as session:
                repo = CityDataRepository(session)

                weather_stored = await repo.bulk_store_weather(
                    city_id, new_weather
                )
                traffic_stored = await repo.bulk_store_traffic(
                    city_id, new_traffic
                )

                logger.info(
                    "New data ingested",
                    city=city.name,
                    weather_added=weather_stored,
                    traffic_added=traffic_stored
                )

            return True

        except Exception as e:
            logger.error(
                "City rotation failed",
                city_id=city_id,
                error=str(e)
            )
            return False

    async def seed_school_calendars(self) -> bool:
        """
        Seed school calendars from CalendarRegistry into database
        Called during initialization - idempotent

        Returns:
            True on success (including all-skipped runs), False on any error.
        """
        try:
            logger.info("Starting school calendar seeding...")

            # Get all calendars from registry
            calendars = CalendarRegistry.get_all_calendars()
            logger.info(f"Found {len(calendars)} calendars in registry")

            async with self.database_manager.get_session() as session:
                repo = CalendarRepository(session)

                seeded_count = 0
                skipped_count = 0

                for cal_def in calendars:
                    logger.info(
                        "Processing calendar",
                        calendar_id=cal_def.calendar_id,
                        city=cal_def.city_id,
                        type=cal_def.school_type.value,
                        year=cal_def.academic_year
                    )

                    # Check if calendar already exists (idempotency)
                    existing = await repo.get_calendar_by_city_type_year(
                        city_id=cal_def.city_id,
                        school_type=cal_def.school_type.value,
                        academic_year=cal_def.academic_year
                    )

                    if existing:
                        logger.info(
                            "Calendar already exists, skipping",
                            calendar_id=cal_def.calendar_id
                        )
                        skipped_count += 1
                        continue

                    # Convert holiday periods to dict format
                    holiday_periods = [
                        {
                            "name": hp.name,
                            "start_date": hp.start_date,
                            "end_date": hp.end_date,
                            "description": hp.description
                        }
                        for hp in cal_def.holiday_periods
                    ]

                    # Convert school hours to dict format
                    school_hours = {
                        "morning_start": cal_def.school_hours.morning_start,
                        "morning_end": cal_def.school_hours.morning_end,
                        "has_afternoon_session": cal_def.school_hours.has_afternoon_session,
                        "afternoon_start": cal_def.school_hours.afternoon_start,
                        "afternoon_end": cal_def.school_hours.afternoon_end
                    }

                    # Create calendar in database
                    created_calendar = await repo.create_school_calendar(
                        city_id=cal_def.city_id,
                        calendar_name=cal_def.calendar_name,
                        school_type=cal_def.school_type.value,
                        academic_year=cal_def.academic_year,
                        holiday_periods=holiday_periods,
                        school_hours=school_hours,
                        source=cal_def.source,
                        enabled=cal_def.enabled
                    )

                    logger.info(
                        "Calendar seeded successfully",
                        calendar_id=str(created_calendar.id),
                        city=cal_def.city_id,
                        type=cal_def.school_type.value,
                        year=cal_def.academic_year
                    )
                    seeded_count += 1

            logger.info(
                "School calendar seeding completed",
                seeded=seeded_count,
                skipped=skipped_count,
                total=len(calendars)
            )

            return True

        except Exception as e:
            logger.error("Error seeding school calendars", error=str(e))
            return False
|
||||
Reference in New Issue
Block a user