REFACTOR external service and improve websocket training

Urtzi Alfaro
2025-10-09 14:11:02 +02:00
parent 7c72f83c51
commit 3c689b4f98
111 changed files with 13289 additions and 2374 deletions


@@ -0,0 +1 @@
"""Data ingestion module for multi-city external data"""


@@ -0,0 +1,20 @@
# services/external/app/ingestion/adapters/__init__.py
"""
Adapter registry - Maps city IDs to adapter implementations
"""
from typing import Dict, Type

from ..base_adapter import CityDataAdapter
from .madrid_adapter import MadridAdapter

ADAPTER_REGISTRY: Dict[str, Type[CityDataAdapter]] = {
    "madrid": MadridAdapter,
}


def get_adapter(city_id: str, config: Dict) -> CityDataAdapter:
    """Factory to instantiate appropriate adapter"""
    adapter_class = ADAPTER_REGISTRY.get(city_id)
    if not adapter_class:
        raise ValueError(f"No adapter registered for city: {city_id}")
    return adapter_class(city_id, config)
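
For reference, a minimal usage sketch of the factory above (illustrative only, not part of this commit; the empty config dicts mirror the keys the ingestion manager passes in):

# Example (not part of this commit): resolving a city adapter via the registry
import asyncio
from app.ingestion.adapters import get_adapter

async def main():
    # "madrid" is the only registered city_id in this commit
    adapter = get_adapter("madrid", {"weather_config": {}, "traffic_config": {}})
    if await adapter.validate_connection():
        print("adapter ready for city:", adapter.get_city_id())

asyncio.run(main())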


@@ -0,0 +1,131 @@
# services/external/app/ingestion/adapters/madrid_adapter.py
"""
Madrid city data adapter - Uses existing AEMET and Madrid OpenData clients
"""
from typing import List, Dict, Any
from datetime import datetime

import structlog

from ..base_adapter import CityDataAdapter
from app.external.aemet import AEMETClient
from app.external.apis.madrid_traffic_client import MadridTrafficClient

logger = structlog.get_logger()


class MadridAdapter(CityDataAdapter):
    """Adapter for Madrid using AEMET + Madrid OpenData"""

    def __init__(self, city_id: str, config: Dict[str, Any]):
        super().__init__(city_id, config)
        self.aemet_client = AEMETClient()
        self.traffic_client = MadridTrafficClient()
        self.madrid_lat = 40.4168
        self.madrid_lon = -3.7038

    async def fetch_historical_weather(
        self,
        start_date: datetime,
        end_date: datetime
    ) -> List[Dict[str, Any]]:
        """Fetch historical weather from AEMET"""
        try:
            logger.info(
                "Fetching Madrid historical weather",
                start=start_date.isoformat(),
                end=end_date.isoformat()
            )
            weather_data = await self.aemet_client.get_historical_weather(
                self.madrid_lat,
                self.madrid_lon,
                start_date,
                end_date
            )
            for record in weather_data:
                record['city_id'] = self.city_id
                record['city_name'] = 'Madrid'
            logger.info(
                "Madrid weather data fetched",
                records=len(weather_data)
            )
            return weather_data
        except Exception as e:
            logger.error("Error fetching Madrid weather", error=str(e))
            return []

    async def fetch_historical_traffic(
        self,
        start_date: datetime,
        end_date: datetime
    ) -> List[Dict[str, Any]]:
        """Fetch historical traffic from Madrid OpenData"""
        try:
            logger.info(
                "Fetching Madrid historical traffic",
                start=start_date.isoformat(),
                end=end_date.isoformat()
            )
            traffic_data = await self.traffic_client.get_historical_traffic(
                self.madrid_lat,
                self.madrid_lon,
                start_date,
                end_date
            )
            for record in traffic_data:
                record['city_id'] = self.city_id
                record['city_name'] = 'Madrid'
            logger.info(
                "Madrid traffic data fetched",
                records=len(traffic_data)
            )
            return traffic_data
        except Exception as e:
            logger.error("Error fetching Madrid traffic", error=str(e))
            return []

    async def validate_connection(self) -> bool:
        """Validate connection to AEMET and Madrid OpenData

        Note: Validation is lenient - passes if traffic API works.
        AEMET rate limits may cause weather validation to fail during initialization.
        """
        try:
            test_traffic = await self.traffic_client.get_current_traffic(
                self.madrid_lat,
                self.madrid_lon
            )
            # Traffic API must work (critical for operations)
            if test_traffic is None:
                logger.error("Traffic API validation failed - this is critical")
                return False
            # Try weather API, but don't fail validation if rate limited
            test_weather = await self.aemet_client.get_current_weather(
                self.madrid_lat,
                self.madrid_lon
            )
            if test_weather is None:
                logger.warning("Weather API validation failed (likely rate limited) - proceeding anyway")
            else:
                logger.info("Weather API validation successful")
            # Pass validation if traffic works (weather can be fetched later)
            return True
        except Exception as e:
            logger.error("Madrid adapter connection validation failed", error=str(e))
            return False
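
A short, hypothetical snippet showing the adapter contract in practice - records come back tagged with city_id/city_name and failures degrade to empty lists (not part of this commit):

# Example (not part of this commit): fetching one week of Madrid history
import asyncio
from datetime import datetime, timedelta
from app.ingestion.adapters.madrid_adapter import MadridAdapter

async def main():
    adapter = MadridAdapter("madrid", {})
    if not await adapter.validate_connection():
        return  # traffic API unreachable; weather alone is not enough
    end = datetime.now()
    start = end - timedelta(days=7)
    weather = await adapter.fetch_historical_weather(start, end)
    traffic = await adapter.fetch_historical_traffic(start, end)
    print(len(weather), "weather records,", len(traffic), "traffic records")

asyncio.run(main())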


@@ -0,0 +1,43 @@
# services/external/app/ingestion/base_adapter.py
"""
Base adapter interface for city-specific data sources
"""
from abc import ABC, abstractmethod
from typing import List, Dict, Any
from datetime import datetime


class CityDataAdapter(ABC):
    """Abstract base class for city-specific data adapters"""

    def __init__(self, city_id: str, config: Dict[str, Any]):
        self.city_id = city_id
        self.config = config

    @abstractmethod
    async def fetch_historical_weather(
        self,
        start_date: datetime,
        end_date: datetime
    ) -> List[Dict[str, Any]]:
        """Fetch historical weather data for date range"""
        pass

    @abstractmethod
    async def fetch_historical_traffic(
        self,
        start_date: datetime,
        end_date: datetime
    ) -> List[Dict[str, Any]]:
        """Fetch historical traffic data for date range"""
        pass

    @abstractmethod
    async def validate_connection(self) -> bool:
        """Validate connection to data source"""
        pass

    def get_city_id(self) -> str:
        """Get city identifier"""
        return self.city_id
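
To illustrate how a new city would be onboarded, a hedged skeleton of a hypothetical adapter (ValenciaAdapter is invented for illustration; a real implementation would also be registered in ADAPTER_REGISTRY in adapters/__init__.py, and is not part of this commit):

# Example (not part of this commit): skeleton for onboarding a new city
from typing import List, Dict, Any
from datetime import datetime

from app.ingestion.base_adapter import CityDataAdapter

class ValenciaAdapter(CityDataAdapter):
    """Hypothetical adapter: implement the three abstract methods, then register it."""

    async def fetch_historical_weather(self, start_date: datetime, end_date: datetime) -> List[Dict[str, Any]]:
        return []  # call the city's weather API here

    async def fetch_historical_traffic(self, start_date: datetime, end_date: datetime) -> List[Dict[str, Any]]:
        return []  # call the city's traffic/open-data API here

    async def validate_connection(self) -> bool:
        return True  # cheap connectivity check against the upstream APIs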


@@ -0,0 +1,268 @@
# services/external/app/ingestion/ingestion_manager.py
"""
Data Ingestion Manager - Coordinates multi-city data collection
"""
from typing import List, Dict, Any
from datetime import datetime, timedelta

import structlog
import asyncio

from app.registry.city_registry import CityRegistry
from .adapters import get_adapter
from app.repositories.city_data_repository import CityDataRepository
from app.core.database import database_manager

logger = structlog.get_logger()


class DataIngestionManager:
    """Orchestrates data ingestion across all cities"""

    def __init__(self):
        self.registry = CityRegistry()
        self.database_manager = database_manager

    async def initialize_all_cities(self, months: int = 24):
        """
        Initialize historical data for all enabled cities
        Called by Kubernetes Init Job
        """
        enabled_cities = self.registry.get_enabled_cities()
        logger.info(
            "Starting full data initialization",
            cities=len(enabled_cities),
            months=months
        )
        end_date = datetime.now()
        start_date = end_date - timedelta(days=months * 30)
        tasks = [
            self.initialize_city(city.city_id, start_date, end_date)
            for city in enabled_cities
        ]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        successes = sum(1 for r in results if r is True)
        failures = len(results) - successes
        logger.info(
            "Data initialization complete",
            total=len(results),
            successes=successes,
            failures=failures
        )
        return successes == len(results)

    async def initialize_city(
        self,
        city_id: str,
        start_date: datetime,
        end_date: datetime
    ) -> bool:
        """Initialize historical data for a single city (idempotent)"""
        try:
            city = self.registry.get_city(city_id)
            if not city:
                logger.error("City not found", city_id=city_id)
                return False
            logger.info(
                "Initializing city data",
                city=city.name,
                start=start_date.date(),
                end=end_date.date()
            )
            # Check if data already exists (idempotency)
            async with self.database_manager.get_session() as session:
                repo = CityDataRepository(session)
                coverage = await repo.get_data_coverage(city_id, start_date, end_date)
                days_in_range = (end_date - start_date).days
                expected_records = days_in_range  # One record per day minimum
                # If we have >= 90% coverage, skip initialization
                threshold = expected_records * 0.9
                weather_sufficient = coverage['weather'] >= threshold
                traffic_sufficient = coverage['traffic'] >= threshold
                if weather_sufficient and traffic_sufficient:
                    logger.info(
                        "City data already initialized, skipping",
                        city=city.name,
                        weather_records=coverage['weather'],
                        traffic_records=coverage['traffic'],
                        threshold=int(threshold)
                    )
                    return True
                logger.info(
                    "Insufficient data coverage, proceeding with initialization",
                    city=city.name,
                    existing_weather=coverage['weather'],
                    existing_traffic=coverage['traffic'],
                    expected=expected_records
                )
            adapter = get_adapter(
                city_id,
                {
                    "weather_config": city.weather_config,
                    "traffic_config": city.traffic_config
                }
            )
            if not await adapter.validate_connection():
                logger.error("Adapter validation failed", city=city.name)
                return False
            weather_data = await adapter.fetch_historical_weather(
                start_date, end_date
            )
            traffic_data = await adapter.fetch_historical_traffic(
                start_date, end_date
            )
            async with self.database_manager.get_session() as session:
                repo = CityDataRepository(session)
                weather_stored = await repo.bulk_store_weather(
                    city_id, weather_data
                )
                traffic_stored = await repo.bulk_store_traffic(
                    city_id, traffic_data
                )
            logger.info(
                "City initialization complete",
                city=city.name,
                weather_records=weather_stored,
                traffic_records=traffic_stored
            )
            return True
        except Exception as e:
            logger.error(
                "City initialization failed",
                city_id=city_id,
                error=str(e)
            )
            return False

    async def rotate_monthly_data(self):
        """
        Rotate 24-month window: delete old, ingest new
        Called by Kubernetes CronJob monthly
        """
        enabled_cities = self.registry.get_enabled_cities()
        logger.info("Starting monthly data rotation", cities=len(enabled_cities))
        now = datetime.now()
        cutoff_date = now - timedelta(days=24 * 30)
        last_month_end = now.replace(day=1) - timedelta(days=1)
        last_month_start = last_month_end.replace(day=1)
        tasks = []
        for city in enabled_cities:
            tasks.append(
                self._rotate_city_data(
                    city.city_id,
                    cutoff_date,
                    last_month_start,
                    last_month_end
                )
            )
        results = await asyncio.gather(*tasks, return_exceptions=True)
        successes = sum(1 for r in results if r is True)
        logger.info(
            "Monthly rotation complete",
            total=len(results),
            successes=successes
        )

    async def _rotate_city_data(
        self,
        city_id: str,
        cutoff_date: datetime,
        new_start: datetime,
        new_end: datetime
    ) -> bool:
        """Rotate data for a single city"""
        try:
            city = self.registry.get_city(city_id)
            if not city:
                return False
            logger.info(
                "Rotating city data",
                city=city.name,
                cutoff=cutoff_date.date(),
                new_month=new_start.strftime("%Y-%m")
            )
            async with self.database_manager.get_session() as session:
                repo = CityDataRepository(session)
                deleted_weather = await repo.delete_weather_before(
                    city_id, cutoff_date
                )
                deleted_traffic = await repo.delete_traffic_before(
                    city_id, cutoff_date
                )
            logger.info(
                "Old data deleted",
                city=city.name,
                weather_deleted=deleted_weather,
                traffic_deleted=deleted_traffic
            )
            adapter = get_adapter(city_id, {
                "weather_config": city.weather_config,
                "traffic_config": city.traffic_config
            })
            new_weather = await adapter.fetch_historical_weather(
                new_start, new_end
            )
            new_traffic = await adapter.fetch_historical_traffic(
                new_start, new_end
            )
            async with self.database_manager.get_session() as session:
                repo = CityDataRepository(session)
                weather_stored = await repo.bulk_store_weather(
                    city_id, new_weather
                )
                traffic_stored = await repo.bulk_store_traffic(
                    city_id, new_traffic
                )
            logger.info(
                "New data ingested",
                city=city.name,
                weather_added=weather_stored,
                traffic_added=traffic_stored
            )
            return True
        except Exception as e:
            logger.error(
                "City rotation failed",
                city_id=city_id,
                error=str(e)
            )
            return False
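
The manager is meant to be driven by the Kubernetes Init Job and monthly CronJob referenced in the docstrings; a minimal entrypoint sketch follows (the file layout and CLI argument are assumptions, not part of this commit):

# Example (not part of this commit): hypothetical job entrypoint driving the manager
import asyncio
import sys

from app.ingestion.ingestion_manager import DataIngestionManager

async def run(job: str) -> int:
    manager = DataIngestionManager()
    if job == "init":
        # Kubernetes Init Job: idempotent 24-month backfill for every enabled city
        ok = await manager.initialize_all_cities(months=24)
        return 0 if ok else 1
    # Kubernetes CronJob (monthly): drop data past the 24-month window, ingest last month
    await manager.rotate_monthly_data()
    return 0

if __name__ == "__main__":
    sys.exit(asyncio.run(run(sys.argv[1] if len(sys.argv) > 1 else "rotate")))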