REFACTOR external service and improve websocket training
This commit is contained in:
1
services/external/app/ingestion/__init__.py
vendored
Normal file
1
services/external/app/ingestion/__init__.py
vendored
Normal file
@@ -0,0 +1 @@
|
||||
"""Data ingestion module for multi-city external data"""
|
||||
20
services/external/app/ingestion/adapters/__init__.py
vendored
Normal file
20
services/external/app/ingestion/adapters/__init__.py
vendored
Normal file
@@ -0,0 +1,20 @@
|
||||
# services/external/app/ingestion/adapters/__init__.py
|
||||
"""
|
||||
Adapter registry - Maps city IDs to adapter implementations
|
||||
"""
|
||||
|
||||
from typing import Dict, Type
|
||||
from ..base_adapter import CityDataAdapter
|
||||
from .madrid_adapter import MadridAdapter
|
||||
|
||||
# Lookup table from city ID to the adapter class that serves that city.
# get_adapter() consults this mapping; register new cities here.
ADAPTER_REGISTRY: Dict[str, Type[CityDataAdapter]] = {"madrid": MadridAdapter}
|
||||
|
||||
|
||||
def get_adapter(city_id: str, config: Dict) -> CityDataAdapter:
    """Build the adapter registered for ``city_id``.

    Args:
        city_id: Key looked up in ``ADAPTER_REGISTRY``.
        config: Configuration mapping handed to the adapter constructor.

    Returns:
        A new adapter instance constructed as ``adapter_class(city_id, config)``.

    Raises:
        ValueError: If no adapter class is registered for ``city_id``.
    """
    registered = ADAPTER_REGISTRY.get(city_id)
    if registered:
        return registered(city_id, config)
    raise ValueError(f"No adapter registered for city: {city_id}")
|
||||
131
services/external/app/ingestion/adapters/madrid_adapter.py
vendored
Normal file
131
services/external/app/ingestion/adapters/madrid_adapter.py
vendored
Normal file
@@ -0,0 +1,131 @@
|
||||
# services/external/app/ingestion/adapters/madrid_adapter.py
|
||||
"""
|
||||
Madrid city data adapter - Uses existing AEMET and Madrid OpenData clients
|
||||
"""
|
||||
|
||||
from typing import List, Dict, Any
|
||||
from datetime import datetime
|
||||
import structlog
|
||||
|
||||
from ..base_adapter import CityDataAdapter
|
||||
from app.external.aemet import AEMETClient
|
||||
from app.external.apis.madrid_traffic_client import MadridTrafficClient
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
class MadridAdapter(CityDataAdapter):
    """Adapter for Madrid using AEMET + Madrid OpenData.

    Weather comes from ``AEMETClient`` and traffic from
    ``MadridTrafficClient``. Every record returned by the fetch methods is
    stamped with ``city_id`` and ``city_name`` before being handed back.
    """

    # Display name written into the ``city_name`` field of every record.
    CITY_NAME = 'Madrid'

    def __init__(self, city_id: str, config: Dict[str, Any]):
        """Create the underlying API clients.

        Args:
            city_id: Identifier stored on the adapter and stamped on records.
            config: Adapter configuration, stored by the base class.
        """
        super().__init__(city_id, config)
        self.aemet_client = AEMETClient()
        self.traffic_client = MadridTrafficClient()

        # Coordinates passed to every weather/traffic client call below.
        self.madrid_lat = 40.4168
        self.madrid_lon = -3.7038

    def _tag_records(self, records: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Stamp each record in place with this adapter's city ID and name."""
        for record in records:
            record['city_id'] = self.city_id
            record['city_name'] = self.CITY_NAME
        return records

    async def fetch_historical_weather(
        self,
        start_date: datetime,
        end_date: datetime
    ) -> List[Dict[str, Any]]:
        """Fetch historical weather from AEMET.

        Args:
            start_date: Start of the requested range.
            end_date: End of the requested range.

        Returns:
            The records returned by the AEMET client, each tagged with
            ``city_id`` and ``city_name``. Any exception is logged and an
            empty list is returned instead (best-effort fetch).
        """
        try:
            logger.info(
                "Fetching Madrid historical weather",
                start=start_date.isoformat(),
                end=end_date.isoformat()
            )

            weather_data = await self.aemet_client.get_historical_weather(
                self.madrid_lat,
                self.madrid_lon,
                start_date,
                end_date
            )
            self._tag_records(weather_data)

            logger.info(
                "Madrid weather data fetched",
                records=len(weather_data)
            )
            return weather_data

        except Exception as e:
            # Deliberate best-effort: callers receive an empty result rather
            # than an exception.
            logger.error("Error fetching Madrid weather", error=str(e))
            return []

    async def fetch_historical_traffic(
        self,
        start_date: datetime,
        end_date: datetime
    ) -> List[Dict[str, Any]]:
        """Fetch historical traffic from Madrid OpenData.

        Args:
            start_date: Start of the requested range.
            end_date: End of the requested range.

        Returns:
            The records returned by the traffic client, each tagged with
            ``city_id`` and ``city_name``. Any exception is logged and an
            empty list is returned instead (best-effort fetch).
        """
        try:
            logger.info(
                "Fetching Madrid historical traffic",
                start=start_date.isoformat(),
                end=end_date.isoformat()
            )

            traffic_data = await self.traffic_client.get_historical_traffic(
                self.madrid_lat,
                self.madrid_lon,
                start_date,
                end_date
            )
            self._tag_records(traffic_data)

            logger.info(
                "Madrid traffic data fetched",
                records=len(traffic_data)
            )
            return traffic_data

        except Exception as e:
            # Deliberate best-effort: callers receive an empty result rather
            # than an exception.
            logger.error("Error fetching Madrid traffic", error=str(e))
            return []

    async def validate_connection(self) -> bool:
        """Validate connection to AEMET and Madrid OpenData.

        Validation is lenient: it passes whenever the traffic API responds.
        AEMET rate limits may cause the weather check to fail during
        initialization, so a weather failure (a ``None`` result or a raised
        exception) is only logged as a warning.

        Returns:
            True if the traffic API returned a non-None result; False if it
            returned None or raised.
        """
        # Traffic API must work (critical for operations).
        try:
            test_traffic = await self.traffic_client.get_current_traffic(
                self.madrid_lat,
                self.madrid_lon
            )
        except Exception as e:
            logger.error("Madrid adapter connection validation failed", error=str(e))
            return False

        if test_traffic is None:
            logger.error("Traffic API validation failed - this is critical")
            return False

        # Weather probe is isolated in its own try block so that an exception
        # from the weather client cannot turn a healthy traffic check into a
        # failed validation.
        try:
            test_weather = await self.aemet_client.get_current_weather(
                self.madrid_lat,
                self.madrid_lon
            )
        except Exception as e:
            logger.warning(
                "Weather API validation failed (likely rate limited) - proceeding anyway",
                error=str(e)
            )
        else:
            if test_weather is None:
                logger.warning("Weather API validation failed (likely rate limited) - proceeding anyway")
            else:
                logger.info("Weather API validation successful")

        # Pass validation if traffic works (weather can be fetched later).
        return True
|
||||
43
services/external/app/ingestion/base_adapter.py
vendored
Normal file
43
services/external/app/ingestion/base_adapter.py
vendored
Normal file
@@ -0,0 +1,43 @@
|
||||
# services/external/app/ingestion/base_adapter.py
|
||||
"""
|
||||
Base adapter interface for city-specific data sources
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import List, Dict, Any
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
class CityDataAdapter(ABC):
    """Interface that every city-specific data adapter implements.

    Concrete subclasses must provide the three coroutine methods declared
    below; the base class only stores the city identifier and configuration.
    """

    def __init__(self, city_id: str, config: Dict[str, Any]):
        """Remember which city this adapter serves and its configuration."""
        self.config = config
        self.city_id = city_id

    @abstractmethod
    async def fetch_historical_weather(
        self, start_date: datetime, end_date: datetime
    ) -> List[Dict[str, Any]]:
        """Return historical weather records between the two dates."""

    @abstractmethod
    async def fetch_historical_traffic(
        self, start_date: datetime, end_date: datetime
    ) -> List[Dict[str, Any]]:
        """Return historical traffic records between the two dates."""

    @abstractmethod
    async def validate_connection(self) -> bool:
        """Report whether the underlying data source is reachable."""

    def get_city_id(self) -> str:
        """Return the city identifier supplied at construction."""
        return self.city_id
|
||||
268
services/external/app/ingestion/ingestion_manager.py
vendored
Normal file
268
services/external/app/ingestion/ingestion_manager.py
vendored
Normal file
@@ -0,0 +1,268 @@
|
||||
# services/external/app/ingestion/ingestion_manager.py
|
||||
"""
|
||||
Data Ingestion Manager - Coordinates multi-city data collection
|
||||
"""
|
||||
|
||||
from typing import List, Dict, Any
|
||||
from datetime import datetime, timedelta
|
||||
import structlog
|
||||
import asyncio
|
||||
|
||||
from app.registry.city_registry import CityRegistry
|
||||
from .adapters import get_adapter
|
||||
from app.repositories.city_data_repository import CityDataRepository
|
||||
from app.core.database import database_manager
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
class DataIngestionManager:
    """Orchestrates data ingestion across all cities.

    Provides two entry points: ``initialize_all_cities`` for the initial
    historical backfill and ``rotate_monthly_data`` for the monthly
    delete-old / ingest-new cycle. Both fan out one task per enabled city
    and report overall success as a boolean.
    """

    def __init__(self):
        self.registry = CityRegistry()
        self.database_manager = database_manager

    @staticmethod
    def _previous_month_window(now: datetime):
        """Return ``(start, end)`` covering the whole calendar month before ``now``.

        ``start`` is midnight on the first day of the previous month and
        ``end`` is the last microsecond of its final day, so the window does
        not depend on the time of day at which the job happens to run.
        """
        first_of_this_month = now.replace(
            day=1, hour=0, minute=0, second=0, microsecond=0
        )
        month_end = first_of_this_month - timedelta(microseconds=1)
        month_start = month_end.replace(
            day=1, hour=0, minute=0, second=0, microsecond=0
        )
        return month_start, month_end

    @staticmethod
    def _count_successes(results: List[Any], operation: str) -> int:
        """Count ``True`` results from ``asyncio.gather`` and log stray exceptions.

        With ``return_exceptions=True`` a task that raised shows up in
        ``results`` as the exception object; it is logged here so it is not
        silently folded into the failure count.
        """
        successes = 0
        for result in results:
            if result is True:
                successes += 1
            elif isinstance(result, BaseException):
                logger.error(
                    "Unhandled exception in city task",
                    operation=operation,
                    error=str(result)
                )
        return successes

    async def initialize_all_cities(self, months: int = 24) -> bool:
        """
        Initialize historical data for all enabled cities
        Called by Kubernetes Init Job

        Args:
            months: Size of the historical window; converted to days as
                ``months * 30``.

        Returns:
            True only if every enabled city initialized successfully.
        """
        enabled_cities = self.registry.get_enabled_cities()

        logger.info(
            "Starting full data initialization",
            cities=len(enabled_cities),
            months=months
        )

        end_date = datetime.now()
        # A month is approximated as 30 days.
        start_date = end_date - timedelta(days=months * 30)

        tasks = [
            self.initialize_city(city.city_id, start_date, end_date)
            for city in enabled_cities
        ]

        results = await asyncio.gather(*tasks, return_exceptions=True)

        successes = self._count_successes(results, "initialize")
        failures = len(results) - successes

        logger.info(
            "Data initialization complete",
            total=len(results),
            successes=successes,
            failures=failures
        )

        return successes == len(results)

    async def initialize_city(
        self,
        city_id: str,
        start_date: datetime,
        end_date: datetime
    ) -> bool:
        """Initialize historical data for a single city (idempotent).

        Skips the fetch when existing weather and traffic coverage both reach
        90% of one record per day in the range. Otherwise validates the
        adapter connection, fetches both datasets and bulk-stores them.

        Returns:
            True if data was already sufficient or was stored; False if the
            city is unknown, adapter validation fails, or any exception
            occurs (the exception is logged, not raised).
        """
        try:
            city = self.registry.get_city(city_id)
            if not city:
                logger.error("City not found", city_id=city_id)
                return False

            logger.info(
                "Initializing city data",
                city=city.name,
                start=start_date.date(),
                end=end_date.date()
            )

            # Check if data already exists (idempotency)
            async with self.database_manager.get_session() as session:
                repo = CityDataRepository(session)
                coverage = await repo.get_data_coverage(city_id, start_date, end_date)

                days_in_range = (end_date - start_date).days
                expected_records = days_in_range  # One record per day minimum

                # If we have >= 90% coverage, skip initialization
                threshold = expected_records * 0.9
                weather_sufficient = coverage['weather'] >= threshold
                traffic_sufficient = coverage['traffic'] >= threshold

                if weather_sufficient and traffic_sufficient:
                    logger.info(
                        "City data already initialized, skipping",
                        city=city.name,
                        weather_records=coverage['weather'],
                        traffic_records=coverage['traffic'],
                        threshold=int(threshold)
                    )
                    return True

                logger.info(
                    "Insufficient data coverage, proceeding with initialization",
                    city=city.name,
                    existing_weather=coverage['weather'],
                    existing_traffic=coverage['traffic'],
                    expected=expected_records
                )

            adapter = get_adapter(
                city_id,
                {
                    "weather_config": city.weather_config,
                    "traffic_config": city.traffic_config
                }
            )

            if not await adapter.validate_connection():
                logger.error("Adapter validation failed", city=city.name)
                return False

            weather_data = await adapter.fetch_historical_weather(
                start_date, end_date
            )
            traffic_data = await adapter.fetch_historical_traffic(
                start_date, end_date
            )

            async with self.database_manager.get_session() as session:
                repo = CityDataRepository(session)

                weather_stored = await repo.bulk_store_weather(
                    city_id, weather_data
                )
                traffic_stored = await repo.bulk_store_traffic(
                    city_id, traffic_data
                )

            logger.info(
                "City initialization complete",
                city=city.name,
                weather_records=weather_stored,
                traffic_records=traffic_stored
            )

            return True

        except Exception as e:
            logger.error(
                "City initialization failed",
                city_id=city_id,
                error=str(e)
            )
            return False

    async def rotate_monthly_data(self) -> bool:
        """
        Rotate 24-month window: delete old, ingest new
        Called by Kubernetes CronJob monthly

        Data older than ``24 * 30`` days is deleted and the full previous
        calendar month is ingested for every enabled city.

        Returns:
            True only if rotation succeeded for every enabled city.
        """
        enabled_cities = self.registry.get_enabled_cities()

        logger.info("Starting monthly data rotation", cities=len(enabled_cities))

        now = datetime.now()
        cutoff_date = now - timedelta(days=24 * 30)

        # Whole previous month, independent of the current time of day.
        last_month_start, last_month_end = self._previous_month_window(now)

        tasks = [
            self._rotate_city_data(
                city.city_id,
                cutoff_date,
                last_month_start,
                last_month_end
            )
            for city in enabled_cities
        ]

        results = await asyncio.gather(*tasks, return_exceptions=True)

        successes = self._count_successes(results, "rotate")
        logger.info(
            "Monthly rotation complete",
            total=len(results),
            successes=successes
        )

        return successes == len(results)

    async def _rotate_city_data(
        self,
        city_id: str,
        cutoff_date: datetime,
        new_start: datetime,
        new_end: datetime
    ) -> bool:
        """Rotate data for a single city.

        Deletes weather and traffic rows older than ``cutoff_date``, then
        fetches and stores the ``new_start``..``new_end`` range.

        Returns:
            True on completion; False if the city is unknown or any
            exception occurs (the exception is logged, not raised).
        """
        try:
            city = self.registry.get_city(city_id)
            if not city:
                return False

            logger.info(
                "Rotating city data",
                city=city.name,
                cutoff=cutoff_date.date(),
                new_month=new_start.strftime("%Y-%m")
            )

            # NOTE(review): old rows are removed before the new month is
            # fetched; confirm this ordering is intended if the fetch can fail.
            async with self.database_manager.get_session() as session:
                repo = CityDataRepository(session)

                deleted_weather = await repo.delete_weather_before(
                    city_id, cutoff_date
                )
                deleted_traffic = await repo.delete_traffic_before(
                    city_id, cutoff_date
                )

            logger.info(
                "Old data deleted",
                city=city.name,
                weather_deleted=deleted_weather,
                traffic_deleted=deleted_traffic
            )

            adapter = get_adapter(city_id, {
                "weather_config": city.weather_config,
                "traffic_config": city.traffic_config
            })

            new_weather = await adapter.fetch_historical_weather(
                new_start, new_end
            )
            new_traffic = await adapter.fetch_historical_traffic(
                new_start, new_end
            )

            async with self.database_manager.get_session() as session:
                repo = CityDataRepository(session)

                weather_stored = await repo.bulk_store_weather(
                    city_id, new_weather
                )
                traffic_stored = await repo.bulk_store_traffic(
                    city_id, new_traffic
                )

            logger.info(
                "New data ingested",
                city=city.name,
                weather_added=weather_stored,
                traffic_added=traffic_stored
            )

            return True

        except Exception as e:
            logger.error(
                "City rotation failed",
                city_id=city_id,
                error=str(e)
            )
            return False
|
||||
Reference in New Issue
Block a user