REFACTOR data service

This commit is contained in:
Urtzi Alfaro
2025-08-12 18:17:30 +02:00
parent 7c237c0acc
commit fbe7470ad9
149 changed files with 8528 additions and 7393 deletions

View File

@@ -0,0 +1,10 @@
# ================================================================
# services/data/app/external/apis/__init__.py
# ================================================================
"""
External API clients module - Scalable architecture for multiple cities
"""
from .traffic import TrafficAPIClientFactory
__all__ = ["TrafficAPIClientFactory"]

View File

@@ -0,0 +1,350 @@
# ================================================================
# services/data/app/external/apis/madrid_traffic_client.py
# ================================================================
"""
Madrid traffic client - Orchestration layer only
Coordinates between HTTP client, data processor, and business logic components
"""
from datetime import datetime, timedelta, timezone
from typing import Dict, List, Any, Optional, Tuple
import structlog
from .traffic import BaseTrafficClient, SupportedCity
from ..base_client import BaseAPIClient
from ..clients.madrid_client import MadridTrafficAPIClient
from ..processors.madrid_processor import MadridTrafficDataProcessor
from ..processors.madrid_business_logic import MadridTrafficAnalyzer
from ..models.madrid_models import TrafficRecord, CongestionLevel
class MadridTrafficClient(BaseTrafficClient, BaseAPIClient):
    """
    Enhanced Madrid traffic client - Orchestration layer
    Coordinates HTTP, processing, and business logic components

    Collaborators:
      - ``api_client`` (MadridTrafficAPIClient): raw HTTP fetches (current
        traffic XML, measurement-point CSV registry, historical ZIP archives).
      - ``processor`` (MadridTrafficDataProcessor): parses XML/CSV payloads.
      - ``analyzer`` (MadridTrafficAnalyzer): distance, road-type, congestion
        and pedestrian-flow enrichment.
    """
    # Madrid geographic bounds (inclusive bounding box used by supports_location)
    MADRID_BOUNDS = {
        'lat_min': 40.31, 'lat_max': 40.56,
        'lon_min': -3.89, 'lon_max': -3.51
    }
    # Configuration constants
    MAX_HISTORICAL_DAYS = 1095  # 3 years
    # NOTE(review): MAX_CSV_PROCESSING_ROWS and MEASUREMENT_POINTS_LIMIT are not
    # referenced anywhere inside this class - presumably consumed by the
    # processor/analyzer components; confirm before removing.
    MAX_CSV_PROCESSING_ROWS = 5000000
    MEASUREMENT_POINTS_LIMIT = 20
    def __init__(self):
        # Both bases are initialized explicitly (no cooperative super() chain).
        BaseTrafficClient.__init__(self, SupportedCity.MADRID)
        BaseAPIClient.__init__(self, base_url="https://datos.madrid.es")
        # Initialize components
        self.api_client = MadridTrafficAPIClient()
        self.processor = MadridTrafficDataProcessor()
        self.analyzer = MadridTrafficAnalyzer()
        # NOTE(review): this replaces the city-bound logger assigned by
        # BaseTrafficClient.__init__, so log lines from this class lose the
        # ``city`` field - confirm this is intentional.
        self.logger = structlog.get_logger()
    def supports_location(self, latitude: float, longitude: float) -> bool:
        """Check if location is within Madrid bounds (inclusive box check)."""
        return (self.MADRID_BOUNDS['lat_min'] <= latitude <= self.MADRID_BOUNDS['lat_max'] and
                self.MADRID_BOUNDS['lon_min'] <= longitude <= self.MADRID_BOUNDS['lon_max'])
    async def get_current_traffic(self, latitude: float, longitude: float) -> Optional[Dict[str, Any]]:
        """Get current traffic data with enhanced pedestrian inference.

        Pipeline: fetch live XML -> parse traffic points -> pick the nearest
        point -> enrich (distance, road type, congestion, pedestrian flow).

        Args:
            latitude: Query latitude in decimal degrees.
            longitude: Query longitude in decimal degrees.

        Returns:
            Enriched traffic dict (see _enhance_traffic_data) or None when the
            location is unsupported, no data is available, or an error occurs.
        """
        try:
            if not self.supports_location(latitude, longitude):
                self.logger.warning("Location outside Madrid bounds", lat=latitude, lon=longitude)
                return None
            # Fetch XML data
            xml_content = await self.api_client.fetch_current_traffic_xml()
            if not xml_content:
                self.logger.warning("No XML content received")
                return None
            # Parse XML data
            traffic_points = self.processor.parse_traffic_xml(xml_content)
            if not traffic_points:
                self.logger.warning("No traffic points found in XML")
                return None
            # Find nearest traffic point
            nearest_point = self.analyzer.find_nearest_traffic_point(traffic_points, latitude, longitude)
            if not nearest_point:
                self.logger.warning("No nearby traffic points found")
                return None
            # Enhance with business logic
            enhanced_data = await self._enhance_traffic_data(nearest_point, latitude, longitude)
            self.logger.info("Current traffic data retrieved",
                             point_id=nearest_point.get('measurement_point_id'),
                             distance=enhanced_data.get('distance_km', 0))
            return enhanced_data
        except Exception as e:
            # Broad catch at the data-access boundary: callers get None
            # instead of an exception.
            self.logger.error("Error getting current traffic", error=str(e))
            return None
    async def get_historical_traffic(self, latitude: float, longitude: float,
                                     start_date: datetime, end_date: datetime) -> List[Dict[str, Any]]:
        """Get historical traffic data with pedestrian enhancement.

        Args:
            latitude: Query latitude in decimal degrees.
            longitude: Query longitude in decimal degrees.
            start_date: Range start; naive datetimes are treated as UTC when
                records are filtered downstream.
            end_date: Range end (inclusive).

        Returns:
            List of enriched historical records; empty on unsupported
            location, missing registry data, or error. Ranges longer than
            MAX_HISTORICAL_DAYS are truncated (start_date moved forward).
        """
        try:
            if not self.supports_location(latitude, longitude):
                self.logger.warning("Location outside Madrid bounds", lat=latitude, lon=longitude)
                return []
            # Validate date range
            if (end_date - start_date).days > self.MAX_HISTORICAL_DAYS:
                self.logger.warning("Date range too large, truncating",
                                    requested_days=(end_date - start_date).days,
                                    max_days=self.MAX_HISTORICAL_DAYS)
                start_date = end_date - timedelta(days=self.MAX_HISTORICAL_DAYS)
            # Fetch measurement points registry
            csv_content = await self.api_client.fetch_measurement_points_csv()
            if not csv_content:
                self.logger.error("Failed to fetch measurement points registry")
                return []
            # Parse measurement points
            measurement_points = self.processor.parse_measurement_points_csv(csv_content)
            if not measurement_points:
                self.logger.error("No measurement points found")
                return []
            # Find nearest measurement points
            nearest_points = self.analyzer.find_nearest_measurement_points(
                measurement_points, latitude, longitude, num_points=3
            )
            if not nearest_points:
                self.logger.warning("No nearby measurement points found")
                return []
            # Process historical data
            historical_records = await self._fetch_historical_data_enhanced(
                latitude, longitude, start_date, end_date, nearest_points
            )
            self.logger.info("Historical traffic data retrieved",
                             records_count=len(historical_records),
                             date_range=f"{start_date.date()} to {end_date.date()}")
            return historical_records
        except Exception as e:
            self.logger.error("Error getting historical traffic", error=str(e))
            return []
    async def get_events(self, latitude: float, longitude: float,
                         radius_km: float = 5.0) -> List[Dict[str, Any]]:
        """Get traffic events (incidents, construction, etc.).

        Madrid publishes no dedicated events feed, so a synthetic
        'congestion' event is derived from current traffic whenever the
        congestion level is 'high' or 'blocked'.

        Args:
            latitude: Query latitude in decimal degrees.
            longitude: Query longitude in decimal degrees.
            radius_km: Accepted for interface compatibility; not used here.

        Returns:
            A single-element list with one congestion event, or an empty list.
        """
        # Madrid doesn't provide separate events endpoint
        # Return enhanced current traffic data as events
        current_data = await self.get_current_traffic(latitude, longitude)
        if current_data and current_data.get('congestion_level') in ['high', 'blocked']:
            return [{
                'type': 'congestion',
                'severity': current_data.get('congestion_level'),
                'description': f"High traffic congestion at {current_data.get('measurement_point_name', 'measurement point')}",
                'location': {
                    'latitude': current_data.get('latitude'),
                    'longitude': current_data.get('longitude')
                },
                'timestamp': current_data.get('timestamp')
            }]
        return []
    async def _enhance_traffic_data(self, traffic_point: Dict[str, Any],
                                    query_lat: float, query_lon: float) -> Dict[str, Any]:
        """Enhance traffic data with business logic and pedestrian inference.

        Args:
            traffic_point: Parsed XML point. Uses Spanish feed field names:
                'intensidad' (volume), 'ocupacion' (occupation %),
                'carga' (load %).
            query_lat: Original query latitude.
            query_lon: Original query longitude.

        Returns:
            Flattened dict of enrichment fields; the unmodified input is kept
            under 'raw_data'.
        """
        # Calculate distance
        distance_km = self.analyzer.calculate_distance(
            query_lat, query_lon,
            traffic_point.get('latitude', 0),
            traffic_point.get('longitude', 0)
        )
        # Classify road type
        road_type = self.analyzer.classify_road_type(
            traffic_point.get('measurement_point_name', '')
        )
        # Get congestion level
        congestion_level = self.analyzer.get_congestion_level(
            traffic_point.get('ocupacion', 0)
        )
        # Create traffic record for pedestrian inference
        traffic_record = TrafficRecord(
            date=datetime.now(timezone.utc),
            traffic_volume=traffic_point.get('intensidad', 0),
            occupation_percentage=int(traffic_point.get('ocupacion', 0)),
            load_percentage=traffic_point.get('carga', 0),
            average_speed=30,  # Default speed
            congestion_level=congestion_level,
            pedestrian_count=0,  # Will be calculated
            measurement_point_id=traffic_point.get('measurement_point_id', ''),
            measurement_point_name=traffic_point.get('measurement_point_name', ''),
            road_type=road_type,
            source='madrid_current_xml'
        )
        # Calculate pedestrian count
        location_context = {
            'latitude': traffic_point.get('latitude'),
            'longitude': traffic_point.get('longitude'),
            'measurement_point_name': traffic_point.get('measurement_point_name')
        }
        pedestrian_count, inference_metadata = self.analyzer.calculate_pedestrian_flow(
            traffic_record, location_context
        )
        # Build enhanced response
        enhanced_data = {
            'timestamp': datetime.now(timezone.utc),
            'latitude': traffic_point.get('latitude'),
            'longitude': traffic_point.get('longitude'),
            'measurement_point_id': traffic_point.get('measurement_point_id'),
            'measurement_point_name': traffic_point.get('measurement_point_name'),
            'traffic_volume': traffic_point.get('intensidad', 0),
            'occupation_percentage': int(traffic_point.get('ocupacion', 0)),
            'load_percentage': traffic_point.get('carga', 0),
            'congestion_level': congestion_level,
            'pedestrian_count': pedestrian_count,
            'road_type': road_type,
            'distance_km': distance_km,
            'source': 'madrid_current_xml',
            'city': 'madrid',
            'inference_metadata': inference_metadata,
            'raw_data': traffic_point
        }
        return enhanced_data
    async def _fetch_historical_data_enhanced(self, latitude: float, longitude: float,
                                              start_date: datetime, end_date: datetime,
                                              nearest_points: List[Tuple[str, Dict[str, Any], float]]) -> List[Dict[str, Any]]:
        """Fetch and process historical traffic data.

        Iterates month by month over [start_date, end_date], downloading one
        ZIP archive per month and then filtering the parsed records back down
        to the exact date range. On error, partial results accumulated so far
        are returned instead of raising.

        Args:
            nearest_points: (point_id, point_data, distance_km) tuples as
                produced by analyzer.find_nearest_measurement_points.
        """
        historical_records = []
        try:
            # Process by year and month to avoid memory issues
            current_date = start_date.replace(day=1)  # Start from beginning of month
            while current_date <= end_date:
                year = current_date.year
                month = current_date.month
                # Build historical URL
                zip_url = self.api_client._build_historical_url(year, month)
                self.logger.info("Processing historical ZIP file",
                                 year=year, month=month, zip_url=zip_url)
                # Fetch ZIP content
                zip_content = await self.api_client.fetch_historical_zip(zip_url)
                if not zip_content:
                    self.logger.warning("Failed to fetch historical ZIP", url=zip_url)
                    # Skip to next month. NOTE(review): duplicates the
                    # increment logic at the bottom of the loop; safe because
                    # day is always 1 here, but a candidate for extraction.
                    current_date = current_date.replace(month=current_date.month + 1) if current_date.month < 12 else current_date.replace(year=current_date.year + 1, month=1)
                    continue
                # Process ZIP content with enhanced parsing
                month_records = await self._process_historical_zip_enhanced(
                    zip_content, zip_url, latitude, longitude, nearest_points
                )
                # Filter by date range - ensure timezone consistency
                # Make sure start_date and end_date have timezone info for comparison
                start_tz = start_date if start_date.tzinfo else start_date.replace(tzinfo=timezone.utc)
                end_tz = end_date if end_date.tzinfo else end_date.replace(tzinfo=timezone.utc)
                filtered_records = []
                for record in month_records:
                    record_date = record.get('date')
                    if not record_date:
                        continue
                    # Ensure record date has timezone info
                    if not record_date.tzinfo:
                        record_date = record_date.replace(tzinfo=timezone.utc)
                    # Now compare with consistent timezone info
                    if start_tz <= record_date <= end_tz:
                        filtered_records.append(record)
                historical_records.extend(filtered_records)
                self.logger.info("Month processing completed",
                                 year=year, month=month,
                                 month_records=len(month_records),
                                 filtered_records=len(filtered_records),
                                 total_records=len(historical_records))
                # Move to next month
                if current_date.month == 12:
                    current_date = current_date.replace(year=current_date.year + 1, month=1)
                else:
                    current_date = current_date.replace(month=current_date.month + 1)
            return historical_records
        except Exception as e:
            self.logger.error("Error fetching historical data", error=str(e))
            return historical_records  # Return partial results
    async def _process_historical_zip_enhanced(self, zip_content: bytes, zip_url: str,
                                               latitude: float, longitude: float,
                                               nearest_points: List[Tuple[str, Dict[str, Any], float]]) -> List[Dict[str, Any]]:
        """Process historical ZIP file with enhanced parsing.

        Extracts every *.csv member of the archive and delegates chunked
        parsing to the processor, restricted to the nearest measurement-point
        ids. A CSV that fails to parse is logged and skipped; failure to open
        the ZIP itself yields an empty list.
        """
        try:
            import zipfile
            import io
            # NOTE(review): ``csv`` appears unused in this method - parsing is
            # delegated to the processor; confirm and drop the import.
            import csv
            import gc
            historical_records = []
            # Measurement-point ids used to filter rows during parsing.
            nearest_ids = {p[0] for p in nearest_points}
            with zipfile.ZipFile(io.BytesIO(zip_content)) as zip_file:
                csv_files = [f for f in zip_file.namelist() if f.lower().endswith('.csv')]
                for csv_filename in csv_files:
                    try:
                        # Read CSV content
                        with zip_file.open(csv_filename) as csv_file:
                            text_content = csv_file.read().decode('utf-8', errors='ignore')
                        # Process CSV in chunks using processor
                        csv_records = await self.processor.process_csv_content_chunked(
                            text_content, csv_filename, nearest_ids, nearest_points
                        )
                        historical_records.extend(csv_records)
                        # Force garbage collection
                        gc.collect()
                    except Exception as csv_error:
                        self.logger.warning("Error processing CSV file",
                                            filename=csv_filename,
                                            error=str(csv_error))
                        continue
            self.logger.info("Historical ZIP processing completed",
                             zip_url=zip_url,
                             total_records=len(historical_records))
            return historical_records
        except Exception as e:
            self.logger.error("Error processing historical ZIP file",
                              zip_url=zip_url, error=str(e))
            return []

View File

@@ -0,0 +1,257 @@
# ================================================================
# services/data/app/external/apis/traffic.py
# ================================================================
"""
Traffic API abstraction layer for multiple cities
"""
import asyncio
from abc import ABC, abstractmethod
from datetime import datetime
from enum import Enum
from typing import Dict, List, Any, Optional, Tuple
import structlog
logger = structlog.get_logger()
class SupportedCity(Enum):
    """Cities for which traffic data collection exists or is planned.

    The enum value is the lowercase city slug used in logs and responses.
    """

    MADRID = "madrid"        # active (see TrafficAPIClientFactory.get_supported_cities)
    BARCELONA = "barcelona"  # planned
    VALENCIA = "valencia"    # planned
class BaseTrafficClient(ABC):
    """
    Common contract for per-city traffic clients.

    Every concrete client (e.g. the Madrid implementation) must provide the
    four abstract operations below; the factory only hands instances out
    through this interface.
    """

    def __init__(self, city: SupportedCity):
        self.city = city
        self.logger = structlog.get_logger().bind(city=city.value)

    @abstractmethod
    async def get_current_traffic(self, latitude: float, longitude: float) -> Optional[Dict[str, Any]]:
        """Return current traffic data for the location, or None."""
        ...

    @abstractmethod
    async def get_historical_traffic(self, latitude: float, longitude: float,
                                     start_date: datetime, end_date: datetime) -> List[Dict[str, Any]]:
        """Return historical traffic records for the given date range."""
        ...

    @abstractmethod
    async def get_events(self, latitude: float, longitude: float, radius_km: float = 5.0) -> List[Dict[str, Any]]:
        """Return traffic incidents and events near the location."""
        ...

    @abstractmethod
    def supports_location(self, latitude: float, longitude: float) -> bool:
        """Return True when this client covers the given coordinates."""
        ...
class TrafficAPIClientFactory:
    """
    Builds the appropriate city-specific traffic client for a coordinate pair.
    """

    # Bounding boxes used for direct city matching
    CITY_BOUNDS = {
        SupportedCity.MADRID: {
            'lat_min': 40.31, 'lat_max': 40.56,
            'lon_min': -3.89, 'lon_max': -3.51
        },
        SupportedCity.BARCELONA: {
            'lat_min': 41.32, 'lat_max': 41.47,
            'lon_min': 2.05, 'lon_max': 2.25
        },
        SupportedCity.VALENCIA: {
            'lat_min': 39.42, 'lat_max': 39.52,
            'lon_min': -0.42, 'lon_max': -0.32
        }
    }

    @classmethod
    def get_client_for_location(cls, latitude: float, longitude: float) -> Optional[BaseTrafficClient]:
        """
        Resolve a traffic client for the given coordinates.

        First tries a direct bounding-box match; failing that, falls back to
        the closest supported city within 100 km.

        Args:
            latitude: Query location latitude.
            longitude: Query location longitude.

        Returns:
            BaseTrafficClient instance or None if location not supported.
        """
        try:
            # Direct bounding-box match against each configured city.
            matched = next(
                (city for city, b in cls.CITY_BOUNDS.items()
                 if b['lat_min'] <= latitude <= b['lat_max']
                 and b['lon_min'] <= longitude <= b['lon_max']),
                None
            )
            if matched is not None:
                logger.info("Location matched to city",
                            city=matched.value, lat=latitude, lon=longitude)
                return cls._create_client(matched)
            # No direct hit: fall back to the nearest supported city.
            fallback = cls._find_closest_city(latitude, longitude)
            if fallback:
                logger.info("Using closest city for location",
                            closest_city=fallback.value, lat=latitude, lon=longitude)
                return cls._create_client(fallback)
            logger.warning("No traffic client available for location",
                           lat=latitude, lon=longitude)
            return None
        except Exception as e:
            # Includes NotImplementedError for cities without a client yet.
            logger.error("Error getting traffic client for location",
                         lat=latitude, lon=longitude, error=str(e))
            return None

    @classmethod
    def _create_client(cls, city: SupportedCity) -> BaseTrafficClient:
        """Instantiate the concrete client for *city* (may raise for planned cities)."""
        if city is SupportedCity.MADRID:
            from .madrid_traffic_client import MadridTrafficClient
            return MadridTrafficClient()
        if city in (SupportedCity.BARCELONA, SupportedCity.VALENCIA):
            # Future implementation
            raise NotImplementedError(f"Traffic client for {city.value} not yet implemented")
        raise ValueError(f"Unsupported city: {city}")

    @classmethod
    def _find_closest_city(cls, latitude: float, longitude: float) -> Optional[SupportedCity]:
        """Return the supported city whose center lies closest, within 100 km; else None."""
        import math

        def haversine_km(lat1, lon1, lat2, lon2):
            """Great-circle distance between two coordinates in km."""
            R = 6371  # Earth's radius in km
            dlat = math.radians(lat2 - lat1)
            dlon = math.radians(lon2 - lon1)
            a = (math.sin(dlat/2) * math.sin(dlat/2) +
                 math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) *
                 math.sin(dlon/2) * math.sin(dlon/2))
            c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
            return R * c

        # City centers for distance calculation
        city_centers = {
            SupportedCity.MADRID: (40.4168, -3.7038),
            SupportedCity.BARCELONA: (41.3851, 2.1734),
            SupportedCity.VALENCIA: (39.4699, -0.3763)
        }
        scored = [
            (haversine_km(latitude, longitude, center_lat, center_lon), city)
            for city, (center_lat, center_lon) in city_centers.items()
        ]
        in_range = [pair for pair in scored if pair[0] < 100]  # Within 100km
        if not in_range:
            return None
        return min(in_range, key=lambda pair: pair[0])[1]

    @classmethod
    def get_supported_cities(cls) -> List[Dict[str, Any]]:
        """Describe every known city: slug, bounds, and rollout status."""
        return [
            {
                "city": city.value,
                "bounds": bounds,
                "status": "active" if city == SupportedCity.MADRID else "planned"
            }
            for city, bounds in cls.CITY_BOUNDS.items()
        ]
class UniversalTrafficClient:
    """
    Universal traffic client that delegates to appropriate city-specific clients
    This is the main interface that external services should use

    Clients are resolved through TrafficAPIClientFactory and cached per city,
    so the cache is bounded by the number of supported cities and repeated
    queries reuse the same client instance.
    """
    def __init__(self):
        self.factory = TrafficAPIClientFactory()
        # Maps city slug (e.g. "madrid") -> BaseTrafficClient instance.
        self.client_cache = {}

    async def get_current_traffic(self, latitude: float, longitude: float) -> Optional[Dict[str, Any]]:
        """Get current traffic data for any supported location.

        Returns:
            The city client's current-traffic dict, or None when the location
            is unsupported or an error occurs.
        """
        try:
            client = self._get_client_for_location(latitude, longitude)
            if client:
                return await client.get_current_traffic(latitude, longitude)
            else:
                logger.warning("No traffic data available for location",
                               lat=latitude, lon=longitude)
                return None
        except Exception as e:
            logger.error("Error getting current traffic",
                         lat=latitude, lon=longitude, error=str(e))
            return None

    async def get_historical_traffic(self, latitude: float, longitude: float,
                                     start_date: datetime, end_date: datetime) -> List[Dict[str, Any]]:
        """Get historical traffic data for any supported location.

        Returns:
            Historical records from the city client; empty list when the
            location is unsupported or an error occurs.
        """
        try:
            client = self._get_client_for_location(latitude, longitude)
            if client:
                return await client.get_historical_traffic(latitude, longitude, start_date, end_date)
            else:
                logger.warning("No historical traffic data available for location",
                               lat=latitude, lon=longitude)
                return []
        except Exception as e:
            logger.error("Error getting historical traffic",
                         lat=latitude, lon=longitude, error=str(e))
            return []

    async def get_events(self, latitude: float, longitude: float, radius_km: float = 5.0) -> List[Dict[str, Any]]:
        """Get traffic events for any supported location (empty list on miss/error)."""
        try:
            client = self._get_client_for_location(latitude, longitude)
            if client:
                return await client.get_events(latitude, longitude, radius_km)
            else:
                return []
        except Exception as e:
            logger.error("Error getting traffic events",
                         lat=latitude, lon=longitude, error=str(e))
            return []

    def _get_client_for_location(self, latitude: float, longitude: float) -> Optional[BaseTrafficClient]:
        """Get cached or create new client for location.

        FIX: the previous implementation cached under a per-coordinate key
        (lat/lon rounded to 4 decimals), which grows without bound across
        distinct queries and permanently caches None for any lookup that
        failed once. Clients are now cached per resolved city and failed
        lookups are never cached.
        """
        # Fast path: reuse a cached client whose city covers these coordinates.
        # (Closest-city fallback matches outside city bounds bypass this path
        # and are resolved by the factory each time - still cheap and bounded.)
        for cached_client in self.client_cache.values():
            if cached_client.supports_location(latitude, longitude):
                return cached_client
        client = self.factory.get_client_for_location(latitude, longitude)
        if client is not None:
            self.client_cache[client.city.value] = client
        return client

    def get_location_info(self, latitude: float, longitude: float) -> Dict[str, Any]:
        """Get information about traffic data availability for location."""
        client = self._get_client_for_location(latitude, longitude)
        if client:
            return {
                "supported": True,
                "city": client.city.value,
                "features": ["current_traffic", "historical_traffic", "events"]
            }
        else:
            return {
                "supported": False,
                "city": None,
                "features": [],
                "message": "No traffic data available for this location"
            }