# ================================================================
# services/data/app/external/madrid_opendata.py
# ================================================================
"""Madrid Open Data API client for traffic and events - WITH REAL ENDPOINTS"""

import math
import xml.etree.ElementTree as ET
from typing import List, Dict, Any, Optional
from datetime import datetime, timedelta

import structlog

from app.external.base_client import BaseAPIClient
from app.core.config import settings

logger = structlog.get_logger()
class MadridOpenDataClient(BaseAPIClient):
    """Client for Madrid Open Data traffic feeds, with synthetic fallbacks."""

    def __init__(self):
        # Public traffic datasets are open: no API key required.
        super().__init__(
            base_url="https://datos.madrid.es/egob/catalogo",
            api_key=None  # Madrid Open Data doesn't require API key for public traffic data
        )
        # Feed endpoints below are refreshed roughly every 5 minutes.
        # Real-time traffic measurements (XML).
        self.traffic_xml_url = "https://datos.madrid.es/egob/catalogo/300233-0-trafico-tiempo-real.xml"
        # Traffic incidents (XML).
        self.incidents_xml_url = "http://informo.munimadrid.es/informo/tmadrid/incid_aytomadrid.xml"
        # Traffic intensity map (KML).
        self.traffic_kml_url = "https://datos.madrid.es/egob/catalogo/300233-1-intensidad-trafico.kml"
async def get_current_traffic(self, latitude: float, longitude: float) -> Optional[Dict[str, Any]]:
    """Return current traffic conditions near (latitude, longitude).

    Prefers the live Madrid Open Data feed; falls back to synthetic data
    whenever the feed is unavailable, fails, or has no nearby station.
    """
    try:
        measurements = await self._fetch_traffic_xml()
        if measurements:
            station = self._find_nearest_traffic_point(latitude, longitude, measurements)
            if station:
                return self._parse_traffic_measurement(station)
        # No feed data or no station within range: synthesize instead.
        logger.info("Real traffic data not available, using synthetic data")
        return await self._generate_synthetic_traffic(latitude, longitude)
    except Exception as e:
        logger.error("Failed to get current traffic from Madrid Open Data", error=str(e))
        return await self._generate_synthetic_traffic(latitude, longitude)
async def _fetch_traffic_xml(self) -> Optional[List[Dict[str, Any]]]:
    """Fetch and parse the real-time traffic XML feed.

    Returns a list of measurement-point dicts (only points that carry both
    coordinates), or None when the feed cannot be fetched or parsed.
    """
    try:
        # Use the direct URL fetching method from base client
        xml_content = await self._fetch_xml_content(self.traffic_xml_url)
        if not xml_content:
            logger.warning("No XML content received from Madrid traffic API")
            return None
        # Parse XML content
        root = ET.fromstring(xml_content)
        traffic_points = []
        # Madrid traffic XML structure: <trafico><pmed id="..." ...>...</pmed></trafico>
        # NOTE(review): fields are read as XML *attributes* of <pmed>; confirm
        # the live feed does not encode them as child elements instead.
        for pmed in root.findall('.//pmed'):
            try:
                traffic_point = {
                    'id': pmed.get('id'),
                    # 'y' is latitude, 'x' is longitude; None when absent.
                    'latitude': float(pmed.get('y', 0)) if pmed.get('y') else None,
                    'longitude': float(pmed.get('x', 0)) if pmed.get('x') else None,
                    'intensity': int(pmed.get('intensidad', 0)) if pmed.get('intensidad') else 0,
                    'occupation': float(pmed.get('ocupacion', 0)) if pmed.get('ocupacion') else 0,
                    'load': int(pmed.get('carga', 0)) if pmed.get('carga') else 0,
                    'service_level': int(pmed.get('nivelServicio', 0)) if pmed.get('nivelServicio') else 0,
                    'speed': float(pmed.get('vmed', 0)) if pmed.get('vmed') else 0,
                    'error': pmed.get('error', '0'),
                    'measurement_date': pmed.get('fechahora', ''),
                    'name': pmed.get('nombre', 'Unknown'),
                    'type': pmed.get('tipo_elem', 'URB')  # URB=Urban, C30=M-30 ring road
                }
                # Only add points with valid coordinates
                if traffic_point['latitude'] and traffic_point['longitude']:
                    traffic_points.append(traffic_point)
            except (ValueError, TypeError) as e:
                # Skip malformed points instead of failing the whole feed.
                logger.debug("Error parsing traffic point", error=str(e), point_id=pmed.get('id'))
                continue
        logger.info("Successfully parsed traffic data", points_count=len(traffic_points))
        return traffic_points
    except ET.ParseError as e:
        logger.error("Failed to parse traffic XML", error=str(e))
        return None
    except Exception as e:
        logger.error("Error fetching traffic XML", error=str(e))
        return None
async def _fetch_xml_content(self, url: str) -> Optional[str]:
    """Fetch XML content from a URL, handling encoding issues.

    Returns the decoded XML body, or None on any network or decoding
    failure (errors are logged, never raised to the caller).
    """
    try:
        # Local import keeps httpx off the import path until needed.
        import httpx
        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.get(url)
            response.raise_for_status()
            # Handle potential encoding issues with Spanish content
            try:
                return response.text
            except UnicodeDecodeError:
                # Try alternative encodings.
                # NOTE(review): 'latin-1' maps every byte, so it can never
                # raise — the 'windows-1252'/'iso-8859-1' fallbacks are
                # effectively unreachable; confirm the intended priority.
                for encoding in ['latin-1', 'windows-1252', 'iso-8859-1']:
                    try:
                        return response.content.decode(encoding)
                    except UnicodeDecodeError:
                        continue
                logger.error("Failed to decode XML with any encoding")
                return None
    except Exception as e:
        logger.error("Failed to fetch XML content", url=url, error=str(e))
        return None
def _find_nearest_traffic_point(self, latitude: float, longitude: float, traffic_data: List[Dict]) -> Optional[Dict]:
    """Return the measurement point closest to the coordinates, or None.

    Points with missing coordinates are ignored, and a best match farther
    than 5 km away is treated as "no nearby point".
    """
    if not traffic_data:
        return None
    best = None
    best_km = float('inf')
    for candidate in traffic_data:
        if not (candidate['latitude'] and candidate['longitude']):
            continue
        km = self._calculate_distance(
            latitude, longitude,
            candidate['latitude'], candidate['longitude']
        )
        if km < best_km:
            best_km, best = km, candidate
    if best is None or best_km > 5.0:
        return None
    logger.debug("Found nearest traffic point",
                 distance_km=best_km,
                 point_name=best.get('name'))
    return best
def _calculate_distance(self, lat1: float, lon1: float, lat2: float, lon2: float) -> float:
"""Calculate distance between two coordinates in km using Haversine formula"""
R = 6371 # Earth's radius in km
dlat = math.radians(lat2 - lat1)
dlon = math.radians(lon2 - lon1)
a = (math.sin(dlat/2) * math.sin(dlat/2) +
math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) *
math.sin(dlon/2) * math.sin(dlon/2))
c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
distance = R * c
return distance
def _parse_traffic_measurement(self, traffic_point: Dict) -> Dict[str, Any]:
"""Parse Madrid traffic measurement into standardized format"""
try:
# Madrid traffic service levels: 0=fluid, 1=dense, 2=congested, 3=cut
service_level_map = {
0: "low",
1: "medium",
2: "high",
3: "blocked"
}
# Estimate average speed based on service level and type
service_level = traffic_point.get('service_level', 0)
road_type = traffic_point.get('type', 'URB')
# Use real speed if available, otherwise estimate
if traffic_point.get('speed', 0) > 0:
average_speed = traffic_point['speed']
else:
# Speed estimation based on road type and service level
if road_type == 'C30': # M-30 ring road
speed_map = {0: 80, 1: 50, 2: 25, 3: 10}
else: # Urban roads
speed_map = {0: 40, 1: 25, 2: 15, 3: 5}
average_speed = speed_map.get(service_level, 20)
congestion_level = service_level_map.get(service_level, "medium")
# Calculate pedestrian estimate (higher in urban areas, lower on highways)
base_pedestrians = 100 if road_type == 'URB' else 20
hour = datetime.now().hour
# Pedestrian multiplier based on time of day
if 13 <= hour <= 15: # Lunch time
pedestrian_multiplier = 2.5
elif 8 <= hour <= 9 or 18 <= hour <= 20: # Rush hours
pedestrian_multiplier = 2.0
else:
pedestrian_multiplier = 1.0
return {
"date": datetime.now(),
"traffic_volume": traffic_point.get('intensity', 0), # vehicles/hour
"pedestrian_count": int(base_pedestrians * pedestrian_multiplier),
"congestion_level": congestion_level,
"average_speed": max(5, int(average_speed)), # Minimum 5 km/h
"occupation_percentage": traffic_point.get('occupation', 0),
"load_percentage": traffic_point.get('load', 0),
"measurement_point_id": traffic_point.get('id'),
"measurement_point_name": traffic_point.get('name'),
"road_type": road_type,
"source": "madrid_opendata"
}
except Exception as e:
logger.error("Error parsing traffic measurement", error=str(e))
return self._get_default_traffic_data()
def _get_default_traffic_data(self) -> Dict[str, Any]:
"""Get default traffic data when parsing fails"""
return {
"date": datetime.now(),
"traffic_volume": 100,
"pedestrian_count": 150,
"congestion_level": "medium",
"average_speed": 25,
"occupation_percentage": 30,
"load_percentage": 40,
"measurement_point_id": "unknown",
"measurement_point_name": "Unknown location",
"road_type": "URB",
"source": "default"
}
2025-07-18 11:51:43 +02:00
async def get_historical_traffic(self,
                                 latitude: float,
                                 longitude: float,
                                 start_date: datetime,
                                 end_date: datetime) -> List[Dict[str, Any]]:
    """Return hourly traffic records covering [start_date, end_date].

    Currently synthesized. A production build would instead fetch:
    https://datos.madrid.es/egob/catalogo/300233-2-trafico-historico.csv
    """
    try:
        return await self._generate_historical_traffic(latitude, longitude, start_date, end_date)
    except Exception as e:
        logger.error("Failed to get historical traffic", error=str(e))
        return []
async def get_events(self, latitude: float, longitude: float, radius_km: float = 5.0) -> List[Dict[str, Any]]:
    """List traffic incidents within radius_km of the location.

    Falls back to synthetic events only when the incidents feed yields
    nothing at all; a feed with no *nearby* incidents returns [].
    """
    try:
        reported = await self._fetch_traffic_incidents()
        if reported:
            nearby = []
            for item in reported:
                if not (item.get('latitude') and item.get('longitude')):
                    continue
                km = self._calculate_distance(
                    latitude, longitude,
                    item['latitude'], item['longitude']
                )
                if km <= radius_km:
                    item['distance_km'] = round(km, 2)
                    nearby.append(item)
            return nearby
        return await self._generate_synthetic_events(latitude, longitude)
    except Exception as e:
        logger.error("Failed to get events", error=str(e))
        return await self._generate_synthetic_events(latitude, longitude)
async def _fetch_traffic_incidents(self) -> Optional[List[Dict[str, Any]]]:
    """Fetch and parse real traffic incidents from the Madrid incidents feed.

    Returns a list of incident dicts (latitude/longitude may be None when
    the feed omits them), or None when the feed cannot be fetched/parsed.
    """
    try:
        xml_content = await self._fetch_xml_content(self.incidents_xml_url)
        if not xml_content:
            return None
        root = ET.fromstring(xml_content)
        incidents = []
        # One <incidencia> element per incident; fields are child elements,
        # except 'id' which is read as an attribute.
        for incidencia in root.findall('.//incidencia'):
            try:
                incident = {
                    'id': incidencia.get('id'),
                    'type': incidencia.findtext('tipo', 'unknown'),
                    'description': incidencia.findtext('descripcion', ''),
                    'location': incidencia.findtext('localizacion', ''),
                    'start_date': incidencia.findtext('fechaInicio', ''),
                    'end_date': incidencia.findtext('fechaFin', ''),
                    # Derive a coarse impact bucket from the free-text type.
                    'impact_level': self._categorize_incident_impact(incidencia.findtext('tipo', '')),
                    'latitude': self._extract_coordinate(incidencia, 'lat'),
                    'longitude': self._extract_coordinate(incidencia, 'lon'),
                    'source': 'madrid_opendata'
                }
                incidents.append(incident)
            except Exception as e:
                # Skip malformed incidents; keep the rest of the feed.
                logger.debug("Error parsing incident", error=str(e))
                continue
        logger.info("Successfully parsed traffic incidents", incidents_count=len(incidents))
        return incidents
    except Exception as e:
        logger.error("Error fetching traffic incidents", error=str(e))
        return None
def _extract_coordinate(self, element, coord_type: str) -> Optional[float]:
"""Extract latitude or longitude from incident XML"""
try:
coord_element = element.find(coord_type)
if coord_element is not None and coord_element.text:
return float(coord_element.text)
except (ValueError, TypeError):
pass
return None
def _categorize_incident_impact(self, incident_type: str) -> str:
"""Categorize incident impact level based on type"""
incident_type = incident_type.lower()
if any(word in incident_type for word in ['accidente', 'corte', 'cerrado']):
return 'high'
elif any(word in incident_type for word in ['obras', 'maintenance', 'evento']):
return 'medium'
else:
return 'low'
# Keep existing synthetic data generation methods as fallbacks
2025-07-18 11:51:43 +02:00
async def _generate_synthetic_traffic(self, latitude: float, longitude: float) -> Dict[str, Any]:
2025-07-18 19:16:45 +02:00
"""Generate realistic Madrid traffic data as fallback"""
2025-07-18 11:51:43 +02:00
now = datetime.now()
hour = now.hour
is_weekend = now.weekday() >= 5
# Base traffic volume
base_traffic = 100
# Madrid traffic patterns
if not is_weekend: # Weekdays
if 7 <= hour <= 9: # Morning rush
traffic_multiplier = 2.2
congestion = "high"
elif 18 <= hour <= 20: # Evening rush
traffic_multiplier = 2.5
congestion = "high"
elif 12 <= hour <= 14: # Lunch time
traffic_multiplier = 1.6
congestion = "medium"
elif 6 <= hour <= 22: # Daytime
traffic_multiplier = 1.2
congestion = "medium"
else: # Night
traffic_multiplier = 0.4
congestion = "low"
else: # Weekends
if 11 <= hour <= 14: # Weekend shopping
traffic_multiplier = 1.4
congestion = "medium"
elif 19 <= hour <= 22: # Weekend evening
traffic_multiplier = 1.6
congestion = "medium"
else:
traffic_multiplier = 0.8
congestion = "low"
2025-07-18 19:16:45 +02:00
# Calculate pedestrian traffic
2025-07-18 11:51:43 +02:00
pedestrian_base = 150
if 13 <= hour <= 15: # Lunch time
pedestrian_multiplier = 2.8
elif hour == 14: # School pickup time
pedestrian_multiplier = 3.5
elif 20 <= hour <= 22: # Dinner time
pedestrian_multiplier = 2.2
elif 8 <= hour <= 9: # Morning commute
pedestrian_multiplier = 2.0
else:
pedestrian_multiplier = 1.0
traffic_volume = int(base_traffic * traffic_multiplier)
pedestrian_count = int(pedestrian_base * pedestrian_multiplier)
# Average speed based on congestion
speed_map = {"low": 45, "medium": 25, "high": 15}
average_speed = speed_map[congestion] + (hash(f"{latitude}{longitude}") % 10 - 5)
return {
"date": now,
"traffic_volume": traffic_volume,
"pedestrian_count": pedestrian_count,
"congestion_level": congestion,
"average_speed": max(10, average_speed), # Minimum 10 km/h
2025-07-18 19:16:45 +02:00
"occupation_percentage": min(100, traffic_volume // 2),
"load_percentage": min(100, traffic_volume // 3),
"source": "synthetic"
2025-07-18 11:51:43 +02:00
}
async def _generate_historical_traffic(self,
latitude: float,
longitude: float,
start_date: datetime,
end_date: datetime) -> List[Dict[str, Any]]:
"""Generate synthetic historical traffic data"""
historical_data = []
current_date = start_date
while current_date <= end_date:
hour = current_date.hour
is_weekend = current_date.weekday() >= 5
# Base patterns similar to current traffic
base_traffic = 100
if not is_weekend:
if 7 <= hour <= 9 or 18 <= hour <= 20:
traffic_multiplier = 2.0 + (current_date.day % 5) * 0.1
elif 12 <= hour <= 14:
traffic_multiplier = 1.5
else:
traffic_multiplier = 1.0
else:
traffic_multiplier = 0.7 + (current_date.day % 3) * 0.2
# Add seasonal variations
month = current_date.month
seasonal_factor = 1.0
if month in [12, 1]: # Holiday season
seasonal_factor = 0.8
elif month in [7, 8]: # Summer vacation
seasonal_factor = 0.9
traffic_volume = int(base_traffic * traffic_multiplier * seasonal_factor)
# Determine congestion level
if traffic_volume > 160:
congestion_level = "high"
avg_speed = 15
elif traffic_volume > 120:
congestion_level = "medium"
avg_speed = 25
else:
congestion_level = "low"
avg_speed = 40
# Pedestrian count
pedestrian_base = 150
if 13 <= hour <= 15:
pedestrian_multiplier = 2.5
elif hour == 14:
pedestrian_multiplier = 3.0
else:
pedestrian_multiplier = 1.0
historical_data.append({
"date": current_date,
"traffic_volume": traffic_volume,
"pedestrian_count": int(pedestrian_base * pedestrian_multiplier),
"congestion_level": congestion_level,
"average_speed": avg_speed + (current_date.day % 10 - 5),
2025-07-18 19:16:45 +02:00
"occupation_percentage": min(100, traffic_volume // 2),
"load_percentage": min(100, traffic_volume // 3),
"source": "synthetic"
2025-07-18 11:51:43 +02:00
})
current_date += timedelta(hours=1)
return historical_data
async def _generate_synthetic_events(self, latitude: float, longitude: float) -> List[Dict[str, Any]]:
"""Generate synthetic Madrid events"""
events = []
base_date = datetime.now().date()
# Generate some sample events
sample_events = [
{
"name": "Mercado de San Miguel",
"type": "market",
"impact_level": "medium",
"distance_km": 1.2
},
{
"name": "Concierto en el Retiro",
"type": "concert",
"impact_level": "high",
"distance_km": 2.5
},
{
"name": "Partido Real Madrid",
"type": "sports",
"impact_level": "high",
"distance_km": 8.0
}
]
for i, event in enumerate(sample_events):
event_date = base_date + timedelta(days=i + 1)
events.append({
"id": f"event_{i+1}",
"name": event["name"],
"date": datetime.combine(event_date, datetime.min.time()),
"type": event["type"],
"impact_level": event["impact_level"],
"distance_km": event["distance_km"],
"latitude": latitude + (hash(event["name"]) % 100 - 50) / 1000,
"longitude": longitude + (hash(event["name"]) % 100 - 50) / 1000,
2025-07-18 19:16:45 +02:00
"source": "synthetic"
2025-07-18 11:51:43 +02:00
})
2025-07-18 19:16:45 +02:00
return events