# ================================================================
# services/data/app/external/madrid_opendata.py - FIXED XML PARSER
# ================================================================
"""Madrid Open Data API client with fixed XML parser for actual structure"""
import math
import xml.etree.ElementTree as ET
from typing import List, Dict, Any, Optional
from datetime import datetime, timedelta
import structlog
import re
from app.external.base_client import BaseAPIClient
from app.core.config import settings
import pyproj
logger = structlog.get_logger()
class MadridOpenDataClient(BaseAPIClient):
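    """Client for the datos.madrid.es real-time traffic feed, with synthetic fallbacks"""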
def __init__(self):
super().__init__(
base_url="https://datos.madrid.es",
api_key=None
)
        # Verified working Madrid traffic endpoints
        self.traffic_endpoints = [
            # Primary real-time intensity feed
"https://datos.madrid.es/egob/catalogo/202087-0-trafico-intensidad.xml",
]
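        # Assumption from the datos.madrid.es catalogue description: the intensity
        # feed is refreshed roughly every 5 minutes, so polling more often than
        # that just re-reads the same snapshot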
async def get_current_traffic(self, latitude: float, longitude: float) -> Optional[Dict[str, Any]]:
"""Get current traffic data for location using working Madrid endpoints"""
try:
logger.debug("Fetching Madrid traffic data", lat=latitude, lon=longitude)
# Try the working endpoint
for endpoint in self.traffic_endpoints:
try:
logger.debug("Trying traffic endpoint", endpoint=endpoint)
traffic_data = await self._fetch_traffic_xml_data(endpoint)
if traffic_data:
logger.info("Successfully fetched Madrid traffic data",
endpoint=endpoint,
points=len(traffic_data))
# Find nearest traffic measurement point
nearest_point = self._find_nearest_traffic_point(latitude, longitude, traffic_data)
if nearest_point:
parsed_data = self._parse_traffic_measurement(nearest_point)
logger.debug("Successfully parsed real Madrid traffic data",
point_name=nearest_point.get('descripcion'),
point_id=nearest_point.get('idelem'))
return parsed_data
else:
logger.debug("No nearby traffic points found",
lat=latitude, lon=longitude,
closest_distance=self._get_closest_distance(latitude, longitude, traffic_data))
except Exception as e:
logger.debug("Failed to fetch from endpoint", endpoint=endpoint, error=str(e))
continue
            # Fall back to synthetic data when no endpoint yields a usable nearby point
            logger.info("No usable Madrid traffic data found, falling back to synthetic data")
return await self._generate_synthetic_traffic(latitude, longitude)
except Exception as e:
logger.error("Failed to get current traffic", error=str(e))
return await self._generate_synthetic_traffic(latitude, longitude)
async def _fetch_traffic_xml_data(self, endpoint: str) -> Optional[List[Dict[str, Any]]]:
"""Fetch and parse Madrid traffic XML data"""
try:
xml_content = await self._fetch_xml_content_robust(endpoint)
if not xml_content:
logger.debug("No XML content received", endpoint=endpoint)
return None
# Log XML structure for debugging
logger.debug("Madrid XML content preview",
length=len(xml_content),
first_500=xml_content[:500] if len(xml_content) > 500 else xml_content)
# Parse Madrid traffic XML with the correct structure
traffic_points = self._parse_madrid_traffic_xml(xml_content)
if traffic_points:
logger.debug("Successfully parsed Madrid traffic XML", points=len(traffic_points))
return traffic_points
else:
logger.warning("No traffic points found in XML", endpoint=endpoint)
return None
except Exception as e:
logger.error("Error fetching traffic XML data", endpoint=endpoint, error=str(e))
return None
def _parse_madrid_traffic_xml(self, xml_content: str) -> List[Dict[str, Any]]:
"""Parse Madrid traffic XML with correct structure (<pms><pm>...</pm></pms>)"""
traffic_points = []
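        # Expected document shape, inferred from the tags handled below
        # (sample values are illustrative only):
        # <pms>
        #   <pm>
        #     <idelem>3840</idelem>
        #     <descripcion>Gran Via E-O</descripcion>
        #     <intensidad>420</intensidad>
        #     <ocupacion>5,0</ocupacion>
        #     <carga>20</carga>
        #     <nivelServicio>0</nivelServicio>
        #     <st_x>440000,0</st_x>
        #     <st_y>4474000,0</st_y>
        #   </pm>
        #   ...
        # </pms>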
try:
# Clean the XML to handle undefined entities and encoding issues
cleaned_xml = self._clean_madrid_xml(xml_content)
# Parse XML
root = ET.fromstring(cleaned_xml)
# Log XML structure
logger.debug("Madrid XML structure",
root_tag=root.tag,
children_count=len(list(root)))
# Madrid uses <pms> root with <pm> children
if root.tag == 'pms':
pm_elements = root.findall('pm')
logger.debug("Found PM elements", count=len(pm_elements))
for pm in pm_elements:
try:
traffic_point = self._extract_madrid_pm_element(pm)
# Validate essential data (coordinates and ID)
if (traffic_point.get('latitude') and
traffic_point.get('longitude') and
traffic_point.get('idelem')):
traffic_points.append(traffic_point)
# Log first few points for debugging
if len(traffic_points) <= 3:
logger.debug("Sample traffic point",
id=traffic_point['idelem'],
lat=traffic_point['latitude'],
lon=traffic_point['longitude'],
intensity=traffic_point.get('intensidad'))
except Exception as e:
logger.debug("Error parsing PM element", error=str(e))
continue
else:
logger.warning("Unexpected XML root tag", root_tag=root.tag)
logger.debug("Madrid traffic XML parsing completed", valid_points=len(traffic_points))
return traffic_points
except ET.ParseError as e:
logger.warning("Failed to parse Madrid XML", error=str(e))
# Try regex extraction as fallback
return self._extract_traffic_data_regex(xml_content)
except Exception as e:
logger.error("Error in Madrid traffic XML parsing", error=str(e))
return []
def _clean_madrid_xml(self, xml_content: str) -> str:
"""Clean Madrid XML to handle undefined entities and encoding issues"""
try:
# Remove BOM if present
xml_content = xml_content.lstrip('\ufeff')
# Remove or replace undefined entities that cause parsing errors
# Common undefined entities in Madrid data
xml_content = xml_content.replace('&nbsp;', ' ')
xml_content = xml_content.replace('&copy;', '©')
xml_content = xml_content.replace('&reg;', '®')
xml_content = xml_content.replace('&trade;', '')
# Fix unescaped ampersands (but not already escaped ones)
xml_content = re.sub(r'&(?![a-zA-Z0-9#]{1,10};)', '&amp;', xml_content)
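            # e.g. a bare '&' in 'Calle A & B' becomes '&amp;', while an already
            # escaped '&amp;' is left alone by the negative lookahead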
# Remove invalid control characters
xml_content = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', xml_content)
            # Transliterate accented Spanish characters to ASCII (lossy: it alters
            # descriptions, but avoids encoding-related parse failures)
spanish_chars = {
'ñ': 'n', 'Ñ': 'N',
'á': 'a', 'é': 'e', 'í': 'i', 'ó': 'o', 'ú': 'u',
'Á': 'A', 'É': 'E', 'Í': 'I', 'Ó': 'O', 'Ú': 'U',
'ü': 'u', 'Ü': 'U'
}
for spanish_char, replacement in spanish_chars.items():
xml_content = xml_content.replace(spanish_char, replacement)
return xml_content
except Exception as e:
logger.warning("Error cleaning Madrid XML", error=str(e))
return xml_content
def _extract_madrid_pm_element(self, pm_element) -> Dict[str, Any]:
"""Extract traffic data from Madrid <pm> element with proper coordinate conversion"""
try:
# Based on the actual Madrid XML structure shown in logs
point_data = {}
utm_x = None
utm_y = None
# Extract all child elements
for child in pm_element:
tag = child.tag
text = child.text.strip() if child.text else ''
if tag == 'idelem':
point_data['idelem'] = text
elif tag == 'descripcion':
point_data['descripcion'] = text
elif tag == 'intensidad':
point_data['intensidad'] = self._safe_int(text)
elif tag == 'ocupacion':
point_data['ocupacion'] = self._safe_float(text)
elif tag == 'carga':
point_data['carga'] = self._safe_int(text)
elif tag == 'nivelServicio':
point_data['nivelServicio'] = self._safe_int(text)
elif tag == 'st_x':
# Store UTM X coordinate for later conversion
utm_x = text
point_data['utm_x'] = text # Keep original for debugging
elif tag == 'st_y':
# Store UTM Y coordinate for later conversion
utm_y = text
point_data['utm_y'] = text # Keep original for debugging
elif tag == 'error':
point_data['error'] = text
elif tag == 'subarea':
point_data['subarea'] = text
elif tag == 'accesoAsociado':
point_data['accesoAsociado'] = text
elif tag == 'intensidadSat':
point_data['intensidadSat'] = self._safe_int(text)
# Convert UTM coordinates to lat/lon if both are available
if utm_x and utm_y:
latitude, longitude = self._convert_utm_coordinates_accurate(utm_x, utm_y)
if latitude is not None and longitude is not None:
# Validate that coordinates are actually in Madrid area
if self._validate_madrid_coordinates(latitude, longitude):
point_data['latitude'] = latitude
point_data['longitude'] = longitude
                        # Log the first few successful conversions for verification
                        self._conversion_log_count = getattr(self, '_conversion_log_count', 0) + 1
                        if self._conversion_log_count <= 3:
logger.debug("Successful UTM conversion",
idelem=point_data.get('idelem'),
utm_x=utm_x,
utm_y=utm_y,
latitude=latitude,
longitude=longitude,
descripcion=point_data.get('descripcion'))
else:
# Log invalid coordinates for debugging
logger.debug("Invalid Madrid coordinates after conversion",
idelem=point_data.get('idelem'),
utm_x=utm_x,
utm_y=utm_y,
converted_lat=latitude,
converted_lon=longitude,
descripcion=point_data.get('descripcion'))
# Don't include this point - return empty dict
return {}
else:
# Conversion failed
logger.debug("UTM conversion failed",
idelem=point_data.get('idelem'),
utm_x=utm_x,
utm_y=utm_y)
return {}
else:
# Missing coordinates
logger.debug("Missing UTM coordinates",
idelem=point_data.get('idelem'),
has_utm_x=utm_x is not None,
has_utm_y=utm_y is not None)
return {}
return point_data
except Exception as e:
logger.debug("Error extracting Madrid PM element", error=str(e))
return {}
    def _convert_utm_coordinates_accurate(self, utm_x_str: str, utm_y_str: str) -> tuple[Optional[float], Optional[float]]:
        """Convert ETRS89 / UTM zone 30N coordinates to WGS84 lat/lon using pyproj"""
        try:
            utm_x = float(utm_x_str.replace(',', '.'))
            utm_y = float(utm_y_str.replace(',', '.'))
            # Madrid publishes coordinates in ETRS89 / UTM zone 30N (EPSG:25830);
            # build the transformer once and reuse it, since construction is costly
            if not hasattr(self, '_utm_transformer'):
                self._utm_transformer = pyproj.Transformer.from_crs(
                    "EPSG:25830", "EPSG:4326", always_xy=True
                )
            longitude, latitude = self._utm_transformer.transform(utm_x, utm_y)
            return round(latitude, 6), round(longitude, 6)
        except Exception:
            return None, None
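    # For orientation: an easting/northing near (440000, 4474000) in EPSG:25830
    # converts to roughly lat 40.4, lon -3.7, i.e. central Madrid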
def _validate_madrid_coordinates(self, latitude: float, longitude: float) -> bool:
"""Validate that converted coordinates are actually in Madrid area"""
# Madrid bounds (expanded slightly to include metro area)
madrid_lat_min, madrid_lat_max = 40.31, 40.56
madrid_lon_min, madrid_lon_max = -3.89, -3.51
return (madrid_lat_min <= latitude <= madrid_lat_max and
madrid_lon_min <= longitude <= madrid_lon_max)
    def _safe_int(self, value_str: str) -> int:
        """Safely convert a string to int, tolerating Spanish comma decimals"""
        try:
            return int(float(value_str.replace(',', '.')))
        except (ValueError, TypeError, AttributeError):
            return 0
    def _safe_float(self, value_str: str) -> float:
        """Safely convert a string to float, tolerating Spanish comma decimals"""
        try:
            return float(value_str.replace(',', '.'))
        except (ValueError, TypeError, AttributeError):
            return 0.0
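    # e.g. _safe_float("12,5") == 12.5 and _safe_int("420") == 420; a dot is always
    # read as a decimal point, so a thousands-formatted "1.050" would parse as 1
    # rather than 1050 (assumed not to occur in these feeds)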
async def _fetch_xml_content_robust(self, url: str) -> Optional[str]:
"""Fetch XML content with robust headers for Madrid endpoints"""
try:
import httpx
# Headers optimized for Madrid Open Data
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Accept': 'application/xml,text/xml,*/*',
'Accept-Language': 'es-ES,es;q=0.9,en;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
'Cache-Control': 'no-cache',
'Referer': 'https://datos.madrid.es/'
}
async with httpx.AsyncClient(
timeout=30.0,
follow_redirects=True,
headers=headers
) as client:
logger.debug("Fetching XML from Madrid endpoint", url=url)
response = await client.get(url)
logger.debug("Madrid API response",
status=response.status_code,
content_type=response.headers.get('content-type'),
content_length=len(response.content))
if response.status_code == 200:
try:
content = response.text
if content and len(content) > 100:
return content
except UnicodeDecodeError:
# Try manual encoding for Spanish content
for encoding in ['utf-8', 'latin-1', 'windows-1252', 'iso-8859-1']:
try:
content = response.content.decode(encoding)
if content and len(content) > 100:
logger.debug("Successfully decoded with encoding", encoding=encoding)
return content
except UnicodeDecodeError:
continue
return None
except Exception as e:
logger.warning("Failed to fetch Madrid XML content", url=url, error=str(e))
return None
def _extract_traffic_data_regex(self, xml_content: str) -> List[Dict[str, Any]]:
"""Extract traffic data using regex when XML parsing fails"""
traffic_points = []
try:
# Pattern to match Madrid PM elements
pm_pattern = r'<pm>(.*?)</pm>'
pm_matches = re.findall(pm_pattern, xml_content, re.DOTALL)
for pm_content in pm_matches:
try:
# Extract individual fields
idelem_match = re.search(r'<idelem>(.*?)</idelem>', pm_content)
intensidad_match = re.search(r'<intensidad>(.*?)</intensidad>', pm_content)
st_x_match = re.search(r'<st_x>(.*?)</st_x>', pm_content)
st_y_match = re.search(r'<st_y>(.*?)</st_y>', pm_content)
descripcion_match = re.search(r'<descripcion>(.*?)</descripcion>', pm_content)
if idelem_match and st_x_match and st_y_match:
idelem = idelem_match.group(1)
st_x = st_x_match.group(1)
st_y = st_y_match.group(1)
intensidad = intensidad_match.group(1) if intensidad_match else '0'
descripcion = descripcion_match.group(1) if descripcion_match else f'Point {idelem}'
                        # Convert coordinates via the shared pyproj-based converter
                        latitude, longitude = self._convert_utm_coordinates_accurate(st_x, st_y)
                        if latitude is not None and longitude is not None:
traffic_point = {
'idelem': idelem,
'descripcion': descripcion,
'intensidad': self._safe_int(intensidad),
'latitude': latitude,
'longitude': longitude,
'ocupacion': 0,
'carga': 0,
'nivelServicio': 0,
'error': 'N'
}
traffic_points.append(traffic_point)
except Exception as e:
logger.debug("Error parsing regex PM match", error=str(e))
continue
logger.debug("Regex extraction results", count=len(traffic_points))
return traffic_points
except Exception as e:
logger.error("Error in regex extraction", error=str(e))
return []
def _get_closest_distance(self, latitude: float, longitude: float, traffic_data: List[Dict]) -> float:
"""Get distance to closest traffic point for debugging"""
if not traffic_data:
return float('inf')
min_distance = float('inf')
for point in traffic_data:
if point.get('latitude') and point.get('longitude'):
distance = self._calculate_distance(
latitude, longitude,
point['latitude'], point['longitude']
)
min_distance = min(min_distance, distance)
return min_distance
def _find_nearest_traffic_point(self, latitude: float, longitude: float, traffic_data: List[Dict]) -> Optional[Dict]:
"""Find the nearest traffic measurement point to given coordinates"""
if not traffic_data:
return None
min_distance = float('inf')
nearest_point = None
for point in traffic_data:
if point.get('latitude') and point.get('longitude'):
distance = self._calculate_distance(
latitude, longitude,
point['latitude'], point['longitude']
)
if distance < min_distance:
min_distance = distance
nearest_point = point
# Madrid area search radius (15km)
if nearest_point and min_distance <= 15.0:
logger.debug("Found nearest Madrid traffic point",
distance_km=min_distance,
point_name=nearest_point.get('descripcion'),
point_id=nearest_point.get('idelem'))
return nearest_point
logger.debug("No nearby Madrid traffic points found",
min_distance=min_distance,
total_points=len(traffic_data))
return None
def _calculate_distance(self, lat1: float, lon1: float, lat2: float, lon2: float) -> float:
"""Calculate distance between two coordinates in km using Haversine formula"""
R = 6371 # Earth's radius in km
dlat = math.radians(lat2 - lat1)
dlon = math.radians(lon2 - lon1)
a = (math.sin(dlat/2) * math.sin(dlat/2) +
math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) *
math.sin(dlon/2) * math.sin(dlon/2))
c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
distance = R * c
return distance
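    # Sanity check: 0.1 degrees of latitude is ~11.1 km, so
    # _calculate_distance(40.4, -3.7, 40.5, -3.7) returns roughly 11.12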
def _parse_traffic_measurement(self, traffic_point: Dict) -> Dict[str, Any]:
"""Parse Madrid traffic measurement into standardized format"""
try:
# Madrid traffic service levels: 0=fluid, 1=dense, 2=congested, 3=cut
service_level_map = {
0: "low",
1: "medium",
2: "high",
3: "blocked"
}
service_level = traffic_point.get('nivelServicio', 0)
# Estimate speed based on service level and road type
if service_level == 0: # Fluid
average_speed = 45
elif service_level == 1: # Dense
average_speed = 25
elif service_level == 2: # Congested
average_speed = 15
else: # Cut/Blocked
average_speed = 5
congestion_level = service_level_map.get(service_level, "medium")
            # Estimate pedestrian traffic from time of day (the feed carries no pedestrian data)
hour = datetime.now().hour
if 13 <= hour <= 15: # Lunch time
pedestrian_multiplier = 2.5
elif 8 <= hour <= 9 or 18 <= hour <= 20: # Rush hours
pedestrian_multiplier = 2.0
else:
pedestrian_multiplier = 1.0
pedestrian_count = int(100 * pedestrian_multiplier)
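            # NOTE: the 100-person base is an assumed placeholder; this is a rough
            # time-of-day heuristic, not a measured value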
return {
"date": datetime.now(),
"traffic_volume": traffic_point.get('intensidad', 0),
"pedestrian_count": pedestrian_count,
"congestion_level": congestion_level,
"average_speed": average_speed,
"occupation_percentage": traffic_point.get('ocupacion', 0),
"load_percentage": traffic_point.get('carga', 0),
"measurement_point_id": traffic_point.get('idelem'),
"measurement_point_name": traffic_point.get('descripcion'),
"road_type": "URB",
"source": "madrid_opendata"
}
except Exception as e:
logger.error("Error parsing traffic measurement", error=str(e))
return self._get_default_traffic_data()
def _get_default_traffic_data(self) -> Dict[str, Any]:
"""Get default traffic data when parsing fails"""
return {
"date": datetime.now(),
"traffic_volume": 100,
"pedestrian_count": 150,
"congestion_level": "medium",
"average_speed": 25,
"occupation_percentage": 30,
"load_percentage": 40,
"measurement_point_id": "unknown",
"measurement_point_name": "Unknown location",
"road_type": "URB",
"source": "synthetic"
}
async def _generate_synthetic_traffic(self, latitude: float, longitude: float) -> Dict[str, Any]:
"""Generate realistic Madrid traffic data as fallback"""
now = datetime.now()
hour = now.hour
is_weekend = now.weekday() >= 5
base_traffic = 100
if not is_weekend:
if 7 <= hour <= 9:
traffic_multiplier = 2.2
congestion = "high"
avg_speed = 15
elif 18 <= hour <= 20:
traffic_multiplier = 2.5
congestion = "high"
avg_speed = 12
elif 12 <= hour <= 14:
traffic_multiplier = 1.6
congestion = "medium"
avg_speed = 25
else:
traffic_multiplier = 1.0
congestion = "low"
avg_speed = 40
else:
if 11 <= hour <= 14:
traffic_multiplier = 1.4
congestion = "medium"
avg_speed = 30
else:
traffic_multiplier = 0.8
congestion = "low"
avg_speed = 45
traffic_volume = int(base_traffic * traffic_multiplier)
# Pedestrian calculation
pedestrian_base = 150
if 13 <= hour <= 15:
pedestrian_count = int(pedestrian_base * 2.5)
elif 8 <= hour <= 9 or 18 <= hour <= 20:
pedestrian_count = int(pedestrian_base * 2.0)
else:
pedestrian_count = int(pedestrian_base * 1.0)
return {
"date": now,
"traffic_volume": traffic_volume,
"pedestrian_count": pedestrian_count,
"congestion_level": congestion,
"average_speed": max(10, avg_speed),
"occupation_percentage": min(100, traffic_volume // 2),
"load_percentage": min(100, traffic_volume // 3),
"measurement_point_id": "madrid_synthetic",
"measurement_point_name": "Madrid Centro (Synthetic)",
"road_type": "URB",
"source": "synthetic"
}
async def get_historical_traffic(self, latitude: float, longitude: float, start_date: datetime, end_date: datetime) -> List[Dict[str, Any]]:
"""Get historical traffic data from Madrid Open Data
Args:
latitude: Location latitude
longitude: Location longitude
start_date: Start date for historical data
end_date: End date for historical data
Returns:
List of historical traffic data dictionaries
"""
try:
logger.debug("Fetching Madrid historical traffic data",
lat=latitude, lon=longitude,
start=start_date, end=end_date)
historical_data = []
            # The real-time feed has no historical query API, so synthetic data is
            # generated first and replaced with real data when it can be fetched
            if (end_date - start_date).days <= 90:  # Reasonable range for synthetic data
historical_data = await self._generate_historical_traffic(latitude, longitude, start_date, end_date)
logger.info("Generated synthetic historical traffic data",
records=len(historical_data))
else:
logger.warning("Date range too large for historical traffic data",
days=(end_date - start_date).days)
return []
            # Real historical fetches are gated on a configured API key; with the
            # default api_key=None from __init__ this branch is skipped
            if getattr(self, 'api_key', None):
try:
real_data = await self._fetch_real_historical_traffic(latitude, longitude, start_date, end_date)
if real_data:
# Merge real data with synthetic data or replace synthetic data
historical_data = real_data
logger.info("Fetched real historical traffic data",
records=len(real_data))
except Exception as e:
logger.warning("Failed to fetch real historical data, using synthetic", error=str(e))
return historical_data
except Exception as e:
logger.error("Error getting historical traffic data", error=str(e))
return []
async def _fetch_real_historical_traffic(self, latitude: float, longitude: float, start_date: datetime, end_date: datetime) -> List[Dict[str, Any]]:
"""Fetch real historical traffic data from Madrid Open Data portal
Madrid provides historical CSV files by month at:
https://datos.madrid.es/egob/catalogo/[ID]-[YEAR]-[MONTH]-trafico-historico.csv
"""
try:
historical_data = []
current_date = start_date.replace(day=1) # Start from beginning of month
while current_date <= end_date:
try:
# Madrid historical traffic CSV URL pattern
year = current_date.year
month = current_date.month
# Try different URL patterns based on Madrid Open Data structure
historical_urls = [
f"https://datos.madrid.es/egob/catalogo/300217-{year}-{month:02d}-trafico-historico.csv",
f"https://datos.madrid.es/egob/catalogo/trafico-historico-{year}-{month:02d}.csv",
f"https://datos.madrid.es/egob/catalogo/{year}{month:02d}-trafico-historico.csv"
]
for url in historical_urls:
csv_data = await self._fetch_historical_csv(url)
if csv_data:
# Parse CSV and filter by location
month_data = await self._parse_historical_csv(csv_data, latitude, longitude, start_date, end_date)
historical_data.extend(month_data)
logger.debug("Fetched historical data for month",
year=year, month=month, records=len(month_data))
break
                except Exception as e:
                    logger.warning("Error fetching data for month",
                                 year=current_date.year, month=current_date.month, error=str(e))
                # Advance to the next month whether the fetch succeeded or not
                if current_date.month == 12:
                    current_date = current_date.replace(year=current_date.year + 1, month=1)
                else:
                    current_date = current_date.replace(month=current_date.month + 1)
return historical_data
except Exception as e:
logger.error("Error fetching real historical traffic data", error=str(e))
return []
async def _fetch_historical_csv(self, url: str) -> Optional[str]:
"""Fetch historical CSV data from Madrid Open Data"""
try:
import httpx
headers = {
'User-Agent': 'Mozilla/5.0 (compatible; Madrid-Traffic-Client/1.0)',
'Accept': 'text/csv,application/csv,text/plain,*/*',
'Accept-Language': 'es-ES,es;q=0.9,en;q=0.8',
}
async with httpx.AsyncClient(timeout=60.0, headers=headers) as client:
logger.debug("Fetching historical CSV", url=url)
response = await client.get(url)
if response.status_code == 200:
content = response.text
if content and len(content) > 100: # Ensure we got actual data
logger.debug("Successfully fetched CSV",
url=url, size=len(content))
return content
else:
logger.debug("CSV not found", url=url, status=response.status_code)
except Exception as e:
logger.debug("Error fetching CSV", url=url, error=str(e))
return None
async def _parse_historical_csv(self, csv_content: str, latitude: float, longitude: float, start_date: datetime, end_date: datetime) -> List[Dict[str, Any]]:
"""Parse Madrid historical traffic CSV and filter by location and date range"""
try:
import csv
from io import StringIO
historical_records = []
csv_reader = csv.DictReader(StringIO(csv_content), delimiter=';')
# Get the nearest measurement points to our coordinates
measurement_points = await self._get_measurement_points_near_location(latitude, longitude)
target_point_ids = [point['id'] for point in measurement_points[:3]] # Use 3 nearest points
for row in csv_reader:
try:
# Parse Madrid CSV format
# Expected columns: fecha, hora, idelem, intensidad, ocupacion, carga, nivelServicio, etc.
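                    # e.g. an illustrative semicolon-delimited row:
                    # 01/06/2025;08:00;3840;420;5;20;1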
# Extract date and time
if 'fecha' in row and 'hora' in row:
date_str = row.get('fecha', '').strip()
time_str = row.get('hora', '').strip()
# Parse Madrid date format (usually DD/MM/YYYY)
if date_str and time_str:
try:
# Try different date formats
for date_format in ['%d/%m/%Y', '%Y-%m-%d', '%d-%m-%Y']:
try:
record_date = datetime.strptime(f"{date_str} {time_str}", f"{date_format} %H:%M")
break
except ValueError:
continue
else:
continue # Skip if no date format worked
# Check if record is in our date range
if not (start_date <= record_date <= end_date):
continue
except ValueError:
continue
else:
continue
# Check if this record is from a measurement point near our location
point_id = row.get('idelem', '').strip()
if point_id not in target_point_ids:
continue
# Parse traffic data
traffic_record = {
"date": record_date,
"traffic_volume": self._safe_int(row.get('intensidad', '0')),
"occupation_percentage": self._safe_int(row.get('ocupacion', '0')),
"load_percentage": self._safe_int(row.get('carga', '0')),
"service_level": self._safe_int(row.get('nivelServicio', '0')),
"measurement_point_id": point_id,
"measurement_point_name": row.get('descripcion', f'Point {point_id}'),
"road_type": row.get('tipo_elem', 'URB'),
"source": "madrid_opendata_historical"
}
# Calculate derived metrics
service_level = traffic_record['service_level']
if service_level == 0: # Fluid
congestion_level = "low"
avg_speed = 45
pedestrian_multiplier = 1.0
elif service_level == 1: # Dense
congestion_level = "medium"
avg_speed = 25
pedestrian_multiplier = 1.5
elif service_level == 2: # Congested
congestion_level = "high"
avg_speed = 15
pedestrian_multiplier = 2.0
else: # Cut/Blocked
congestion_level = "blocked"
avg_speed = 5
pedestrian_multiplier = 0.5
traffic_record.update({
"congestion_level": congestion_level,
"average_speed": avg_speed,
"pedestrian_count": int(100 * pedestrian_multiplier)
})
historical_records.append(traffic_record)
except Exception as e:
logger.debug("Error parsing CSV row", error=str(e))
continue
return historical_records
except Exception as e:
logger.error("Error parsing historical CSV", error=str(e))
return []
async def _get_measurement_points_near_location(self, latitude: float, longitude: float) -> List[Dict[str, Any]]:
"""Get measurement points near the specified location"""
try:
# Try to fetch current traffic data to get measurement points
current_traffic = await self._fetch_traffic_xml_data(self.traffic_endpoints[0])
if current_traffic:
# Calculate distances and sort by proximity
points_with_distance = []
for point in current_traffic:
if point.get('latitude') and point.get('longitude'):
distance = self._calculate_distance(
latitude, longitude,
point['latitude'], point['longitude']
)
points_with_distance.append({
'id': point.get('idelem'),
'distance': distance,
'latitude': point['latitude'],
'longitude': point['longitude'],
'name': point.get('descripcion', '')
})
# Sort by distance and return closest points
points_with_distance.sort(key=lambda x: x['distance'])
return points_with_distance[:5] # Return 5 closest points
# Fallback: return synthetic point IDs based on Madrid geography
return [
{'id': 'madrid_centro_01', 'distance': 1.0},
{'id': 'madrid_centro_02', 'distance': 2.0},
{'id': 'madrid_centro_03', 'distance': 3.0}
]
except Exception as e:
logger.warning("Error getting measurement points", error=str(e))
return [{'id': 'madrid_default', 'distance': 0.0}]
async def _generate_historical_traffic(self, latitude: float, longitude: float, start_date: datetime, end_date: datetime) -> List[Dict[str, Any]]:
"""Generate synthetic historical traffic data for the specified period
This method creates realistic historical traffic patterns based on:
- Time of day patterns
- Day of week patterns
- Seasonal variations
- Random variations for realism
"""
try:
            import random
historical_data = []
current_date = start_date
            # Seed with a string: str seeds are deterministic across runs, whereas
            # hash() of a str is salted per process and is not reproducible
            random.seed(f"{latitude:.6f},{longitude:.6f}")
while current_date <= end_date:
# Generate 24 hourly records for each day
for hour in range(24):
record_time = current_date.replace(hour=hour, minute=0, second=0, microsecond=0)
# Base traffic calculation
base_traffic = 100
hour_of_day = record_time.hour
day_of_week = record_time.weekday() # 0=Monday, 6=Sunday
month = record_time.month
# Time of day patterns
if 7 <= hour_of_day <= 9: # Morning rush
traffic_multiplier = 2.2 + random.uniform(-0.3, 0.3)
congestion = "high"
avg_speed = 15 + random.randint(-5, 5)
elif 18 <= hour_of_day <= 20: # Evening rush
traffic_multiplier = 2.5 + random.uniform(-0.4, 0.4)
congestion = "high"
avg_speed = 12 + random.randint(-3, 8)
elif 12 <= hour_of_day <= 14: # Lunch time
traffic_multiplier = 1.6 + random.uniform(-0.2, 0.2)
congestion = "medium"
avg_speed = 25 + random.randint(-5, 10)
elif 22 <= hour_of_day or hour_of_day <= 6: # Night
traffic_multiplier = 0.3 + random.uniform(-0.1, 0.2)
congestion = "low"
avg_speed = 50 + random.randint(-10, 15)
else: # Regular hours
traffic_multiplier = 1.0 + random.uniform(-0.2, 0.2)
congestion = "medium"
avg_speed = 35 + random.randint(-10, 10)
# Weekend adjustments
if day_of_week >= 5: # Weekend
if hour_of_day in [11, 12, 13, 14, 15]: # Weekend afternoon peak
traffic_multiplier *= 1.4
congestion = "medium"
else:
traffic_multiplier *= 0.7
if congestion == "high":
congestion = "medium"
# Seasonal adjustments
if month in [7, 8]: # Summer - less traffic due to vacations
traffic_multiplier *= 0.8
elif month in [11, 12]: # Holiday season - more traffic
traffic_multiplier *= 1.1
# Calculate final values
traffic_volume = max(10, int(base_traffic * traffic_multiplier))
avg_speed = max(10, min(60, avg_speed))
# Pedestrian calculation
pedestrian_base = 150
if 13 <= hour_of_day <= 15: # Lunch time
pedestrian_count = int(pedestrian_base * 2.5 * random.uniform(0.8, 1.2))
elif 8 <= hour_of_day <= 9 or 18 <= hour_of_day <= 20: # Rush hours
pedestrian_count = int(pedestrian_base * 2.0 * random.uniform(0.8, 1.2))
else:
pedestrian_count = int(pedestrian_base * 1.0 * random.uniform(0.5, 1.5))
# Create traffic record
traffic_record = {
"date": record_time,
"traffic_volume": traffic_volume,
"pedestrian_count": pedestrian_count,
"congestion_level": congestion,
"average_speed": avg_speed,
"occupation_percentage": min(100, traffic_volume // 2),
"load_percentage": min(100, traffic_volume // 3),
"measurement_point_id": f"madrid_historical_{hash(f'{latitude}{longitude}') % 1000}",
"measurement_point_name": f"Madrid Historical Point ({latitude:.4f}, {longitude:.4f})",
"road_type": "URB",
"source": "synthetic_historical"
}
historical_data.append(traffic_record)
# Move to next day
current_date += timedelta(days=1)
logger.info("Generated historical traffic data",
records=len(historical_data),
start=start_date,
end=end_date)
return historical_data
except Exception as e:
logger.error("Error generating historical traffic data", error=str(e))
return []
    async def get_events(self, latitude: float, longitude: float, radius_km: float = 5.0) -> List[Dict[str, Any]]:
        """Get traffic incidents and events (not yet implemented; returns an empty list)"""
        return []
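    # Minimal usage sketch (assumes an async caller elsewhere in the service;
    # the coordinates are approximately Puerta del Sol):
    #
    #     client = MadridOpenDataClient()
    #     traffic = await client.get_current_traffic(40.4167, -3.7037)
    #     history = await client.get_historical_traffic(
    #         40.4167, -3.7037,
    #         start_date=datetime.now() - timedelta(days=7),
    #         end_date=datetime.now(),
    #     )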