# ================================================================
# services/data/app/external/processors/madrid_processor.py
# ================================================================
"""
Data transformation and parsing for Madrid traffic data.

Handles XML parsing, CSV processing, coordinate conversion, and data quality scoring.
"""

import csv
import gc
import io
import re
import xml.etree.ElementTree as ET
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional, Tuple

import pyproj
import structlog

from ..models.madrid_models import TrafficRecord, MeasurementPoint, CongestionLevel


class MadridTrafficDataProcessor:
    """Handles all data transformation and parsing for Madrid traffic data"""

    def __init__(self):
        self.logger = structlog.get_logger()
        # UTM Zone 30N (Madrid's coordinate system)
        self.utm_proj = pyproj.Proj(proj='utm', zone=30, ellps='WGS84', datum='WGS84')
        self.wgs84_proj = pyproj.Proj(proj='latlong', ellps='WGS84', datum='WGS84')
        # pyproj.transform() was deprecated in pyproj 2 and removed in pyproj 3,
        # so build a single reusable Transformer up front.
        self.utm_to_wgs84 = pyproj.Transformer.from_proj(
            self.utm_proj, self.wgs84_proj, always_xy=True
        )

    def safe_int(self, value: str) -> int:
        """Safely convert a string (possibly using a decimal comma) to int."""
        try:
            return int(float(value.replace(',', '.')))
        except (ValueError, TypeError, AttributeError):
            return 0

    def _safe_float(self, value: str) -> float:
        """Safely convert a string (possibly using a decimal comma) to float."""
        try:
            return float(value.replace(',', '.'))
        except (ValueError, TypeError, AttributeError):
            return 0.0

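    # Example for the helpers above: Madrid feeds use decimal commas, so
    # safe_int("1234,56") returns 1234 and _safe_float("12,5") returns 12.5;
    # None or malformed input falls back to 0 / 0.0 instead of raising.
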
    def clean_madrid_xml(self, xml_content: str) -> str:
        """Clean and prepare Madrid XML content for parsing."""
        if not xml_content:
            return ""

        # Remove BOM and surrounding whitespace
        cleaned = xml_content.strip()
        if cleaned.startswith('\ufeff'):
            cleaned = cleaned[1:]

        # Escape bare ampersands that are not already part of a known entity
        cleaned = re.sub(r'&(?!amp;|lt;|gt;|quot;|apos;)', '&amp;', cleaned)

        # Ensure an XML declaration is present
        if not cleaned.startswith('<?xml'):
            cleaned = '<?xml version="1.0" encoding="UTF-8"?>\n' + cleaned

        return cleaned

    def convert_utm_to_latlon(self, utm_x: str, utm_y: str) -> Tuple[Optional[float], Optional[float]]:
        """Convert UTM Zone 30N coordinates to WGS84 latitude/longitude."""
        try:
            utm_x_float = float(utm_x.replace(',', '.'))
            utm_y_float = float(utm_y.replace(',', '.'))

            # always_xy=True: input is (easting, northing), output is (lon, lat)
            longitude, latitude = self.utm_to_wgs84.transform(utm_x_float, utm_y_float)

            # Validate that the coordinates fall in the Madrid area
            if 40.3 <= latitude <= 40.6 and -3.8 <= longitude <= -3.5:
                return latitude, longitude

            self.logger.debug("Coordinates outside Madrid bounds",
                              lat=latitude, lon=longitude, utm_x=utm_x, utm_y=utm_y)
            return None, None

        except Exception as e:
            self.logger.debug("UTM conversion error",
                              utm_x=utm_x, utm_y=utm_y, error=str(e))
            return None, None

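    # For reference, the parser below expects documents shaped like the
    # following (a trimmed, illustrative sample of the Madrid open-data feed;
    # the field values are made up):
    #
    #   <pms>
    #     <pm>
    #       <idelem>1001</idelem>
    #       <descripcion>Gran Via</descripcion>
    #       <intensidad>420</intensidad>
    #       <ocupacion>7,5</ocupacion>
    #       <carga>30</carga>
    #       <nivelServicio>1</nivelServicio>
    #       <st_x>440300</st_x>
    #       <st_y>4474200</st_y>
    #       <error>N</error>
    #     </pm>
    #     ...
    #   </pms>
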
    def parse_traffic_xml(self, xml_content: str) -> List[Dict[str, Any]]:
        """Parse Madrid traffic XML data."""
        traffic_points = []

        try:
            cleaned_xml = self.clean_madrid_xml(xml_content)
            root = ET.fromstring(cleaned_xml)

            self.logger.debug("Madrid XML structure", root_tag=root.tag, children_count=len(list(root)))

            if root.tag == 'pms':
                pm_elements = root.findall('pm')
                self.logger.debug("Found PM elements", count=len(pm_elements))

                for pm in pm_elements:
                    try:
                        traffic_point = self._extract_madrid_pm_element(pm)

                        if self._is_valid_traffic_point(traffic_point):
                            traffic_points.append(traffic_point)

                            # Log the first few points for debugging
                            if len(traffic_points) <= 3:
                                self.logger.debug("Sample traffic point",
                                                  id=traffic_point['idelem'],
                                                  lat=traffic_point['latitude'],
                                                  lon=traffic_point['longitude'],
                                                  intensity=traffic_point.get('intensidad'))

                    except Exception as e:
                        self.logger.debug("Error parsing PM element", error=str(e))
                        continue
            else:
                self.logger.warning("Unexpected XML root tag", root_tag=root.tag)

            self.logger.debug("Madrid traffic XML parsing completed", valid_points=len(traffic_points))
            return traffic_points

        except ET.ParseError as e:
            self.logger.warning("Failed to parse Madrid XML", error=str(e))
            return self._extract_traffic_data_regex(xml_content)
        except Exception as e:
            self.logger.error("Error in Madrid traffic XML parsing", error=str(e))
            return []

    def _extract_madrid_pm_element(self, pm_element) -> Dict[str, Any]:
        """Extract traffic data from a Madrid <pm> element, converting coordinates."""
        try:
            point_data = {}
            utm_x = utm_y = None

            # Extract all child elements
            for child in pm_element:
                tag, text = child.tag, child.text.strip() if child.text else ''

                if tag == 'idelem':
                    point_data['idelem'] = text
                elif tag == 'descripcion':
                    point_data['descripcion'] = text
                elif tag == 'intensidad':
                    point_data['intensidad'] = self.safe_int(text)
                elif tag == 'ocupacion':
                    point_data['ocupacion'] = self._safe_float(text)
                elif tag == 'carga':
                    point_data['carga'] = self.safe_int(text)
                elif tag == 'nivelServicio':
                    point_data['nivelServicio'] = self.safe_int(text)
                elif tag == 'st_x':  # UTM X coordinate (easting)
                    utm_x = text
                    point_data['utm_x'] = text
                elif tag == 'st_y':  # UTM Y coordinate (northing)
                    utm_y = text
                    point_data['utm_y'] = text
                elif tag == 'error':
                    point_data['error'] = text
                elif tag in ('subarea', 'accesoAsociado', 'intensidadSat'):
                    point_data[tag] = text

            # Convert coordinates
            if utm_x and utm_y:
                latitude, longitude = self.convert_utm_to_latlon(utm_x, utm_y)

                if latitude is not None and longitude is not None:
                    point_data.update({
                        'latitude': latitude,
                        'longitude': longitude,
                        'measurement_point_id': point_data.get('idelem'),
                        'measurement_point_name': point_data.get('descripcion'),
                        'timestamp': datetime.now(timezone.utc),
                        'source': 'madrid_opendata_xml'
                    })
                    return point_data

                self.logger.debug("Invalid coordinates after conversion",
                                  idelem=point_data.get('idelem'), utm_x=utm_x, utm_y=utm_y)
                return {}

            self.logger.debug("Missing UTM coordinates", idelem=point_data.get('idelem'))
            return {}

        except Exception as e:
            self.logger.debug("Error extracting PM element", error=str(e))
            return {}

    def _is_valid_traffic_point(self, traffic_point: Dict[str, Any]) -> bool:
        """Validate that a traffic point has an id and usable coordinates."""
        required_fields = ['idelem', 'latitude', 'longitude']
        return all(traffic_point.get(field) for field in required_fields)

    def _extract_traffic_data_regex(self, xml_content: str) -> List[Dict[str, Any]]:
        """Fallback regex-based extraction when XML parsing fails."""
        traffic_points = []

        try:
            # Match each <pm> element
            pm_pattern = r'<pm>(.*?)</pm>'
            pm_matches = re.findall(pm_pattern, xml_content, re.DOTALL)

            for pm_content in pm_matches:
                traffic_point = {}

                # Extract key fields
                patterns = {
                    'idelem': r'<idelem>(.*?)</idelem>',
                    'descripcion': r'<descripcion>(.*?)</descripcion>',
                    'intensidad': r'<intensidad>(.*?)</intensidad>',
                    'ocupacion': r'<ocupacion>(.*?)</ocupacion>',
                    'st_x': r'<st_x>(.*?)</st_x>',
                    'st_y': r'<st_y>(.*?)</st_y>'
                }

                for field, pattern in patterns.items():
                    match = re.search(pattern, pm_content)
                    if match:
                        traffic_point[field] = match.group(1).strip()

                # Convert coordinates; skip points without usable UTM values
                if 'st_x' in traffic_point and 'st_y' in traffic_point:
                    latitude, longitude = self.convert_utm_to_latlon(
                        traffic_point['st_x'], traffic_point['st_y']
                    )

                    if latitude is not None and longitude is not None:
                        traffic_point.update({
                            'latitude': latitude,
                            'longitude': longitude,
                            'intensidad': self.safe_int(traffic_point.get('intensidad', '0')),
                            'ocupacion': self._safe_float(traffic_point.get('ocupacion', '0')),
                            'measurement_point_id': traffic_point.get('idelem'),
                            'measurement_point_name': traffic_point.get('descripcion'),
                            'timestamp': datetime.now(timezone.utc),
                            'source': 'madrid_opendata_xml_regex'
                        })
                        traffic_points.append(traffic_point)

            self.logger.debug("Regex extraction completed", points=len(traffic_points))
            return traffic_points

        except Exception as e:
            self.logger.error("Error in regex extraction", error=str(e))
            return []

    def parse_measurement_points_csv(self, csv_content: str) -> Dict[str, Dict[str, Any]]:
        """Parse the measurement points CSV into a lookup dictionary keyed by point id."""
        measurement_points = {}

        try:
            # Madrid open data CSVs use a semicolon delimiter
            csv_reader = csv.DictReader(io.StringIO(csv_content), delimiter=';')

            processed_count = 0
            for row in csv_reader:
                try:
                    # Extract point ID
                    point_id = row.get('id', '').strip()
                    if not point_id:
                        continue

                    processed_count += 1

                    # Try different coordinate field names
                    lat_str = ''
                    lon_str = ''

                    # Common coordinate field patterns
                    lat_fields = ['lat', 'latitude', 'latitud', 'y', 'utm_y']
                    lon_fields = ['lon', 'lng', 'longitude', 'longitud', 'x', 'utm_x']

                    for field in lat_fields:
                        if row.get(field, '').strip():
                            lat_str = row[field].strip()
                            break

                    for field in lon_fields:
                        if row.get(field, '').strip():
                            lon_str = row[field].strip()
                            break

                    if lat_str and lon_str:
                        try:
                            # Try direct lat/lon first
                            latitude = self._safe_float(lat_str)
                            longitude = self._safe_float(lon_str)

                            # Values this large cannot be degrees, so treat them as UTM
                            if latitude > 1000 or longitude > 1000:
                                latitude, longitude = self.convert_utm_to_latlon(lon_str, lat_str)
                                if latitude is None or longitude is None:
                                    continue

                            # Validate Madrid area
                            if not (40.3 <= latitude <= 40.6 and -3.8 <= longitude <= -3.5):
                                continue

                            measurement_points[point_id] = {
                                'id': point_id,
                                'latitude': latitude,
                                'longitude': longitude,
                                'name': row.get('nombre', row.get('descripcion', f"Point {point_id}")),
                                'type': row.get('tipo', 'traffic'),
                                'raw_data': dict(row)  # Keep the original data
                            }

                        except Exception as e:
                            self.logger.debug("Error processing point coordinates",
                                              point_id=point_id, error=str(e))
                            continue

                except Exception as e:
                    self.logger.debug("Error processing CSV row", error=str(e))
                    continue

            self.logger.info("Parsed measurement points registry",
                             total_points=len(measurement_points),
                             rows_processed=processed_count)
            return measurement_points

        except Exception as e:
            self.logger.error("Error parsing measurement points CSV", error=str(e))
            return {}

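    # A hypothetical header/row pair the parser above would accept (column
    # names vary across exports, hence the field-name candidates; the values
    # here are illustrative):
    #
    #   id;nombre;tipo;utm_x;utm_y
    #   1001;Gran Via;URB;440300,12;4474200,55
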
    def calculate_data_quality_score(self, row: Dict[str, str]) -> float:
        """Calculate a data quality score for a traffic record (multiplicative penalties)."""
        try:
            score = 1.0

            # Penalize missing or placeholder values
            intensidad = row.get('intensidad', '').strip()
            if not intensidad or intensidad in ('N', '0'):
                score *= 0.7

            ocupacion = row.get('ocupacion', '').strip()
            if not ocupacion or ocupacion in ('N', '0'):
                score *= 0.8

            error_status = row.get('error', '').strip()
            if error_status and error_status != 'N':
                score *= 0.6

            # Penalize values outside reasonable ranges
            try:
                intensidad_val = self.safe_int(intensidad)
                if intensidad_val < 0 or intensidad_val > 5000:  # Unrealistic traffic volume
                    score *= 0.7

                ocupacion_val = self.safe_int(ocupacion)
                if ocupacion_val < 0 or ocupacion_val > 100:  # Invalid percentage
                    score *= 0.5

            except Exception:
                score *= 0.6

            return max(0.1, score)  # Enforce a minimum quality score

        except Exception as e:
            self.logger.debug("Error calculating quality score", error=str(e))
            return 0.5  # Default medium quality

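    # A worked example of how the multiplicative score composes (hypothetical
    # row, for illustration): {'intensidad': 'N', 'ocupacion': '45', 'error': 'S'}
    # is penalized once for the missing intensity (x0.7) and once for the error
    # flag (x0.6), giving roughly 1.0 * 0.7 * 0.6 ≈ 0.42, above the 0.1 floor.
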
    async def process_csv_content_chunked(self, text_content: str, csv_filename: str,
                                          nearest_ids: set, nearest_points: list) -> list:
        """Process CSV content in chunks to limit memory usage."""
        try:
            csv_reader = csv.DictReader(io.StringIO(text_content), delimiter=';')

            chunk_size = 10000
            chunk_records = []
            all_records = []
            processed_count = 0
            total_rows_seen = 0

            for row in csv_reader:
                total_rows_seen += 1
                measurement_point_id = row.get('id', '').strip()

                # Only keep rows for the measurement points we care about
                if measurement_point_id not in nearest_ids:
                    continue

                try:
                    record_data = await self.parse_historical_csv_row(row, nearest_points)

                    if record_data:
                        chunk_records.append(record_data)
                        processed_count += 1

                        # Flush the chunk and reclaim memory periodically
                        if len(chunk_records) >= chunk_size:
                            all_records.extend(chunk_records)
                            chunk_records = []
                            gc.collect()

                except Exception as e:
                    # Only log the first few failures to avoid log spam
                    if processed_count < 5:
                        self.logger.error("Row parsing exception",
                                          row_num=total_rows_seen,
                                          measurement_point_id=measurement_point_id,
                                          error=str(e))
                    continue

            # Process remaining records
            if chunk_records:
                all_records.extend(chunk_records)
                chunk_records = []
                gc.collect()

            self.logger.info("Processed CSV file",
                             filename=csv_filename,
                             total_rows_read=total_rows_seen,
                             processed_records=processed_count)

            return all_records

        except Exception as e:
            self.logger.error("Error processing CSV content",
                              filename=csv_filename, error=str(e))
            return []

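    # A hypothetical historical-CSV row as parse_historical_csv_row below
    # consumes it (Madrid's exports are semicolon-delimited; shown as the dict
    # a csv.DictReader would yield, with made-up values):
    #
    #   {'fecha': '2024-03-01 08:15:00', 'id': '1001', 'intensidad': '420',
    #    'ocupacion': '8', 'carga': '30', 'vmed': '52', 'error': 'N'}
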
    async def parse_historical_csv_row(self, row: dict, nearest_points: list) -> Optional[dict]:
        """Parse a single row from Madrid's historical traffic CSV."""
        try:
            # Extract and validate the timestamp
            fecha_str = row.get('fecha', '').strip()
            if not fecha_str:
                return None

            try:
                date_obj = datetime.strptime(fecha_str, '%Y-%m-%d %H:%M:%S')
                date_obj = date_obj.replace(tzinfo=timezone.utc)
            except ValueError:
                return None

            measurement_point_id = row.get('id', '').strip()

            # Find the matching measurement point (tuples of id, data, distance)
            point_match = next((p for p in nearest_points if p[0] == measurement_point_id), None)
            if not point_match:
                return None

            point_data = point_match[1]
            distance_km = point_match[2]

            # Extract traffic data
            intensidad = self.safe_int(row.get('intensidad', '0'))
            ocupacion = self.safe_int(row.get('ocupacion', '0'))
            carga = self.safe_int(row.get('carga', '0'))
            vmed = self.safe_int(row.get('vmed', '0'))

            # Build the basic result (business logic is applied elsewhere)
            result = {
                'date': date_obj,
                'measurement_point_id': measurement_point_id,
                'point_data': point_data,
                'distance_km': distance_km,
                'traffic_data': {
                    'intensidad': intensidad,
                    'ocupacion': ocupacion,
                    'carga': carga,
                    'vmed': vmed
                },
                'data_quality_score': self.calculate_data_quality_score(row),
                'raw_row': row
            }

            return result

        except Exception as e:
            self.logger.debug("Error parsing historical CSV row", error=str(e))
            return None
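

# Minimal smoke-test sketch, not part of the service: feeds a tiny hand-made
# <pms> document through the parser and prints the result. The values are
# illustrative, not real Madrid readings. Because of the relative model import
# above, this only works in package context (e.g. via
# `python -m services.data.app.external.processors.madrid_processor`).
if __name__ == "__main__":
    sample_xml = """<?xml version="1.0" encoding="UTF-8"?>
<pms>
  <pm>
    <idelem>1001</idelem>
    <descripcion>Gran Via (sample)</descripcion>
    <intensidad>420</intensidad>
    <ocupacion>7,5</ocupacion>
    <carga>30</carga>
    <nivelServicio>1</nivelServicio>
    <st_x>440300</st_x>
    <st_y>4474200</st_y>
    <error>N</error>
  </pm>
</pms>"""

    processor = MadridTrafficDataProcessor()
    # Should yield one point near lat 40.42, lon -3.70 (central Madrid)
    for point in processor.parse_traffic_xml(sample_xml):
        print(point['idelem'], point['latitude'], point['longitude'], point['intensidad'])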