# ================================================================ # services/data/app/external/processors/madrid_processor.py # ================================================================ """ Data transformation and parsing for Madrid traffic data Handles XML parsing, CSV processing, coordinate conversion, and data quality scoring """ import csv import io import math import re import xml.etree.ElementTree as ET import zipfile from datetime import datetime, timezone from typing import Dict, List, Any, Optional, Tuple import structlog import pyproj from ..models.madrid_models import TrafficRecord, MeasurementPoint, CongestionLevel class MadridTrafficDataProcessor: """Handles all data transformation and parsing for Madrid traffic data""" def __init__(self): self.logger = structlog.get_logger() # UTM Zone 30N (Madrid's coordinate system) self.utm_proj = pyproj.Proj(proj='utm', zone=30, ellps='WGS84', datum='WGS84') self.wgs84_proj = pyproj.Proj(proj='latlong', ellps='WGS84', datum='WGS84') def safe_int(self, value: str) -> int: """Safely convert string to int""" try: return int(float(value.replace(',', '.'))) except (ValueError, TypeError): return 0 def _safe_float(self, value: str) -> float: """Safely convert string to float""" try: return float(value.replace(',', '.')) except (ValueError, TypeError): return 0.0 def clean_madrid_xml(self, xml_content: str) -> str: """Clean and prepare Madrid XML content for parsing""" if not xml_content: return "" # Remove BOM and extra whitespace cleaned = xml_content.strip() if cleaned.startswith('\ufeff'): cleaned = cleaned[1:] # Fix common XML issues cleaned = re.sub(r'&(?!amp;|lt;|gt;|quot;|apos;)', '&', cleaned) # Ensure proper encoding declaration if not cleaned.startswith('\n' + cleaned return cleaned def convert_utm_to_latlon(self, utm_x: str, utm_y: str) -> Tuple[Optional[float], Optional[float]]: """Convert UTM coordinates to latitude/longitude""" try: utm_x_float = float(utm_x.replace(',', '.')) utm_y_float = float(utm_y.replace(',', '.')) # Convert from UTM Zone 30N to WGS84 longitude, latitude = pyproj.transform(self.utm_proj, self.wgs84_proj, utm_x_float, utm_y_float) # Validate coordinates are in Madrid area if 40.3 <= latitude <= 40.6 and -3.8 <= longitude <= -3.5: return latitude, longitude else: self.logger.debug("Coordinates outside Madrid bounds", lat=latitude, lon=longitude, utm_x=utm_x, utm_y=utm_y) return None, None except Exception as e: self.logger.debug("UTM conversion error", utm_x=utm_x, utm_y=utm_y, error=str(e)) return None, None def parse_traffic_xml(self, xml_content: str) -> List[Dict[str, Any]]: """Parse Madrid traffic XML data""" traffic_points = [] try: cleaned_xml = self.clean_madrid_xml(xml_content) root = ET.fromstring(cleaned_xml) self.logger.debug("Madrid XML structure", root_tag=root.tag, children_count=len(list(root))) if root.tag == 'pms': pm_elements = root.findall('pm') self.logger.debug("Found PM elements", count=len(pm_elements)) for pm in pm_elements: try: traffic_point = self._extract_madrid_pm_element(pm) if self._is_valid_traffic_point(traffic_point): traffic_points.append(traffic_point) # Log first few points for debugging if len(traffic_points) <= 3: self.logger.debug("Sample traffic point", id=traffic_point['idelem'], lat=traffic_point['latitude'], lon=traffic_point['longitude'], intensity=traffic_point.get('intensidad')) except Exception as e: self.logger.debug("Error parsing PM element", error=str(e)) continue else: self.logger.warning("Unexpected XML root tag", root_tag=root.tag) self.logger.debug("Madrid traffic XML parsing completed", valid_points=len(traffic_points)) return traffic_points except ET.ParseError as e: self.logger.warning("Failed to parse Madrid XML", error=str(e)) return self._extract_traffic_data_regex(xml_content) except Exception as e: self.logger.error("Error in Madrid traffic XML parsing", error=str(e)) return [] def _extract_madrid_pm_element(self, pm_element) -> Dict[str, Any]: """Extract traffic data from Madrid element with coordinate conversion""" try: point_data = {} utm_x = utm_y = None # Extract all child elements for child in pm_element: tag, text = child.tag, child.text.strip() if child.text else '' if tag == 'idelem': point_data['idelem'] = text elif tag == 'descripcion': point_data['descripcion'] = text elif tag == 'intensidad': point_data['intensidad'] = self.safe_int(text) elif tag == 'ocupacion': point_data['ocupacion'] = self._safe_float(text) elif tag == 'carga': point_data['carga'] = self.safe_int(text) elif tag == 'nivelServicio': point_data['nivelServicio'] = self.safe_int(text) elif tag == 'st_x': # UTM X coordinate utm_x = text point_data['utm_x'] = text elif tag == 'st_y': # UTM Y coordinate utm_y = text point_data['utm_y'] = text elif tag == 'error': point_data['error'] = text elif tag in ['subarea', 'accesoAsociado', 'intensidadSat']: point_data[tag] = text # Convert coordinates if utm_x and utm_y: latitude, longitude = self.convert_utm_to_latlon(utm_x, utm_y) if latitude and longitude: point_data.update({ 'latitude': latitude, 'longitude': longitude, 'measurement_point_id': point_data.get('idelem'), 'measurement_point_name': point_data.get('descripcion'), 'timestamp': datetime.now(timezone.utc), 'source': 'madrid_opendata_xml' }) return point_data else: self.logger.debug("Invalid coordinates after conversion", idelem=point_data.get('idelem'), utm_x=utm_x, utm_y=utm_y) return {} else: self.logger.debug("Missing UTM coordinates", idelem=point_data.get('idelem')) return {} except Exception as e: self.logger.debug("Error extracting PM element", error=str(e)) return {} def _is_valid_traffic_point(self, traffic_point: Dict[str, Any]) -> bool: """Validate traffic point data""" required_fields = ['idelem', 'latitude', 'longitude'] return all(field in traffic_point and traffic_point[field] for field in required_fields) def _extract_traffic_data_regex(self, xml_content: str) -> List[Dict[str, Any]]: """Fallback regex-based extraction if XML parsing fails""" traffic_points = [] try: # Pattern to match PM elements pm_pattern = r'(.*?)' pm_matches = re.findall(pm_pattern, xml_content, re.DOTALL) for pm_content in pm_matches: traffic_point = {} # Extract key fields patterns = { 'idelem': r'(.*?)', 'descripcion': r'(.*?)', 'intensidad': r'(.*?)', 'ocupacion': r'(.*?)', 'st_x': r'(.*?)', 'st_y': r'(.*?)' } for field, pattern in patterns.items(): match = re.search(pattern, pm_content) if match: traffic_point[field] = match.group(1).strip() # Convert coordinates if 'st_x' in traffic_point and 'st_y' in traffic_point: latitude, longitude = self.convert_utm_to_latlon( traffic_point['st_x'], traffic_point['st_y'] ) if latitude and longitude: traffic_point.update({ 'latitude': latitude, 'longitude': longitude, 'intensidad': self.safe_int(traffic_point.get('intensidad', '0')), 'ocupacion': self._safe_float(traffic_point.get('ocupacion', '0')), 'measurement_point_id': traffic_point.get('idelem'), 'measurement_point_name': traffic_point.get('descripcion'), 'timestamp': datetime.now(timezone.utc), 'source': 'madrid_opendata_xml_regex' }) traffic_points.append(traffic_point) self.logger.debug("Regex extraction completed", points=len(traffic_points)) return traffic_points except Exception as e: self.logger.error("Error in regex extraction", error=str(e)) return [] def parse_measurement_points_csv(self, csv_content: str) -> Dict[str, Dict[str, Any]]: """Parse measurement points CSV into lookup dictionary""" measurement_points = {} try: # Parse CSV with semicolon delimiter csv_reader = csv.DictReader(io.StringIO(csv_content), delimiter=';') processed_count = 0 for row in csv_reader: try: # Extract point ID and coordinates point_id = row.get('id', '').strip() if not point_id: continue processed_count += 1 # Try different coordinate field names lat_str = '' lon_str = '' # Common coordinate field patterns lat_fields = ['lat', 'latitude', 'latitud', 'y', 'utm_y'] lon_fields = ['lon', 'lng', 'longitude', 'longitud', 'x', 'utm_x'] for field in lat_fields: if field in row and row[field].strip(): lat_str = row[field].strip() break for field in lon_fields: if field in row and row[field].strip(): lon_str = row[field].strip() break if lat_str and lon_str: try: # Try direct lat/lon first latitude = self._safe_float(lat_str) longitude = self._safe_float(lon_str) # If values look like UTM coordinates, convert them if latitude > 1000 or longitude > 1000: latitude, longitude = self.convert_utm_to_latlon(lon_str, lat_str) if not latitude or not longitude: continue # Validate Madrid area if not (40.3 <= latitude <= 40.6 and -3.8 <= longitude <= -3.5): continue measurement_points[point_id] = { 'id': point_id, 'latitude': latitude, 'longitude': longitude, 'name': row.get('nombre', row.get('descripcion', f"Point {point_id}")), 'type': row.get('tipo', 'traffic'), 'raw_data': dict(row) # Keep original data } except Exception as e: self.logger.debug("Error processing point coordinates", point_id=point_id, error=str(e)) continue except Exception as e: self.logger.debug("Error processing CSV row", error=str(e)) continue self.logger.info("Parsed measurement points registry", total_points=len(measurement_points)) return measurement_points except Exception as e: self.logger.error("Error parsing measurement points CSV", error=str(e)) return {} def calculate_data_quality_score(self, row: Dict[str, str]) -> float: """Calculate data quality score for a traffic record""" try: score = 1.0 # Check for missing or invalid values intensidad = row.get('intensidad', '').strip() if not intensidad or intensidad in ['N', '', '0']: score *= 0.7 ocupacion = row.get('ocupacion', '').strip() if not ocupacion or ocupacion in ['N', '', '0']: score *= 0.8 error_status = row.get('error', '').strip() if error_status and error_status != 'N': score *= 0.6 # Check for reasonable value ranges try: intensidad_val = self.safe_int(intensidad) if intensidad_val < 0 or intensidad_val > 5000: # Unrealistic traffic volume score *= 0.7 ocupacion_val = self.safe_int(ocupacion) if ocupacion_val < 0 or ocupacion_val > 100: # Invalid percentage score *= 0.5 except: score *= 0.6 return max(0.1, score) # Minimum quality score except Exception as e: self.logger.debug("Error calculating quality score", error=str(e)) return 0.5 # Default medium quality async def process_csv_content_chunked(self, text_content: str, csv_filename: str, nearest_ids: set, nearest_points: list) -> list: """Process CSV content in chunks to prevent memory issues""" import csv import io import gc try: csv_reader = csv.DictReader(io.StringIO(text_content), delimiter=';') chunk_size = 10000 chunk_records = [] all_records = [] processed_count = 0 total_rows_seen = 0 for row in csv_reader: total_rows_seen += 1 measurement_point_id = row.get('id', '').strip() if measurement_point_id not in nearest_ids: continue try: record_data = await self.parse_historical_csv_row(row, nearest_points) if record_data: chunk_records.append(record_data) processed_count += 1 if len(chunk_records) >= chunk_size: all_records.extend(chunk_records) chunk_records = [] gc.collect() except Exception as e: if processed_count < 5: self.logger.error("Row parsing exception", row_num=total_rows_seen, measurement_point_id=measurement_point_id, error=str(e)) continue # Process remaining records if chunk_records: all_records.extend(chunk_records) chunk_records = [] gc.collect() self.logger.info("Processed CSV file", filename=csv_filename, total_rows_read=total_rows_seen, processed_records=processed_count) return all_records except Exception as e: self.logger.error("Error processing CSV content", filename=csv_filename, error=str(e)) return [] async def parse_historical_csv_row(self, row: dict, nearest_points: list) -> dict: """Parse a single row from Madrid's historical traffic CSV""" try: # Extract date fecha_str = row.get('fecha', '').strip() if not fecha_str: return None try: from datetime import datetime, timezone date_obj = datetime.strptime(fecha_str, '%Y-%m-%d %H:%M:%S') date_obj = date_obj.replace(tzinfo=timezone.utc) except Exception: return None measurement_point_id = row.get('id', '').strip() # Find point data point_match = next((p for p in nearest_points if p[0] == measurement_point_id), None) if not point_match: return None point_data = point_match[1] distance_km = point_match[2] # Extract traffic data intensidad = self.safe_int(row.get('intensidad', '0')) ocupacion = self.safe_int(row.get('ocupacion', '0')) carga = self.safe_int(row.get('carga', '0')) vmed = self.safe_int(row.get('vmed', '0')) # Build basic result (business logic will be applied elsewhere) result = { 'date': date_obj, 'measurement_point_id': measurement_point_id, 'point_data': point_data, 'distance_km': distance_km, 'traffic_data': { 'intensidad': intensidad, 'ocupacion': ocupacion, 'carga': carga, 'vmed': vmed }, 'data_quality_score': self.calculate_data_quality_score(row), 'raw_row': row } return result except Exception as e: self.logger.debug("Error parsing historical CSV row", error=str(e)) return None